ginkgo-project · pratikvn · Aug 23, 2024 · Aug 21, 2024 · Aug 22, 2024 · Aug 22, 2024
diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
@@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES
     factorization/par_ilut_select_kernels.cpp
     factorization/par_ilut_spgeam_kernels.cpp
     factorization/par_ilut_sweep_kernels.cpp
+    matrix/batch_csr_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/dense_kernels.cpp
     matrix/diagonal_kernels.cpp

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -22,14 +23,6 @@
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
 
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/batch_struct.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/batch_struct.hip.hpp"
-#else
-#error "batch struct def missing"
-#endif
-
 
 namespace gko {
 namespace kernels {

diff --git a/cuda/base/batch_struct.hpp → common/cuda_hip/base/batch_struct.hpp b/cuda/base/batch_struct.hpp → common/cuda_hip/base/batch_struct.hpp
@@ -2,21 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_
-#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+#ifndef GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 
 
 /** @file batch_struct.hpp
@@ -33,10 +34,10 @@ namespace cuda {
  * Generates an immutable uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<const cuda_type<ValueType>>
+inline batch::multi_vector::uniform_batch<const device_type<ValueType>>
 get_batch_struct(const batch::MultiVector<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -46,19 +47,19 @@ get_batch_struct(const batch::MultiVector<ValueType>* const op)
  * Generates a uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<cuda_type<ValueType>>
+inline batch::multi_vector::uniform_batch<device_type<ValueType>>
 get_batch_struct(batch::MultiVector<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
 }
 
 
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+#endif  // GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
diff --git a/.../matrix/batch_csr_kernel_launcher.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.cpp b/.../matrix/batch_csr_kernel_launcher.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_csr_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    simple_apply_kernel<<<num_blocks, default_block_size, 0,
-                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+    batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+                                                0, exec->get_stream()>>>(
+        mat_ub, b_ub, x_ub);
 }
 
 
@@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-                                                  beta_ub, x_ub);
+    batch_single_kernels::advanced_apply_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_scale_vals = col_scale->get_const_data();
     const auto row_scale_vals = row_scale->get_const_data();
     const auto mat_ub = get_batch_struct(input);
-    scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+    batch_single_kernels::
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+            mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto beta_ub = get_batch_struct(beta);
     const auto mat_ub = get_batch_struct(mat);
-    add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(alpha_ub, beta_ub,
-                                                       mat_ub);
+    batch_single_kernels::add_scaled_identity_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
+
+
+}  // namespace batch_csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/...cuda_hip/matrix/batch_csr_kernels.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.hpp b/...cuda_hip/matrix/batch_csr_kernels.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.hpp
@@ -2,6 +2,36 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -21,23 +51,11 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      csr::uniform_batch<
-                                                          const ValueType,
-                                                          IndexType>
-                                                          mat,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              const ValueType>
-                                                              b,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              ValueType>
-                                                              x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+    const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -71,33 +89,13 @@ __device__ __forceinline__ void advanced_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                alpha,
-                                                    const gko::batch::matrix::
-                                                        csr::uniform_batch<
-                                                            const ValueType,
-                                                            IndexType>
-                                                            mat,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                b,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                beta,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                ValueType>
-                                                                x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -196,3 +194,9 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko