diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 15d3a82419e..f5a28596d16 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES factorization/par_ilut_select_kernels.cpp factorization/par_ilut_spgeam_kernels.cpp factorization/par_ilut_sweep_kernels.cpp + matrix/batch_csr_kernels.cpp + matrix/batch_dense_kernels.cpp + matrix/batch_ell_kernels.cpp matrix/coo_kernels.cpp matrix/dense_kernels.cpp matrix/diagonal_kernels.cpp diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index bb3aac67b55..7583cc72292 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -10,6 +10,7 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -22,14 +23,6 @@ #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/batch_struct.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/batch_struct.hip.hpp" -#else -#error "batch struct def missing" -#endif - namespace gko { namespace kernels { diff --git a/cuda/base/batch_struct.hpp b/common/cuda_hip/base/batch_struct.hpp similarity index 71% rename from cuda/base/batch_struct.hpp rename to common/cuda_hip/base/batch_struct.hpp index 9f07b6b4532..bc10752975f 100644 --- a/cuda/base/batch_struct.hpp +++ b/common/cuda_hip/base/batch_struct.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_ -#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_ +#ifndef GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_ #include @@ -11,12 +11,13 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** @file batch_struct.hpp @@ -33,10 +34,10 @@ namespace cuda { * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template -inline batch::multi_vector::uniform_batch> +inline batch::multi_vector::uniform_batch> get_batch_struct(const batch::MultiVector* const op) { - return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), + return {as_device_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -46,19 +47,19 @@ get_batch_struct(const batch::MultiVector* const op) * Generates a uniform batch struct from a batch of multi-vectors. */ template -inline batch::multi_vector::uniform_batch> +inline batch::multi_vector::uniform_batch> get_batch_struct(batch::MultiVector* const op) { - return {as_cuda_type(op->get_values()), op->get_num_batch_items(), + return {as_device_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko -#endif // GKO_CUDA_BASE_BATCH_STRUCT_HPP_ +#endif // GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_ diff --git a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp similarity index 64% rename from common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc rename to common/cuda_hip/matrix/batch_csr_kernels.cpp index 18c9dbcb29a..35dc2c17e03 100644 --- a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_csr_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_csr { + + +constexpr auto default_block_size = 256; + + template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); + batch_single_kernels::simple_apply_kernel<<get_stream()>>>( + mat_ub, b_ub, x_ub); } @@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, - beta_ub, x_ub); + batch_single_kernels::advanced_apply_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, mat_ub, b_ub, beta_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -59,8 +88,10 @@ void scale(std::shared_ptr exec, const auto col_scale_vals = col_scale->get_const_data(); const auto row_scale_vals = row_scale->get_const_data(); const auto mat_ub = get_batch_struct(input); - scale_kernel<<get_stream()>>>( - as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub); + batch_single_kernels:: + scale_kernel<<get_stream()>>>( + as_device_type(col_scale_vals), as_device_type(row_scale_vals), + mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto beta_ub = get_batch_struct(beta); const auto mat_ub = get_batch_struct(mat); - add_scaled_identity_kernel<<get_stream()>>>(alpha_ub, beta_ub, - mat_ub); + batch_single_kernels::add_scaled_identity_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, beta_ub, mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_csr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.hpp similarity index 66% rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_csr_kernels.hpp index e041dadaa3e..64611559715 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp @@ -2,6 +2,36 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, @@ -21,23 +51,11 @@ __device__ __forceinline__ void simple_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - csr::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void simple_apply_kernel( + const gko::batch::matrix::csr::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -71,33 +89,13 @@ __device__ __forceinline__ void advanced_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void advanced_apply_kernel(const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - alpha, - const gko::batch::matrix:: - csr::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - beta, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::matrix::csr::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch beta, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -196,3 +194,9 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp similarity index 66% rename from common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc rename to common/cuda_hip/matrix/batch_dense_kernels.cpp index 8fdb001fd1f..44dad55aa70 100644 --- a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_dense { + + +constexpr auto default_block_size = 256; + + template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); + batch_single_kernels::simple_apply_kernel<<get_stream()>>>( + mat_ub, b_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -40,9 +69,9 @@ void advanced_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, - beta_ub, x_ub); + batch_single_kernels::advanced_apply_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, mat_ub, b_ub, beta_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -58,8 +87,10 @@ void scale(std::shared_ptr exec, const auto col_scale_vals = col_scale->get_const_data(); const auto row_scale_vals = row_scale->get_const_data(); const auto mat_ub = get_batch_struct(input); - scale_kernel<<get_stream()>>>( - as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub); + batch_single_kernels:: + scale_kernel<<get_stream()>>>( + as_device_type(col_scale_vals), as_device_type(row_scale_vals), + mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); @@ -75,7 +106,8 @@ void scale_add(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto mat_ub = get_batch_struct(mat); const auto in_out_ub = get_batch_struct(in_out); - scale_add_kernel<<get_stream()>>>( + batch_single_kernels::scale_add_kernel<<get_stream()>>>( alpha_ub, mat_ub, in_out_ub); } @@ -92,10 +124,16 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto beta_ub = get_batch_struct(beta); const auto mat_ub = get_batch_struct(mat); - add_scaled_identity_kernel<<get_stream()>>>(alpha_ub, beta_ub, - mat_ub); + batch_single_kernels::add_scaled_identity_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, beta_ub, mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp similarity index 72% rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_dense_kernels.hpp index f8abf9131a1..e4cd24bbd78 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp @@ -2,6 +2,36 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, @@ -33,22 +63,10 @@ __device__ __forceinline__ void simple_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - dense::uniform_batch< - const ValueType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void simple_apply_kernel( + const gko::batch::matrix::dense::uniform_batch mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -94,32 +112,12 @@ __device__ __forceinline__ void advanced_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void advanced_apply_kernel(const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - alpha, - const gko::batch::matrix:: - dense::uniform_batch< - const ValueType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - beta, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::matrix::dense::uniform_batch mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch beta, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -243,3 +241,9 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp similarity index 64% rename from common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc rename to common/cuda_hip/matrix/batch_ell_kernels.cpp index 7e69b119c85..c56325ab824 100644 --- a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_ell_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_ell { + + +constexpr auto default_block_size = 256; + + template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); + batch_single_kernels::simple_apply_kernel<<get_stream()>>>( + mat_ub, b_ub, x_ub); } @@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, - beta_ub, x_ub); + batch_single_kernels::advanced_apply_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, mat_ub, b_ub, beta_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -59,8 +88,10 @@ void scale(std::shared_ptr exec, const auto col_scale_vals = col_scale->get_const_data(); const auto row_scale_vals = row_scale->get_const_data(); const auto mat_ub = get_batch_struct(input); - scale_kernel<<get_stream()>>>( - as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub); + batch_single_kernels:: + scale_kernel<<get_stream()>>>( + as_device_type(col_scale_vals), as_device_type(row_scale_vals), + mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto beta_ub = get_batch_struct(beta); const auto mat_ub = get_batch_struct(mat); - add_scaled_identity_kernel<<get_stream()>>>(alpha_ub, beta_ub, - mat_ub); + batch_single_kernels::add_scaled_identity_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, beta_ub, mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_ell +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp similarity index 67% rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_ell_kernels.hpp index 0a6d1927c96..52826957ddb 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp @@ -2,6 +2,36 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, @@ -28,23 +58,11 @@ __device__ __forceinline__ void simple_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - ell::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void simple_apply_kernel( + const gko::batch::matrix::ell::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -84,34 +102,15 @@ __device__ __forceinline__ void advanced_apply( } } + template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void advanced_apply_kernel(const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - alpha, - const gko::batch::matrix:: - ell::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - beta, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::matrix::ell::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch beta, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -205,3 +204,9 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/batch_struct.hpp b/common/cuda_hip/matrix/batch_struct.hpp similarity index 74% rename from cuda/matrix/batch_struct.hpp rename to common/cuda_hip/matrix/batch_struct.hpp index 5845fb2235e..e88eca245bb 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/common/cuda_hip/matrix/batch_struct.hpp @@ -2,27 +2,31 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ -#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_ +#include #include #include +#include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** @file batch_struct.hpp * * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the required CUDA scalar type. + * while also shallow-casting to the required GKO_DEVICE_NAMESPACE scalar + * type. * * A specialization is needed for every format of every kind of linear algebra * object. These are intended to be called on the host. @@ -33,11 +37,11 @@ namespace cuda { * Generates an immutable uniform batch struct from a batch of csr matrices. */ template -inline batch::matrix::csr::uniform_batch, +inline batch::matrix::csr::uniform_batch, const IndexType> get_batch_struct(const batch::matrix::Csr* const op) { - return {as_cuda_type(op->get_const_values()), + return {as_device_type(op->get_const_values()), op->get_const_col_idxs(), op->get_const_row_ptrs(), op->get_num_batch_items(), @@ -51,10 +55,10 @@ get_batch_struct(const batch::matrix::Csr* const op) * Generates a uniform batch struct from a batch of csr matrices. */ template -inline batch::matrix::csr::uniform_batch, IndexType> +inline batch::matrix::csr::uniform_batch, IndexType> get_batch_struct(batch::matrix::Csr* const op) { - return {as_cuda_type(op->get_values()), + return {as_device_type(op->get_values()), op->get_col_idxs(), op->get_row_ptrs(), op->get_num_batch_items(), @@ -68,10 +72,10 @@ get_batch_struct(batch::matrix::Csr* const op) * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(const batch::matrix::Dense* const op) { - return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), + return {as_device_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -82,10 +86,10 @@ get_batch_struct(const batch::matrix::Dense* const op) * Generates a uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(batch::matrix::Dense* const op) { - return {as_cuda_type(op->get_values()), op->get_num_batch_items(), + return {as_device_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -96,11 +100,11 @@ get_batch_struct(batch::matrix::Dense* const op) * Generates an immutable uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::ell::uniform_batch, +inline batch::matrix::ell::uniform_batch, const IndexType> get_batch_struct(const batch::matrix::Ell* const op) { - return {as_cuda_type(op->get_const_values()), + return {as_device_type(op->get_const_values()), op->get_const_col_idxs(), op->get_num_batch_items(), static_cast(op->get_common_size()[0]), @@ -114,10 +118,10 @@ get_batch_struct(const batch::matrix::Ell* const op) * Generates a uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::ell::uniform_batch, IndexType> +inline batch::matrix::ell::uniform_batch, IndexType> get_batch_struct(batch::matrix::Ell* const op) { - return {as_cuda_type(op->get_values()), + return {as_device_type(op->get_values()), op->get_col_idxs(), op->get_num_batch_items(), static_cast(op->get_common_size()[0]), @@ -127,9 +131,9 @@ get_batch_struct(batch::matrix::Ell* const op) } -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko -#endif // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_ diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc index c2a53b2e518..d4ce149d394 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc @@ -27,8 +27,9 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - advanced_apply(static_cast(-1.0), mat_entry, x_shared_entry, - static_cast(1.0), r_shared_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_entry, x_shared_entry, + static_cast(1.0), r_shared_entry); __syncthreads(); if (threadIdx.x / config::warp_size == 0) { @@ -295,7 +296,8 @@ __global__ void apply_kernel( __syncthreads(); // v = A * p_hat - simple_apply(mat_entry, p_hat_sh, v_sh); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + simple_apply(mat_entry, p_hat_sh, v_sh); __syncthreads(); // alpha = rho_new / < r_hat , v> @@ -327,7 +329,8 @@ __global__ void apply_kernel( __syncthreads(); // t = A * s_hat - simple_apply(mat_entry, s_hat_sh, t_sh); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + simple_apply(mat_entry, s_hat_sh, t_sh); __syncthreads(); // omega = / diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc index c95a6b1cf05..4f4b382f552 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc @@ -22,8 +22,9 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - advanced_apply(static_cast(-1.0), mat_entry, x_shared_entry, - static_cast(1.0), r_shared_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_entry, x_shared_entry, + static_cast(1.0), r_shared_entry); __syncthreads(); // z = precond * r @@ -189,7 +190,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, } // Ap = A * p - simple_apply(mat_entry, p_sh, Ap_sh); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + simple_apply(mat_entry, p_sh, Ap_sh); __syncthreads(); // alpha = rho_old / (p' * Ap) diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index 73d37eb2ac2..455d3d67a6d 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -16,7 +16,6 @@ #if defined(GKO_COMPILING_CUDA) -#define GKO_DEVICE_NAMESPACE cuda #define GKO_KERNEL __device__ #include "common/cuda_hip/base/types.hpp" @@ -43,7 +42,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #elif defined(GKO_COMPILING_HIP) -#define GKO_DEVICE_NAMESPACE hip #define GKO_KERNEL __device__ #include "common/cuda_hip/base/types.hpp" @@ -70,7 +68,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #elif defined(GKO_COMPILING_DPCPP) -#define GKO_DEVICE_NAMESPACE dpcpp #define GKO_KERNEL @@ -105,7 +102,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #elif defined(GKO_COMPILING_OMP) -#define GKO_DEVICE_NAMESPACE omp #define GKO_KERNEL diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 8a142a5224a..599c708b334 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -24,10 +24,10 @@ #if defined GKO_COMPILING_CUDA -#include "cuda/base/batch_struct.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "cuda/components/cooperative_groups.cuh" #include "cuda/log/batch_logger.cuh" -#include "cuda/matrix/batch_struct.hpp" #include "cuda/preconditioner/batch_preconditioners.cuh" #include "cuda/stop/batch_criteria.cuh" @@ -52,10 +52,10 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type; #elif defined GKO_COMPILING_HIP -#include "hip/base/batch_struct.hip.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/log/batch_logger.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" #include "hip/preconditioner/batch_preconditioners.hip.hpp" #include "hip/stop/batch_criteria.hip.hpp" diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 3631a65f48d..000cb7b215f 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -16,9 +16,6 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - matrix/batch_csr_kernels.cu - matrix/batch_dense_kernels.cu - matrix/batch_ell_kernels.cu ${CSR_INSTANTIATE} ${FBCSR_INSTANTIATE} matrix/fft_kernels.cu diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu deleted file mode 100644 index 95b4f85cdfc..00000000000 --- a/cuda/matrix/batch_csr_kernels.cu +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu deleted file mode 100644 index 10148ee242b..00000000000 --- a/cuda/matrix/batch_dense_kernels.cu +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Dense matrix format namespace. - * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu deleted file mode 100644 index 25281cf6f81..00000000000 --- a/cuda/matrix/batch_ell_kernels.cu +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Ell matrix format namespace. - * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu index 716c158ffff..edf052cb649 100644 --- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -8,19 +8,19 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/matrix/batch_struct.hpp" // generated header #include "common/cuda_hip/preconditioner/jacobi_common.hpp" diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 4d3deb742fe..35d567fd911 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/thrust.hpp" @@ -19,11 +20,13 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" namespace gko { @@ -31,11 +34,6 @@ namespace kernels { namespace cuda { -// NOTE: this default block size is not used for the main solver kernel. -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - - /** * @brief The batch Bicgstab solver namespace. * @@ -44,9 +42,6 @@ constexpr int sm_oversubscription = 4; namespace batch_bicgstab { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 21c3e3d43c4..f26f2d37313 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" @@ -18,11 +19,13 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" namespace gko { @@ -30,11 +33,6 @@ namespace kernels { namespace cuda { -// NOTE: this default block size is not used for the main solver kernel. -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - - /** * @brief The batch Cg solver namespace. * @@ -43,9 +41,6 @@ constexpr int sm_oversubscription = 4; namespace batch_cg { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp index 9feb824a3aa..1759a959299 100644 --- a/dpcpp/matrix/batch_csr_kernels.dp.cpp +++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp @@ -21,23 +21,16 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ namespace batch_csr { -#include "dpcpp/matrix/batch_csr_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr exec, batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b.values, x_b.values, - item_ct1); + batch_single_kernels::simple_apply(mat_b, b_b.values, + x_b.values, item_ct1); }); }); } @@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto beta_b = batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, - beta_b.values[0], x_b.values, - item_ct1); + batch_single_kernels::advanced_apply( + alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values, item_ct1); }); }); } @@ -172,9 +165,10 @@ void scale(std::shared_ptr exec, row_scale_vals + num_rows * group_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, group_id); - scale_kernel(mat_item.num_rows, col_scale_b, row_scale_b, - mat_item.col_idxs, mat_item.row_ptrs, - mat_item.values, item_ct1); + batch_single_kernels::scale(mat_item.num_rows, col_scale_b, + row_scale_b, mat_item.col_idxs, + mat_item.row_ptrs, + mat_item.values, item_ct1); }); }); } @@ -215,7 +209,7 @@ void add_scaled_identity(std::shared_ptr exec, gko::batch::extract_batch_item(beta_ub, group_id); const auto mat_b = gko::batch::matrix::extract_batch_item( mat_ub, group_id); - add_scaled_identity_kernel( + batch_single_kernels::add_scaled_identity( alpha_b.values[0], beta_b.values[0], mat_b, item_ct1); }); }); diff --git a/dpcpp/matrix/batch_csr_kernels.hpp.inc b/dpcpp/matrix/batch_csr_kernels.hpp similarity index 67% rename from dpcpp/matrix/batch_csr_kernels.hpp.inc rename to dpcpp/matrix/batch_csr_kernels.hpp index 4379e02d0b7..f51124f81a4 100644 --- a/dpcpp/matrix/batch_csr_kernels.hpp.inc +++ b/dpcpp/matrix/batch_csr_kernels.hpp @@ -2,8 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -__dpct_inline__ void simple_apply_kernel( +__dpct_inline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1) { @@ -23,7 +47,7 @@ __dpct_inline__ void simple_apply_kernel( template -__dpct_inline__ void advanced_apply_kernel( +__dpct_inline__ void advanced_apply( const ValueType alpha, const gko::batch::matrix::csr::batch_item& mat, const ValueType* b, const ValueType beta, ValueType* x, @@ -45,13 +69,11 @@ __dpct_inline__ void advanced_apply_kernel( template -__dpct_inline__ void scale_kernel(const int num_rows, - const ValueType* const col_scale, - const ValueType* const row_scale, - const IndexType* const col_idxs, - const IndexType* const row_ptrs, - ValueType* const values, - sycl::nd_item<3>& item_ct1) +__dpct_inline__ void scale(const int num_rows, const ValueType* const col_scale, + const ValueType* const row_scale, + const IndexType* const col_idxs, + const IndexType* const row_ptrs, + ValueType* const values, sycl::nd_item<3>& item_ct1) { for (int row = item_ct1.get_local_linear_id(); row < num_rows; row += item_ct1.get_local_range().size()) { @@ -64,7 +86,7 @@ __dpct_inline__ void scale_kernel(const int num_rows, template -__dpct_inline__ void add_scaled_identity_kernel( +__dpct_inline__ void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::csr::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -80,3 +102,9 @@ __dpct_inline__ void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index a9f6afce0f5..43974589abb 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -25,23 +25,16 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The Dense matrix format namespace. - * - * @ingroup batch_dense - */ namespace batch_dense { -#include "dpcpp/matrix/batch_dense_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -77,8 +70,8 @@ void simple_apply(std::shared_ptr exec, batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b.values, x_b.values, - item_ct1); + batch_single_kernels::simple_apply(mat_b, b_b.values, + x_b.values, item_ct1); }); }); } @@ -129,9 +122,9 @@ void advanced_apply(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto beta_b = batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, - beta_b.values[0], x_b.values, - item_ct1); + batch_single_kernels::advanced_apply( + alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values, item_ct1); }); }); } @@ -174,7 +167,8 @@ void scale(std::shared_ptr exec, row_scale_vals + num_rows * group_id; auto input_mat = batch::matrix::extract_batch_item(mat_ub, group_id); - scale_kernel(col_scale_b, row_scale_b, input_mat, item_ct1); + batch_single_kernels::scale(col_scale_b, row_scale_b, + input_mat, item_ct1); }); }); } @@ -204,18 +198,20 @@ void scale_add(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto alpha_b = - gko::batch::extract_batch_item(alpha_ub, group_id); - const auto mat_b = - gko::batch::matrix::extract_batch_item(mat_ub, group_id); - const auto in_out_b = - gko::batch::matrix::extract_batch_item(in_out_ub, group_id); - scale_add_kernel(alpha_b.values[0], mat_b, in_out_b, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = + gko::batch::extract_batch_item(alpha_ub, group_id); + const auto mat_b = gko::batch::matrix::extract_batch_item( + mat_ub, group_id); + const auto in_out_b = + gko::batch::matrix::extract_batch_item(in_out_ub, + group_id); + batch_single_kernels::scale_add(alpha_b.values[0], mat_b, + in_out_b, item_ct1); + }); }); } @@ -254,7 +250,7 @@ void add_scaled_identity(std::shared_ptr exec, gko::batch::extract_batch_item(beta_ub, group_id); const auto mat_b = gko::batch::matrix::extract_batch_item( mat_ub, group_id); - add_scaled_identity_kernel( + batch_single_kernels::add_scaled_identity( alpha_b.values[0], beta_b.values[0], mat_b, item_ct1); }); }); diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp similarity index 84% rename from dpcpp/matrix/batch_dense_kernels.hpp.inc rename to dpcpp/matrix/batch_dense_kernels.hpp index 98282fe253d..acf1e65939d 100644 --- a/dpcpp/matrix/batch_dense_kernels.hpp.inc +++ b/dpcpp/matrix/batch_dense_kernels.hpp @@ -2,8 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -__dpct_inline__ void simple_apply_kernel( +__dpct_inline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1) { @@ -34,7 +58,7 @@ __dpct_inline__ void simple_apply_kernel( template -__dpct_inline__ void advanced_apply_kernel( +__dpct_inline__ void advanced_apply( const ValueType alpha, const gko::batch::matrix::dense::batch_item& mat, const ValueType* b, const ValueType beta, ValueType* x, @@ -67,7 +91,7 @@ __dpct_inline__ void advanced_apply_kernel( template -__dpct_inline__ void scale_kernel( +__dpct_inline__ void scale( const ValueType* const col_scale, const ValueType* const row_scale, gko::batch::matrix::dense::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -91,7 +115,7 @@ __dpct_inline__ void scale_kernel( template -__dpct_inline__ void scale_add_kernel( +__dpct_inline__ void scale_add( const ValueType alpha, const gko::batch::matrix::dense::batch_item& mat, const gko::batch::matrix::dense::batch_item& in_out, @@ -117,7 +141,7 @@ __dpct_inline__ void scale_add_kernel( template -__dpct_inline__ void add_scaled_identity_kernel( +__dpct_inline__ void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::dense::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -140,3 +164,9 @@ __dpct_inline__ void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index 2cb40dc35eb..d9b819b101e 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -21,23 +21,16 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The Ell matrix format namespace. - * @ref Ell - * @ingroup batch_ell - */ namespace batch_ell { -#include "dpcpp/matrix/batch_ell_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr exec, batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b.values, x_b.values, - item_ct1); + batch_single_kernels::simple_apply(mat_b, b_b.values, + x_b.values, item_ct1); }); }); } @@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto beta_b = batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, - beta_b.values[0], x_b.values, - item_ct1); + batch_single_kernels::advanced_apply( + alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values, item_ct1); }); }); } @@ -171,7 +164,8 @@ void scale(std::shared_ptr exec, row_scale_vals + num_rows * group_id; auto mat_item = batch::matrix::extract_batch_item(mat_ub, group_id); - scale_kernel(col_scale_b, row_scale_b, mat_item, item_ct1); + batch_single_kernels::scale(col_scale_b, row_scale_b, + mat_item, item_ct1); }); }); } @@ -212,7 +206,7 @@ void add_scaled_identity(std::shared_ptr exec, gko::batch::extract_batch_item(beta_ub, group_id); const auto mat_b = gko::batch::matrix::extract_batch_item( mat_ub, group_id); - add_scaled_identity_kernel( + batch_single_kernels::add_scaled_identity( alpha_b.values[0], beta_b.values[0], mat_b, item_ct1); }); }); diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp similarity index 78% rename from dpcpp/matrix/batch_ell_kernels.hpp.inc rename to dpcpp/matrix/batch_ell_kernels.hpp index 1a809664dca..48ab9318bdf 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp.inc +++ b/dpcpp/matrix/batch_ell_kernels.hpp @@ -2,8 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -__dpct_inline__ void simple_apply_kernel( +__dpct_inline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1) { @@ -24,7 +48,7 @@ __dpct_inline__ void simple_apply_kernel( template -__dpct_inline__ void advanced_apply_kernel( +__dpct_inline__ void advanced_apply( const ValueType alpha, const gko::batch::matrix::ell::batch_item& mat, const ValueType* b, const ValueType beta, ValueType* x, @@ -47,7 +71,7 @@ __dpct_inline__ void advanced_apply_kernel( template -__dpct_inline__ void scale_kernel( +__dpct_inline__ void scale( const ValueType* const col_scale, const ValueType* const row_scale, gko::batch::matrix::ell::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -69,7 +93,7 @@ __dpct_inline__ void scale_kernel( template -__dpct_inline__ void add_scaled_identity_kernel( +__dpct_inline__ void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::ell::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -89,3 +113,9 @@ __dpct_inline__ void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 7dc8f3ec23b..291ee1d8a8b 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -23,23 +23,18 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The batch Bicgstab solver namespace. - * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { -#include "dpcpp/matrix/batch_csr_kernels.hpp.inc" -#include "dpcpp/matrix/batch_dense_kernels.hpp.inc" -#include "dpcpp/matrix/batch_ell_kernels.hpp.inc" #include "dpcpp/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc index f5a88e9d59d..de1956c8c6c 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc +++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc @@ -33,9 +33,9 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), mat_global_entry, - x_shared_entry, static_cast(1.0), - r_shared_entry, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_global_entry, x_shared_entry, + static_cast(1.0), r_shared_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); if (sg_id == 0) { @@ -330,7 +330,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); // v = A * p_hat - simple_apply_kernel(mat_global_entry, p_hat_sh, v_sh, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + mat_global_entry, p_hat_sh, v_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // alpha = rho_new / < r_hat , v> @@ -361,7 +362,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); // t = A * s_hat - simple_apply_kernel(mat_global_entry, s_hat_sh, t_sh, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + mat_global_entry, s_hat_sh, t_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // omega = / diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index f25d8266803..05b3f7b803c 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -23,23 +23,18 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The batch Cg solver namespace. - * - * @ingroup batch_cg - */ namespace batch_cg { -#include "dpcpp/matrix/batch_csr_kernels.hpp.inc" -#include "dpcpp/matrix/batch_dense_kernels.hpp.inc" -#include "dpcpp/matrix/batch_ell_kernels.hpp.inc" #include "dpcpp/solver/batch_cg_kernels.hpp.inc" diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc index 7a91bcb2bbf..b233b7df680 100644 --- a/dpcpp/solver/batch_cg_kernels.hpp.inc +++ b/dpcpp/solver/batch_cg_kernels.hpp.inc @@ -27,9 +27,9 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), mat_global_entry, - x_shared_entry, static_cast(1.0), - r_shared_entry, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_global_entry, x_shared_entry, + static_cast(1.0), r_shared_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -207,7 +207,8 @@ __dpct_inline__ void apply_kernel( break; } // Ap = A * p - simple_apply_kernel(mat_global_entry, p_sh, Ap_sh, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + mat_global_entry, p_sh, Ap_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // alpha = rho_old / (p' * Ap) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 84bba295120..7d914d57a81 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -14,9 +14,6 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp - matrix/batch_csr_kernels.hip.cpp - matrix/batch_dense_kernels.hip.cpp - matrix/batch_ell_kernels.hip.cpp ${CSR_INSTANTIATE} ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp deleted file mode 100644 index 3e4cba6a747..00000000000 --- a/hip/base/batch_struct.hip.hpp +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ -#define GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ - - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "core/base/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -/** @file batch_struct.hpp - * - * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the required Hip scalar type. - * - * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. - */ - - -/** - * Generates an immutable uniform batch struct from a batch of multi-vectors. - */ -template -inline batch::multi_vector::uniform_batch> -get_batch_struct(const batch::MultiVector* const op) -{ - return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - -/** - * Generates a uniform batch struct from a batch of multi-vectors. - */ -template -inline batch::multi_vector::uniform_batch> get_batch_struct( - batch::MultiVector* const op) -{ - return {as_hip_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp deleted file mode 100644 index b77b9416505..00000000000 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp deleted file mode 100644 index 67dfd78e264..00000000000 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Dense matrix format namespace. - * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp deleted file mode 100644 index 68b59c042f1..00000000000 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Ell matrix format namespace. - * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp deleted file mode 100644 index bb9f7912cd6..00000000000 --- a/hip/matrix/batch_struct.hip.hpp +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ -#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ - - -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -/** @file batch_struct.hpp - * - * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the required HIP scalar type. - * - * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. - */ - - -/** - * Generates an immutable uniform batch struct from a batch of csr matrices. - */ -template -inline batch::matrix::csr::uniform_batch, - const IndexType> -get_batch_struct(const batch::matrix::Csr* const op) -{ - return {as_hip_type(op->get_const_values()), - op->get_const_col_idxs(), - op->get_const_row_ptrs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_elements_per_item())}; -} - - -/** - * Generates a uniform batch struct from a batch of csr matrices. - */ -template -inline batch::matrix::csr::uniform_batch, IndexType> -get_batch_struct(batch::matrix::Csr* const op) -{ - return {as_hip_type(op->get_values()), - op->get_col_idxs(), - op->get_row_ptrs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_elements_per_item())}; -} - - -/** - * Generates an immutable uniform batch struct from a batch of dense matrices. - */ -template -inline batch::matrix::dense::uniform_batch> -get_batch_struct(const batch::matrix::Dense* const op) -{ - return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - -/** - * Generates a uniform batch struct from a batch of dense matrices. - */ -template -inline batch::matrix::dense::uniform_batch> -get_batch_struct(batch::matrix::Dense* const op) -{ - return {as_hip_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - -/** - * Generates an immutable uniform batch struct from a batch of ell matrices. - */ -template -inline batch::matrix::ell::uniform_batch, - const IndexType> -get_batch_struct(const batch::matrix::Ell* const op) -{ - return {as_hip_type(op->get_const_values()), - op->get_const_col_idxs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; -} - - -/** - * Generates a uniform batch struct from a batch of ell matrices. - */ -template -inline batch::matrix::ell::uniform_batch, IndexType> -get_batch_struct(batch::matrix::Ell* const op) -{ - return {as_hip_type(op->get_values()), - op->get_col_idxs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; -} - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index e86bc86390a..38a81972e66 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -8,21 +8,21 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/components/diagonal_block_manipulation.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/uninitialized_array.hpp" #include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/batch_struct.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" // generated header #include "common/cuda_hip/preconditioner/jacobi_common.hpp" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 1c1be8b21f7..a5de10953bc 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -20,11 +21,13 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" namespace gko { @@ -32,9 +35,6 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - /** * @brief The batch Bicgstab solver namespace. * @@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4; namespace batch_bicgstab { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index c860286c17c..23bb939ead8 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -20,11 +21,13 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" namespace gko { @@ -32,9 +35,6 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - /** * @brief The batch Cg solver namespace. * @@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4; namespace batch_cg { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp index eacb26c12cb..d4ea6cbd642 100644 --- a/omp/matrix/batch_csr_kernels.cpp +++ b/omp/matrix/batch_csr_kernels.cpp @@ -9,26 +9,20 @@ #include #include +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ namespace batch_csr { -#include "reference/matrix/batch_csr_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -99,7 +94,7 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp index 836908260a7..cd4a7f05b4a 100644 --- a/omp/matrix/batch_dense_kernels.cpp +++ b/omp/matrix/batch_dense_kernels.cpp @@ -9,26 +9,20 @@ #include #include +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The Dense matrix format namespace. - * @ref Dense - * @ingroup batch_dense - */ namespace batch_dense { -#include "reference/matrix/batch_dense_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -98,7 +93,8 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto input_mat = input_vals + input->get_num_elements_per_item() * batch_id; - scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat); + batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b, + row_scale_b, input_mat); } } @@ -121,7 +117,7 @@ void scale_add(std::shared_ptr exec, const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); const auto input_mat_b = batch::matrix::extract_batch_item(in_mat_ub, batch_id); - scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b); + batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b); } } @@ -143,7 +139,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp index 4fb5aeea6fa..8b1239565a1 100644 --- a/omp/matrix/batch_ell_kernels.cpp +++ b/omp/matrix/batch_ell_kernels.cpp @@ -9,26 +9,20 @@ #include #include +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The Ell matrix format namespace. - * @ref Ell - * @ingroup batch_ell - */ namespace batch_ell { -#include "reference/matrix/batch_ell_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -99,7 +94,7 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp index c245f284106..661cdbcd2ec 100644 --- a/omp/solver/batch_bicgstab_kernels.cpp +++ b/omp/solver/batch_bicgstab_kernels.cpp @@ -10,28 +10,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The batch Bicgstab solver namespace. - * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp index 55d6ee29321..3a6e31256c2 100644 --- a/omp/solver/batch_cg_kernels.cpp +++ b/omp/solver/batch_cg_kernels.cpp @@ -10,28 +10,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The batch Cg solver namespace. - * - * @ingroup batch_cg - */ namespace batch_cg { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_cg_kernels.hpp.inc" diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 0c226830637..85b8f33e38b 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -66,6 +66,7 @@ target_sources(ginkgo_reference stop/residual_norm_kernels.cpp) target_link_libraries(ginkgo_reference PUBLIC ginkgo_device) +target_compile_definitions(ginkgo_reference PRIVATE GKO_COMPILING_REFERENCE GKO_DEVICE_NAMESPACE=reference) ginkgo_compile_features(ginkgo_reference) ginkgo_default_includes(ginkgo_reference) ginkgo_install_library(ginkgo_reference) diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index f5e1c653054..d7fbf3ce214 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -10,10 +10,6 @@ #include #include - -#define GKO_DEVICE_NAMESPACE reference - - #include "core/base/batch_struct.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp index 7c6d9a6c000..d3304ab9795 100644 --- a/reference/matrix/batch_csr_kernels.cpp +++ b/reference/matrix/batch_csr_kernels.cpp @@ -12,23 +12,16 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace reference { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ namespace batch_csr { -#include "reference/matrix/batch_csr_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -42,7 +35,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -69,8 +62,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -96,7 +90,7 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -118,7 +112,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/reference/matrix/batch_csr_kernels.hpp.inc b/reference/matrix/batch_csr_kernels.hpp similarity index 81% rename from reference/matrix/batch_csr_kernels.hpp.inc rename to reference/matrix/batch_csr_kernels.hpp index 52e511785a0..e04b2bdf345 100644 --- a/reference/matrix/batch_csr_kernels.hpp.inc +++ b/reference/matrix/batch_csr_kernels.hpp @@ -2,8 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -inline void simple_apply_kernel( +inline void simple_apply( const gko::batch::matrix::csr::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) @@ -25,7 +42,7 @@ inline void simple_apply_kernel( template -inline void advanced_apply_kernel( +inline void advanced_apply( const ValueType alpha, const gko::batch::matrix::csr::batch_item& a, const gko::batch::multi_vector::batch_item& b, @@ -63,7 +80,7 @@ inline void scale( template -inline void add_scaled_identity_kernel( +inline void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::csr::batch_item& mat) { @@ -76,3 +93,9 @@ inline void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index 2116a691fb9..599af30ecfb 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -12,23 +12,16 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace reference { -/** - * @brief The Dense matrix format namespace. - * @ref Dense - * @ingroup batch_dense - */ namespace batch_dense { -#include "reference/matrix/batch_dense_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -42,7 +35,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -69,8 +62,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -95,7 +89,8 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto input_mat = input_vals + input->get_num_elements_per_item() * batch_id; - scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat); + batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b, + row_scale_b, input_mat); } } @@ -117,7 +112,7 @@ void scale_add(std::shared_ptr exec, const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); const auto input_mat_b = batch::matrix::extract_batch_item(in_mat_ub, batch_id); - scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b); + batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b); } } @@ -138,7 +133,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp similarity index 84% rename from reference/matrix/batch_dense_kernels.hpp.inc rename to reference/matrix/batch_dense_kernels.hpp index a017010a644..e12827c77de 100644 --- a/reference/matrix/batch_dense_kernels.hpp.inc +++ b/reference/matrix/batch_dense_kernels.hpp @@ -2,8 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -inline void simple_apply_kernel( +inline void simple_apply( const gko::batch::matrix::dense::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) @@ -27,7 +44,7 @@ inline void simple_apply_kernel( template -inline void advanced_apply_kernel( +inline void advanced_apply( const ValueType alpha, const gko::batch::matrix::dense::batch_item& a, const gko::batch::multi_vector::batch_item& b, @@ -75,7 +92,7 @@ inline void scale(const int num_rows, const int num_cols, template -inline void scale_add_kernel( +inline void scale_add( const ValueType alpha, const gko::batch::matrix::dense::batch_item& b, const gko::batch::matrix::dense::batch_item& in_out) @@ -91,7 +108,7 @@ inline void scale_add_kernel( template -inline void add_scaled_identity_kernel( +inline void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::dense::batch_item& mat) { @@ -105,3 +122,9 @@ inline void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp index 0d47f9ea601..1a4855f389f 100644 --- a/reference/matrix/batch_ell_kernels.cpp +++ b/reference/matrix/batch_ell_kernels.cpp @@ -12,23 +12,16 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace reference { -/** - * @brief The Ell matrix format namespace. - * @ref Ell - * @ingroup batch_ell - */ namespace batch_ell { -#include "reference/matrix/batch_ell_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -42,7 +35,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -69,8 +62,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -96,7 +90,7 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -118,7 +112,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp similarity index 84% rename from reference/matrix/batch_ell_kernels.hpp.inc rename to reference/matrix/batch_ell_kernels.hpp index 7aea0946573..71bd1ce851a 100644 --- a/reference/matrix/batch_ell_kernels.hpp.inc +++ b/reference/matrix/batch_ell_kernels.hpp @@ -2,8 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -inline void simple_apply_kernel( +inline void simple_apply( const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) @@ -27,7 +44,7 @@ inline void simple_apply_kernel( template -inline void advanced_apply_kernel( +inline void advanced_apply( const ValueType alpha, const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, @@ -73,7 +90,7 @@ inline void scale( template -inline void add_scaled_identity_kernel( +inline void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::ell::batch_item& mat) { @@ -91,3 +108,9 @@ inline void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp index e68caffa936..33e1e9392d9 100644 --- a/reference/solver/batch_bicgstab_kernels.cpp +++ b/reference/solver/batch_bicgstab_kernels.cpp @@ -6,30 +6,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace reference { - - -/** - * @brief The batch Bicgstab solver namespace. - * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc index 1f8537ab66d..786e98eb5d1 100644 --- a/reference/solver/batch_bicgstab_kernels.hpp.inc +++ b/reference/solver/batch_bicgstab_kernels.hpp.inc @@ -33,9 +33,9 @@ inline void initialize( b_entry, r_entry); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), A_entry, - gko::batch::to_const(x_entry), - static_cast(1.0), r_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), A_entry, gko::batch::to_const(x_entry), + static_cast(1.0), r_entry); gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: compute_norm2_kernel(gko::batch::to_const(r_entry), res_norms_entry); @@ -271,8 +271,8 @@ inline void batch_entry_bicgstab_impl( prec.apply(gko::batch::to_const(p_entry), p_hat_entry); // v = A * p_hat - simple_apply_kernel(A_entry, gko::batch::to_const(p_hat_entry), - v_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + A_entry, gko::batch::to_const(p_hat_entry), v_entry); // alpha = rho_new / < r_hat , v> compute_alpha(gko::batch::to_const(rho_new_entry), @@ -303,8 +303,8 @@ inline void batch_entry_bicgstab_impl( prec.apply(gko::batch::to_const(s_entry), s_hat_entry); // t = A * s_hat - simple_apply_kernel(A_entry, gko::batch::to_const(s_hat_entry), - t_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + A_entry, gko::batch::to_const(s_hat_entry), t_entry); // omega = / compute_omega(gko::batch::to_const(t_entry), gko::batch::to_const(s_entry), temp_entry, omega_entry); diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp index 785a7a868a2..7c69157d4a7 100644 --- a/reference/solver/batch_cg_kernels.cpp +++ b/reference/solver/batch_cg_kernels.cpp @@ -6,30 +6,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace reference { - - -/** - * @brief The batch Cg solver namespace. - * - * @ingroup batch_cg - */ namespace batch_cg { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_cg_kernels.hpp.inc" diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp.inc index ca88940cd69..991db5c061c 100644 --- a/reference/solver/batch_cg_kernels.hpp.inc +++ b/reference/solver/batch_cg_kernels.hpp.inc @@ -34,9 +34,9 @@ inline void initialize( b_entry, r_entry); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), A_entry, - gko::batch::to_const(x_entry), - static_cast(1.0), r_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), A_entry, gko::batch::to_const(x_entry), + static_cast(1.0), r_entry); } @@ -181,7 +181,8 @@ inline void batch_entry_cg_impl( gko::batch::to_const(z_entry), p_entry); // Ap = A * p - simple_apply_kernel(A_entry, gko::batch::to_const(p_entry), Ap_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + A_entry, gko::batch::to_const(p_entry), Ap_entry); // temp= rho_old / (p' * Ap) // x = x + temp * p