Skip to content

Commit

Permalink
[cuda, hip] unify csr, dense and ell kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
pratikvn committed Aug 21, 2024
1 parent 83a577c commit c0e434c
Show file tree
Hide file tree
Showing 19 changed files with 356 additions and 257 deletions.
3 changes: 3 additions & 0 deletions common/cuda_hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES
factorization/par_ilut_select_kernels.cpp
factorization/par_ilut_spgeam_kernels.cpp
factorization/par_ilut_sweep_kernels.cpp
matrix/batch_csr_kernels.cpp
matrix/batch_dense_kernels.cpp
matrix/batch_ell_kernels.cpp
matrix/coo_kernels.cpp
matrix/dense_kernels.cpp
matrix/diagonal_kernels.cpp
Expand Down
4 changes: 4 additions & 0 deletions common/cuda_hip/base/batch_multi_vector_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,15 @@ namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_single_kernels {
namespace {


constexpr auto default_block_size = 256;


}


template <typename ValueType, typename Mapping>
__device__ __forceinline__ void scale(
const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"

#include <thrust/functional.h>
#include <thrust/transform.h>

#include <ginkgo/core/base/batch_multi_vector.hpp>
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/matrix/batch_csr.hpp>

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_csr_kernels.hpp"
#include "core/matrix/batch_struct.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_csr {


constexpr auto default_block_size = 256;


template <typename ValueType, typename IndexType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const batch::matrix::Csr<ValueType, IndexType>* mat,
Expand All @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
simple_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(mat_ub, b_ub, x_ub);
batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
0, exec->get_stream()>>>(
mat_ub, b_ub, x_ub);
}


Expand All @@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
advanced_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
beta_ub, x_ub);
batch_single_kernels::advanced_apply_kernel<<<
num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
Expand All @@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
const auto col_scale_vals = col_scale->get_const_data();
const auto row_scale_vals = row_scale->get_const_data();
const auto mat_ub = get_batch_struct(input);
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
batch_single_kernels::
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
as_device_type(col_scale_vals), as_device_type(row_scale_vals),
mat_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
Expand All @@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
const auto alpha_ub = get_batch_struct(alpha);
const auto beta_ub = get_batch_struct(beta);
const auto mat_ub = get_batch_struct(mat);
add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, beta_ub,
mat_ub);
batch_single_kernels::add_scaled_identity_kernel<<<
num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, beta_ub, mat_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);


} // namespace batch_csr
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,44 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include <thrust/functional.h>
#include <thrust/transform.h>

#include <ginkgo/core/base/batch_multi_vector.hpp>
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/matrix/batch_csr.hpp>

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/format_conversion.hpp"
#include "common/cuda_hip/components/reduction.hpp"
#include "common/cuda_hip/components/segment_scan.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/components/warp_blas.hpp"

#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/batch_struct.hpp"
#include "cuda/matrix/batch_struct.hpp"
#elif defined(GKO_COMPILING_HIP)
#include "hip/base/batch_struct.hip.hpp"
#include "hip/matrix/batch_struct.hip.hpp"
#else
#error "batch struct def missing"
#endif


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_single_kernels {


template <typename ValueType, typename IndexType>
__device__ __forceinline__ void simple_apply(
const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
Expand All @@ -21,23 +59,11 @@ __device__ __forceinline__ void simple_apply(
}

template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
csr::uniform_batch<
const ValueType,
IndexType>
mat,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
b,
const gko::batch::
multi_vector::
uniform_batch<
ValueType>
x)
__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
mat,
const gko::batch::multi_vector::uniform_batch<const ValueType> b,
const gko::batch::multi_vector::uniform_batch<ValueType> x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
Expand Down Expand Up @@ -71,33 +97,13 @@ __device__ __forceinline__ void advanced_apply(
}

template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void advanced_apply_kernel(const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
alpha,
const gko::batch::matrix::
csr::uniform_batch<
const ValueType,
IndexType>
mat,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
b,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
beta,
const gko::batch::
multi_vector::
uniform_batch<
ValueType>
x)
__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
mat,
const gko::batch::multi_vector::uniform_batch<const ValueType> b,
const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
const gko::batch::multi_vector::uniform_batch<ValueType> x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
Expand Down Expand Up @@ -196,3 +202,9 @@ __global__ void add_scaled_identity_kernel(
add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
}
}


} // namespace batch_single_kernels
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"

#include <thrust/functional.h>
#include <thrust/transform.h>

#include <ginkgo/core/base/batch_multi_vector.hpp>
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/matrix/batch_dense.hpp>

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_dense_kernels.hpp"
#include "core/matrix/batch_struct.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_dense {


constexpr auto default_block_size = 256;


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const batch::matrix::Dense<ValueType>* mat,
Expand All @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
simple_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(mat_ub, b_ub, x_ub);
batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
0, exec->get_stream()>>>(
mat_ub, b_ub, x_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
Expand All @@ -40,9 +69,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
advanced_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
beta_ub, x_ub);
batch_single_kernels::advanced_apply_kernel<<<
num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
Expand All @@ -58,8 +87,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
const auto col_scale_vals = col_scale->get_const_data();
const auto row_scale_vals = row_scale->get_const_data();
const auto mat_ub = get_batch_struct(input);
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
batch_single_kernels::
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
as_device_type(col_scale_vals), as_device_type(row_scale_vals),
mat_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
Expand All @@ -75,7 +106,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
const auto alpha_ub = get_batch_struct(alpha);
const auto mat_ub = get_batch_struct(mat);
const auto in_out_ub = get_batch_struct(in_out);
scale_add_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
batch_single_kernels::scale_add_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
alpha_ub, mat_ub, in_out_ub);
}

Expand All @@ -92,10 +124,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
const auto alpha_ub = get_batch_struct(alpha);
const auto beta_ub = get_batch_struct(beta);
const auto mat_ub = get_batch_struct(mat);
add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, beta_ub,
mat_ub);
batch_single_kernels::add_scaled_identity_kernel<<<
num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, beta_ub, mat_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);


} // namespace batch_dense
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Loading

0 comments on commit c0e434c

Please sign in to comment.