Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify and simplify batch functionality: Matrix formats (csr, dense, ell) #1669

Merged
merged 9 commits into from
Aug 23, 2024
Merged
3 changes: 3 additions & 0 deletions common/cuda_hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES
factorization/par_ilut_select_kernels.cpp
factorization/par_ilut_spgeam_kernels.cpp
factorization/par_ilut_sweep_kernels.cpp
matrix/batch_csr_kernels.cpp
matrix/batch_dense_kernels.cpp
matrix/batch_ell_kernels.cpp
matrix/coo_kernels.cpp
matrix/dense_kernels.cpp
matrix/diagonal_kernels.cpp
Expand Down
13 changes: 5 additions & 8 deletions common/cuda_hip/base/batch_multi_vector_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>

#include "common/cuda_hip/base/batch_struct.hpp"
#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
Expand All @@ -22,24 +23,20 @@
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/components/warp_blas.hpp"

#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/batch_struct.hpp"
#elif defined(GKO_COMPILING_HIP)
#include "hip/base/batch_struct.hip.hpp"
#else
#error "batch struct def missing"
#endif


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_single_kernels {
namespace {


constexpr auto default_block_size = 256;


}
pratikvn marked this conversation as resolved.
Show resolved Hide resolved


template <typename ValueType, typename Mapping>
__device__ __forceinline__ void scale(
const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,22 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_
#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_
#ifndef GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
#define GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_


#include <ginkgo/core/base/batch_multi_vector.hpp>
#include <ginkgo/core/base/math.hpp>

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/unified/base/kernel_launch.hpp"
#include "core/base/batch_struct.hpp"


namespace gko {
namespace kernels {
namespace cuda {
namespace GKO_DEVICE_NAMESPACE {


/** @file batch_struct.hpp
Expand All @@ -33,10 +34,10 @@ namespace cuda {
* Generates an immutable uniform batch struct from a batch of multi-vectors.
*/
template <typename ValueType>
inline batch::multi_vector::uniform_batch<const cuda_type<ValueType>>
inline batch::multi_vector::uniform_batch<const device_type<ValueType>>
get_batch_struct(const batch::MultiVector<ValueType>* const op)
{
return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
static_cast<int32>(op->get_common_size()[1]),
static_cast<int32>(op->get_common_size()[0]),
static_cast<int32>(op->get_common_size()[1])};
Expand All @@ -46,19 +47,19 @@ get_batch_struct(const batch::MultiVector<ValueType>* const op)
* Generates a uniform batch struct from a batch of multi-vectors.
*/
template <typename ValueType>
inline batch::multi_vector::uniform_batch<cuda_type<ValueType>>
inline batch::multi_vector::uniform_batch<device_type<ValueType>>
get_batch_struct(batch::MultiVector<ValueType>* const op)
{
return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
return {as_device_type(op->get_values()), op->get_num_batch_items(),
static_cast<int32>(op->get_common_size()[1]),
static_cast<int32>(op->get_common_size()[0]),
static_cast<int32>(op->get_common_size()[1])};
}


} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko


#endif // GKO_CUDA_BASE_BATCH_STRUCT_HPP_
#endif // GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
4 changes: 4 additions & 0 deletions common/cuda_hip/base/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@

#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/types.hpp"
#define device_type cuda_type
#define as_device_type as_cuda_type
pratikvn marked this conversation as resolved.
Show resolved Hide resolved
#elif defined(GKO_COMPILING_HIP)
#include "hip/base/types.hip.hpp"
#define device_type hip_type
#define as_device_type as_hip_type
#else
#error "Executor definition missing"
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"

#include <thrust/functional.h>
#include <thrust/transform.h>

#include <ginkgo/core/base/batch_multi_vector.hpp>
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/matrix/batch_csr.hpp>

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_csr_kernels.hpp"
#include "core/matrix/batch_struct.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_csr {


constexpr auto default_block_size = 256;


template <typename ValueType, typename IndexType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const batch::matrix::Csr<ValueType, IndexType>* mat,
Expand All @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
simple_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(mat_ub, b_ub, x_ub);
batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
0, exec->get_stream()>>>(
mat_ub, b_ub, x_ub);
}


Expand All @@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
advanced_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
beta_ub, x_ub);
batch_single_kernels::advanced_apply_kernel<<<
num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
Expand All @@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
const auto col_scale_vals = col_scale->get_const_data();
const auto row_scale_vals = row_scale->get_const_data();
const auto mat_ub = get_batch_struct(input);
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
batch_single_kernels::
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
as_device_type(col_scale_vals), as_device_type(row_scale_vals),
mat_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
Expand All @@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
const auto alpha_ub = get_batch_struct(alpha);
const auto beta_ub = get_batch_struct(beta);
const auto mat_ub = get_batch_struct(mat);
add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, beta_ub,
mat_ub);
batch_single_kernels::add_scaled_identity_kernel<<<
num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, beta_ub, mat_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);


} // namespace batch_csr
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,36 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include <thrust/functional.h>
#include <thrust/transform.h>

#include <ginkgo/core/base/batch_multi_vector.hpp>
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/matrix/batch_csr.hpp>

#include "common/cuda_hip/base/batch_struct.hpp"
#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/format_conversion.hpp"
#include "common/cuda_hip/components/reduction.hpp"
#include "common/cuda_hip/components/segment_scan.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/components/warp_blas.hpp"
#include "common/cuda_hip/matrix/batch_struct.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_single_kernels {


template <typename ValueType, typename IndexType>
__device__ __forceinline__ void simple_apply(
const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
Expand All @@ -21,23 +51,11 @@ __device__ __forceinline__ void simple_apply(
}

template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
csr::uniform_batch<
const ValueType,
IndexType>
mat,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
b,
const gko::batch::
multi_vector::
uniform_batch<
ValueType>
x)
__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
mat,
const gko::batch::multi_vector::uniform_batch<const ValueType> b,
const gko::batch::multi_vector::uniform_batch<ValueType> x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
Expand Down Expand Up @@ -71,33 +89,13 @@ __device__ __forceinline__ void advanced_apply(
}

template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void advanced_apply_kernel(const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
alpha,
const gko::batch::matrix::
csr::uniform_batch<
const ValueType,
IndexType>
mat,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
b,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
beta,
const gko::batch::
multi_vector::
uniform_batch<
ValueType>
x)
__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
mat,
const gko::batch::multi_vector::uniform_batch<const ValueType> b,
const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
const gko::batch::multi_vector::uniform_batch<ValueType> x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
Expand Down Expand Up @@ -196,3 +194,9 @@ __global__ void add_scaled_identity_kernel(
add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
}
}


} // namespace batch_single_kernels
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Loading
Loading