diff --git a/common/preconditioner/jacobi_kernels.cpp b/common/preconditioner/jacobi_kernels.cpp new file mode 100644 index 00000000000..442a1b47809 --- /dev/null +++ b/common/preconditioner/jacobi_kernels.cpp @@ -0,0 +1,158 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include "common/base/kernel_launch.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Jacobi preconditioner namespace. + * + * @ingroup jacobi + */ +namespace jacobi { + + +template +void scalar_conj(std::shared_ptr exec, + const Array &diag, Array &conj_diag) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto elem, auto diag, auto conj_diag) { + conj_diag[elem] = conj(diag[elem]); + }, + diag.get_num_elems(), diag, conj_diag); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); + + +template +void invert_diagonal(std::shared_ptr exec, + const Array &diag, Array &inv_diag) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto elem, auto diag, auto inv_diag) { + inv_diag[elem] = safe_divide(one(diag[elem]), diag[elem]); + }, + diag.get_num_elems(), diag, inv_diag); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); + + +template +void scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *alpha, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *x) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto diag, auto alpha, auto b, + auto beta, auto x) { + x(row, col) = beta[col] * x(row, col) + + alpha[col] * b(row, col) * diag[row]; + }, + x->get_size(), diag, alpha->get_const_values(), b, + beta->get_const_values(), x); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto diag, auto alpha, auto b, + auto beta, auto x) { + x(row, col) = + beta[0] * x(row, col) + alpha[0] * b(row, col) * diag[row]; + }, + x->get_size(), diag, alpha->get_const_values(), b, + beta->get_const_values(), x); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); + + +template +void simple_scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *b, + matrix::Dense *x) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto diag, auto b, auto x) { + x(row, col) = b(row, col) * diag[row]; + }, + x->get_size(), diag, b, x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); + + +template +void scalar_convert_to_dense(std::shared_ptr exec, + const Array &blocks, + matrix::Dense *result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto diag, auto result) { + result(row, col) = zero(diag[row]); + if (row == col) { + result(row, col) = diag[row]; + } + }, + result->get_size(), blocks, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL); + + +} // namespace jacobi +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/preconditioner/jacobi_kernels.hpp.inc b/common/preconditioner/jacobi_kernels.hpp.inc index 61678280d56..c3fa889b210 100644 --- a/common/preconditioner/jacobi_kernels.hpp.inc +++ b/common/preconditioner/jacobi_kernels.hpp.inc @@ -213,50 +213,3 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi( }); } } - - -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void scalar_apply( - size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag, - const ValueType *__restrict__ alpha, size_type source_stride, - const ValueType *__restrict__ source_values, - const ValueType *__restrict__ beta, size_type result_stride, - ValueType *__restrict__ result_values) -{ - const auto tidx = thread::get_thread_id_flat(); - const auto row = tidx / num_cols; - const auto col = tidx % num_cols; - - if (row < num_rows) { - auto diag_val = - diag[row] == zero() ? one() : diag[row]; - result_values[row * result_stride + col] = - result_values[row * result_stride + col] * beta[0] + - alpha[0] * source_values[row * source_stride + col] / diag_val; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void simple_scalar_apply( - size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag, - size_type source_stride, const ValueType *__restrict__ source_values, - size_type result_stride, ValueType *__restrict__ result_values) -{ - const auto tidx = thread::get_thread_id_flat(); - const auto row = tidx / num_cols; - const auto col = tidx % num_cols; - - if (row < num_rows) { - auto diag_val = - diag[row] == zero() ? one() : diag[row]; - result_values[row * result_stride + col] = - source_values[row * source_stride + col] / diag_val; - } -} - - -} // namespace kernel diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 0c5f46f6c1f..b9400e90b3f 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -1119,6 +1119,22 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); +template +GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL); + +template +GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); + +template +GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); + template GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index da830f011a7..d62d97ec0b5 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -63,9 +63,13 @@ GKO_REGISTER_OPERATION(apply, jacobi::apply); GKO_REGISTER_OPERATION(scalar_apply, jacobi::scalar_apply); GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks); GKO_REGISTER_OPERATION(generate, jacobi::generate); +GKO_REGISTER_OPERATION(scalar_conj, jacobi::scalar_conj); +GKO_REGISTER_OPERATION(invert_diagonal, jacobi::invert_diagonal); GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi); GKO_REGISTER_OPERATION(conj_transpose_jacobi, jacobi::conj_transpose_jacobi); GKO_REGISTER_OPERATION(convert_to_dense, jacobi::convert_to_dense); +GKO_REGISTER_OPERATION(scalar_convert_to_dense, + jacobi::scalar_convert_to_dense); GKO_REGISTER_OPERATION(initialize_precisions, jacobi::initialize_precisions); @@ -119,10 +123,14 @@ void Jacobi::convert_to( { auto exec = this->get_executor(); auto tmp = matrix::Dense::create(exec, this->get_size()); - exec->run(jacobi::make_convert_to_dense( - num_blocks_, parameters_.storage_optimization.block_wise, - parameters_.block_pointers, blocks_, storage_scheme_, tmp->get_values(), - tmp->get_stride())); + if (parameters_.max_block_size == 1) { + exec->run(jacobi::make_scalar_convert_to_dense(blocks_, tmp.get())); + } else { + exec->run(jacobi::make_convert_to_dense( + num_blocks_, parameters_.storage_optimization.block_wise, + parameters_.block_pointers, blocks_, storage_scheme_, + tmp->get_values(), tmp->get_stride())); + } tmp->move_to(result); } @@ -141,29 +149,40 @@ void Jacobi::write(mat_data &data) const make_temporary_clone(this->get_executor()->get_master(), this); data = {local_clone->get_size(), {}}; - const auto ptrs = local_clone->parameters_.block_pointers.get_const_data(); - for (size_type block = 0; block < local_clone->get_num_blocks(); ++block) { - const auto scheme = local_clone->get_storage_scheme(); - const auto group_data = local_clone->blocks_.get_const_data() + - scheme.get_group_offset(block); - const auto block_size = ptrs[block + 1] - ptrs[block]; - const auto precisions = local_clone->parameters_.storage_optimization - .block_wise.get_const_data(); - const auto prec = - precisions ? precisions[block] : precision_reduction(); - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(ValueType, prec, { - const auto block_data = - reinterpret_cast(group_data) + - scheme.get_block_offset(block); - for (IndexType row = 0; row < block_size; ++row) { - for (IndexType col = 0; col < block_size; ++col) { - data.nonzeros.emplace_back( - ptrs[block] + row, ptrs[block] + col, - static_cast( - block_data[row + col * scheme.get_stride()])); + if (parameters_.max_block_size == 1) { + for (IndexType row = 0; row < data.size[0]; ++row) { + data.nonzeros.emplace_back( + row, row, + static_cast(local_clone->get_blocks()[row])); + } + } else { + const auto ptrs = + local_clone->parameters_.block_pointers.get_const_data(); + for (size_type block = 0; block < local_clone->get_num_blocks(); + ++block) { + const auto scheme = local_clone->get_storage_scheme(); + const auto group_data = local_clone->blocks_.get_const_data() + + scheme.get_group_offset(block); + const auto block_size = ptrs[block + 1] - ptrs[block]; + const auto precisions = + local_clone->parameters_.storage_optimization.block_wise + .get_const_data(); + const auto prec = + precisions ? precisions[block] : precision_reduction(); + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(ValueType, prec, { + const auto block_data = + reinterpret_cast(group_data) + + scheme.get_block_offset(block); + for (IndexType row = 0; row < block_size; ++row) { + for (IndexType col = 0; col < block_size; ++col) { + data.nonzeros.emplace_back( + ptrs[block] + row, ptrs[block] + col, + static_cast( + block_data[row + col * scheme.get_stride()])); + } } - } - }); + }); + } } } @@ -180,10 +199,15 @@ std::unique_ptr Jacobi::transpose() const res->blocks_.resize_and_reset(blocks_.get_num_elems()); res->conditioning_ = conditioning_; res->parameters_ = parameters_; - this->get_executor()->run(jacobi::make_transpose_jacobi( - num_blocks_, parameters_.max_block_size, - parameters_.storage_optimization.block_wise, parameters_.block_pointers, - blocks_, storage_scheme_, res->blocks_)); + if (parameters_.max_block_size == 1) { + res->blocks_ = blocks_; + } else { + this->get_executor()->run(jacobi::make_transpose_jacobi( + num_blocks_, parameters_.max_block_size, + parameters_.storage_optimization.block_wise, + parameters_.block_pointers, blocks_, storage_scheme_, + res->blocks_)); + } return std::move(res); } @@ -201,10 +225,16 @@ std::unique_ptr Jacobi::conj_transpose() const res->blocks_.resize_and_reset(blocks_.get_num_elems()); res->conditioning_ = conditioning_; res->parameters_ = parameters_; - this->get_executor()->run(jacobi::make_conj_transpose_jacobi( - num_blocks_, parameters_.max_block_size, - parameters_.storage_optimization.block_wise, parameters_.block_pointers, - blocks_, storage_scheme_, res->blocks_)); + if (parameters_.max_block_size == 1) { + this->get_executor()->run( + jacobi::make_scalar_conj(this->blocks_, res->blocks_)); + } else { + this->get_executor()->run(jacobi::make_conj_transpose_jacobi( + num_blocks_, parameters_.max_block_size, + parameters_.storage_optimization.block_wise, + parameters_.block_pointers, blocks_, storage_scheme_, + res->blocks_)); + } return std::move(res); } @@ -244,7 +274,8 @@ void Jacobi::generate(const LinOp *system_matrix, auto temp = Array::view(diag_vt->get_executor(), diag_vt->get_size()[0], diag_vt->get_values()); - this->blocks_ = temp; + this->blocks_ = Array(exec, temp.get_num_elems()); + exec->run(jacobi::make_invert_diagonal(temp, this->blocks_)); this->num_blocks_ = diag_vt->get_size()[0]; } else { auto csr_mtx = convert_to_with_sorting(exec, system_matrix, diff --git a/core/preconditioner/jacobi_kernels.hpp b/core/preconditioner/jacobi_kernels.hpp index 22cfa512bb8..7745438abda 100644 --- a/core/preconditioner/jacobi_kernels.hpp +++ b/core/preconditioner/jacobi_kernels.hpp @@ -62,6 +62,16 @@ namespace kernels { Array &block_precisions, \ const Array &block_pointers, Array &blocks) +#define GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL(ValueType) \ + void scalar_conj(std::shared_ptr exec, \ + const Array &diag, \ + Array &conj_diag) + +#define GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL(ValueType) \ + void invert_diagonal(std::shared_ptr exec, \ + const Array &diag, \ + Array &inv_diag) + #define GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType) \ void apply( \ std::shared_ptr exec, size_type num_blocks, \ @@ -120,6 +130,11 @@ namespace kernels { &storage_scheme, \ Array &out_blocks) +#define GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL(ValueType) \ + void scalar_convert_to_dense(std::shared_ptr exec, \ + const Array &blocks, \ + matrix::Dense *result) + #define GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ void convert_to_dense( \ std::shared_ptr exec, size_type num_blocks, \ @@ -141,6 +156,10 @@ namespace kernels { template \ GKO_DECLARE_JACOBI_GENERATE_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL(ValueType); \ + template \ + GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL(ValueType); \ + template \ GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType); \ template \ GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType); \ @@ -152,6 +171,8 @@ namespace kernels { GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL(ValueType); \ template \ GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL() diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index df512019d88..602dea8ee24 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -71,6 +71,7 @@ set(GKO_CUDA_COMMON_SOURCES ../common/matrix/csr_kernels.cpp ../common/matrix/dense_kernels.cpp ../common/matrix/diagonal_kernels.cpp + ../common/preconditioner/jacobi_kernels.cpp ../common/solver/bicg_kernels.cpp ../common/solver/bicgstab_kernels.cpp ../common/solver/cg_kernels.cpp diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index bb9b918f5d9..1ff609791c7 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -243,61 +243,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); -template -void scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *alpha, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *x) -{ - const auto b_size = b->get_size(); - const auto num_rows = b_size[0]; - const auto num_cols = b_size[1]; - const auto b_stride = b->get_stride(); - const auto x_stride = x->get_stride(); - const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); - - const auto b_values = b->get_const_values(); - const auto diag_values = diag.get_const_data(); - auto x_values = x->get_values(); - - kernel::scalar_apply<<>>( - num_rows, num_cols, as_cuda_type(diag_values), - as_cuda_type(alpha->get_const_values()), b_stride, - as_cuda_type(b_values), as_cuda_type(beta->get_const_values()), - x_stride, as_cuda_type(x_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); - - -template -void simple_scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *b, - matrix::Dense *x) -{ - const auto b_size = b->get_size(); - const auto num_rows = b_size[0]; - const auto num_cols = b_size[1]; - const auto b_stride = b->get_stride(); - const auto x_stride = x->get_stride(); - const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); - - const auto b_values = b->get_const_values(); - const auto diag_values = diag.get_const_data(); - auto x_values = x->get_values(); - - kernel::simple_scalar_apply<<>>( - num_rows, num_cols, as_cuda_type(diag_values), b_stride, - as_cuda_type(b_values), x_stride, as_cuda_type(x_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); - - } // namespace jacobi } // namespace cuda } // namespace kernels diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index f167bac3742..92ac330364b 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -52,6 +52,7 @@ target_sources(ginkgo_dpcpp ../common/matrix/csr_kernels.cpp ../common/matrix/dense_kernels.cpp ../common/matrix/diagonal_kernels.cpp + ../common/preconditioner/jacobi_kernels.cpp ../common/solver/bicg_kernels.cpp ../common/solver/bicgstab_kernels.cpp ../common/solver/cg_kernels.cpp diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 3b7f1efb90f..623e2c121ec 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -280,27 +280,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); -template -void scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *alpha, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *x) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); - - -template -void simple_scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *b, - matrix::Dense *x) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); - - } // namespace jacobi } // namespace dpcpp } // namespace kernels diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 0778c41a39e..0bae0c057fc 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -190,6 +190,7 @@ set(GINKGO_HIP_SOURCES ../common/matrix/csr_kernels.cpp ../common/matrix/dense_kernels.cpp ../common/matrix/diagonal_kernels.cpp + ../common/preconditioner/jacobi_kernels.cpp ../common/solver/bicg_kernels.cpp ../common/solver/bicgstab_kernels.cpp ../common/solver/cg_kernels.cpp diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index 57d513774b5..425820ead4e 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -257,62 +257,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); -template -void scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *alpha, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *x) -{ - const auto b_size = b->get_size(); - const auto num_rows = b_size[0]; - const auto num_cols = b_size[1]; - const auto b_stride = b->get_stride(); - const auto x_stride = x->get_stride(); - const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); - - const auto b_values = b->get_const_values(); - const auto diag_values = diag.get_const_data(); - auto x_values = x->get_values(); - - hipLaunchKernelGGL( - kernel::scalar_apply, dim3(grid_dim), dim3(default_block_size), 0, 0, - num_rows, num_cols, as_hip_type(diag_values), - as_hip_type(alpha->get_const_values()), b_stride, as_hip_type(b_values), - as_hip_type(beta->get_const_values()), x_stride, as_hip_type(x_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); - - -template -void simple_scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *b, - matrix::Dense *x) -{ - const auto b_size = b->get_size(); - const auto num_rows = b_size[0]; - const auto num_cols = b_size[1]; - const auto b_stride = b->get_stride(); - const auto x_stride = x->get_stride(); - const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); - - const auto b_values = b->get_const_values(); - const auto diag_values = diag.get_const_data(); - auto x_values = x->get_values(); - - hipLaunchKernelGGL(kernel::simple_scalar_apply, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, num_cols, - as_hip_type(diag_values), b_stride, - as_hip_type(b_values), x_stride, as_hip_type(x_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); - - } // namespace jacobi } // namespace hip } // namespace kernels diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp index dd05296abde..76e571baf10 100644 --- a/include/ginkgo/core/preconditioner/jacobi.hpp +++ b/include/ginkgo/core/preconditioner/jacobi.hpp @@ -200,6 +200,10 @@ struct block_interleaved_storage_scheme { * @note When using the adaptive variant, there may be a trade-off in terms of * slightly longer preconditioner generation due to extra work required to * detect the optimal precision of the blocks. + * @note When the max_block_size is set to 1, specialized kernels are used, + * both for generation (inverting the diagonals) and application (diagonal + * scaling) to reduce the overhead involved in the usual (adaptive) block + * case. * * @ingroup jacobi * @ingroup precond @@ -300,7 +304,10 @@ class Jacobi : public EnableLinOp>, /** * Maximal size of diagonal blocks. * - * @note This value has to be between 1 and 32 (NVIDIA)/64 (AMD). + * @note This value has to be between 1 and 32 (NVIDIA)/64 (AMD). For + * efficiency, when the max_block_size is set to 1, specialized kernels + * are used and the additional objects (block_ptrs etc) are set to null + * values. */ uint32 GKO_FACTORY_PARAMETER_SCALAR(max_block_size, 32u); diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index faf49154f34..f2bc90284b1 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -39,6 +39,7 @@ target_sources(ginkgo_omp ../common/matrix/csr_kernels.cpp ../common/matrix/dense_kernels.cpp ../common/matrix/diagonal_kernels.cpp + ../common/preconditioner/jacobi_kernels.cpp ../common/solver/bicg_kernels.cpp ../common/solver/bicgstab_kernels.cpp ../common/solver/cg_kernels.cpp diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 605ba433578..1b0c0c6c1a5 100644 --- a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -696,50 +696,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); -template -void scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *alpha, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *x) -{ -#pragma omp parallel for - for (size_type i = 0; i < x->get_size()[0]; ++i) { - for (size_type j = 0; j < x->get_size()[1]; ++j) { - auto diag_val = diag.get_const_data()[i] == zero() - ? one() - : diag.get_const_data()[i]; - x->at(i, j) = beta->at(0) * x->at(i, j) + - alpha->at(0) * b->at(i, j) / diag_val; - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); - - -template -void simple_scalar_apply(std::shared_ptr exec, - const Array &diag, - const matrix::Dense *b, - matrix::Dense *x) -{ -#pragma omp parallel for - for (size_type i = 0; i < x->get_size()[0]; ++i) { - for (size_type j = 0; j < x->get_size()[1]; ++j) { - auto diag_val = diag.get_const_data()[i] == zero() - ? one() - : diag.get_const_data()[i]; - x->at(i, j) = b->at(i, j) / diag_val; - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); - - } // namespace jacobi } // namespace omp } // namespace kernels diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index a176b01e4bf..e8abd8b8958 100644 --- a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -570,11 +570,8 @@ void scalar_apply(std::shared_ptr exec, { for (size_type i = 0; i < x->get_size()[0]; ++i) { for (size_type j = 0; j < x->get_size()[1]; ++j) { - auto diag_val = diag.get_const_data()[i] == zero() - ? one() - : diag.get_const_data()[i]; x->at(i, j) = beta->at(0) * x->at(i, j) + - alpha->at(0) * b->at(i, j) / diag_val; + alpha->at(0) * b->at(i, j) * diag.get_const_data()[i]; } } } @@ -590,10 +587,7 @@ void simple_scalar_apply(std::shared_ptr exec, { for (size_type i = 0; i < x->get_size()[0]; ++i) { for (size_type j = 0; j < x->get_size()[1]; ++j) { - auto diag_val = diag.get_const_data()[i] == zero() - ? one() - : diag.get_const_data()[i]; - x->at(i, j) = b->at(i, j) / diag_val; + x->at(i, j) = b->at(i, j) * diag.get_const_data()[i]; } } } @@ -602,6 +596,33 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); +template +void scalar_conj(std::shared_ptr exec, + const Array &diag, Array &conj_diag) +{ + for (size_type i = 0; i < diag.get_num_elems(); ++i) { + conj_diag.get_data()[i] = conj(diag.get_const_data()[i]); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); + + +template +void invert_diagonal(std::shared_ptr exec, + const Array &diag, Array &inv_diag) +{ + for (size_type i = 0; i < diag.get_num_elems(); ++i) { + auto diag_val = diag.get_const_data()[i] == zero() + ? one() + : diag.get_const_data()[i]; + inv_diag.get_data()[i] = one() / diag_val; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); + + template void transpose_jacobi( std::shared_ptr exec, size_type num_blocks, @@ -674,6 +695,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); +template +void scalar_convert_to_dense(std::shared_ptr exec, + const Array &blocks, + matrix::Dense *result) +{ + auto matrix_size = result->get_size(); + for (size_type i = 0; i < matrix_size[0]; ++i) { + for (size_type j = 0; j < matrix_size[1]; ++j) { + result->at(i, j) = zero(); + if (i == j) { + result->at(i, j) = blocks.get_const_data()[i]; + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL); + + template void convert_to_dense( std::shared_ptr exec, size_type num_blocks, diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 8f79e8976fb..330195cee6a 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -107,9 +107,7 @@ class Jacobi : public ::testing::Test { template void init_array(T *arr, std::initializer_list vals) { - for (auto elem : vals) { - *(arr++) = elem; - } + std::copy(std::begin(vals), std::end(vals), arr); } template @@ -310,6 +308,108 @@ TYPED_TEST(Jacobi, CanBeClearedWithAdaptivePrecision) } +TYPED_TEST(Jacobi, ScalarJacobiConvertsToDense) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Bj = typename TestFixture::Bj; + gko::matrix_data data; + auto csr = gko::share( + gko::matrix::Csr::create(this->exec)); + csr->copy_from(gko::lend(this->mtx)); + auto scalar_j = this->scalar_j_factory->generate(csr); + + auto dense_j = gko::matrix::Dense::create(this->exec); + dense_j->copy_from(scalar_j.get()); + auto j_val = scalar_j->get_blocks(); + + for (auto i = 0; i < dense_j->get_size()[0]; ++i) { + for (auto j = 0; j < dense_j->get_size()[1]; ++j) { + if (i == j) { + EXPECT_EQ(dense_j->at(i, j), j_val[j]); + } else { + EXPECT_EQ(dense_j->at(i, j), value_type{0.0}); + } + } + } +} + + +TYPED_TEST(Jacobi, ScalarJacobiCanBeTransposed) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Bj = typename TestFixture::Bj; + gko::matrix_data data; + auto csr = gko::share( + gko::matrix::Csr::create(this->exec)); + csr->copy_from(gko::lend(this->mtx)); + auto scalar_j = this->scalar_j_factory->generate(csr); + + auto dense_j = gko::matrix::Dense::create(this->exec); + auto t_j = scalar_j->transpose(); + auto trans_j = gko::as(t_j.get())->get_blocks(); + auto scal_j = scalar_j->get_blocks(); + + ASSERT_EQ(scalar_j->get_size(), gko::dim<2>(5, 5)); + ASSERT_EQ(scalar_j->get_num_stored_elements(), 5); + ASSERT_EQ(scalar_j->get_parameters().max_block_size, 1); + ASSERT_EQ(scalar_j->get_num_blocks(), 5); + ASSERT_EQ(scalar_j->get_parameters().block_pointers.get_const_data(), + nullptr); + EXPECT_EQ(trans_j[0], scal_j[0]); + EXPECT_EQ(trans_j[1], scal_j[1]); + EXPECT_EQ(trans_j[2], scal_j[2]); + EXPECT_EQ(trans_j[3], scal_j[3]); + EXPECT_EQ(trans_j[4], scal_j[4]); +} + + +template +void init_array(T *arr, std::initializer_list vals) +{ + std::copy(std::begin(vals), std::end(vals), arr); +} + + +TEST(Jacobi, ScalarJacobiCanBeConjTransposed) +{ + using value_type = std::complex; + using vt = value_type; + using index_type = int; + using Bj = gko::preconditioner::Jacobi; + gko::matrix_data data; + using Mtx = gko::matrix::Csr; + auto exec = gko::ReferenceExecutor::create(); + auto csr = gko::share(Mtx::create(exec, gko::dim<2>(5, 5), 13)); + auto scalar_j_factory = Bj::build().with_max_block_size(1u).on(exec); + init_array(csr->get_row_ptrs(), {0, 3, 5, 7, 10, 13}); + init_array(csr->get_col_idxs(), + {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4}); + init_array( + csr->get_values(), + {vt(4.0, 1), vt(-2.0), vt(-2.0), vt(-1.0), vt(4.0, -1), vt(4.0), + vt(-2.0), vt(-1.0), vt(4.0), vt(-2.0), vt(-1.0), vt(-1.0), vt(4.0)}); + auto scalar_j = scalar_j_factory->generate(csr); + + auto t_j = scalar_j->conj_transpose(); + auto trans_j = gko::as(t_j.get())->get_blocks(); + auto scal_j = scalar_j->get_blocks(); + + ASSERT_EQ(scalar_j->get_size(), gko::dim<2>(5, 5)); + ASSERT_EQ(scalar_j->get_num_stored_elements(), 5); + ASSERT_EQ(scalar_j->get_parameters().max_block_size, 1); + ASSERT_EQ(scalar_j->get_num_blocks(), 5); + ASSERT_EQ(scalar_j->get_parameters().block_pointers.get_const_data(), + nullptr); + EXPECT_EQ(trans_j[0], gko::conj(scal_j[0])); + EXPECT_EQ(trans_j[1], gko::conj(scal_j[1])); + EXPECT_EQ(trans_j[2], gko::conj(scal_j[2])); + EXPECT_EQ(trans_j[3], gko::conj(scal_j[3])); + EXPECT_EQ(trans_j[4], gko::conj(scal_j[4])); +} + + #define GKO_EXPECT_NONZERO_NEAR(first, second, tol) \ EXPECT_EQ(first.row, second.row); \ EXPECT_EQ(first.column, second.column); \ @@ -351,6 +451,31 @@ TYPED_TEST(Jacobi, GeneratesCorrectMatrixData) } +TYPED_TEST(Jacobi, ScalarJacobiGeneratesCorrectMatrixData) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Bj = typename TestFixture::Bj; + gko::matrix_data data; + using tpl = typename decltype(data)::nonzero_type; + auto csr = gko::share( + gko::matrix::Csr::create(this->exec)); + csr->copy_from(gko::lend(this->mtx)); + auto scalar_j = this->scalar_j_factory->generate(csr); + + scalar_j->write(data); + + auto tol = r::value; + ASSERT_EQ(data.size, gko::dim<2>{5}); + ASSERT_EQ(data.nonzeros.size(), 5); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 1 / 4.0), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(1, 1, 1 / 4.0), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(2, 2, 1 / 4.0), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(3, 3, 1 / 4.0), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(4, 4, 1 / 4.0), tol); +} + + TYPED_TEST(Jacobi, GeneratesCorrectMatrixDataWithAdaptivePrecision) { using value_type = typename TestFixture::value_type; @@ -393,11 +518,11 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision) ASSERT_NO_THROW(bj = this->scalar_j_factory->generate(csr)); ASSERT_EQ(bj->get_num_blocks(), 5u); - ASSERT_EQ(bj->get_blocks()[0], value_type{4}); - ASSERT_EQ(bj->get_blocks()[1], value_type{4}); - ASSERT_EQ(bj->get_blocks()[2], value_type{4}); - ASSERT_EQ(bj->get_blocks()[3], value_type{4}); - ASSERT_EQ(bj->get_blocks()[4], value_type{4}); + ASSERT_EQ(bj->get_blocks()[0], value_type{0.25}); + ASSERT_EQ(bj->get_blocks()[1], value_type{0.25}); + ASSERT_EQ(bj->get_blocks()[2], value_type{0.25}); + ASSERT_EQ(bj->get_blocks()[3], value_type{0.25}); + ASSERT_EQ(bj->get_blocks()[4], value_type{0.25}); }