Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ginkgo own ILU and IC #1684

Open
wants to merge 13 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 29 additions & 11 deletions common/cuda_hip/factorization/cholesky_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ constexpr int default_block_size = 512;


#include "core/factorization/elimination_forest.hpp"


namespace kernel {


Expand Down Expand Up @@ -161,7 +163,7 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize(
}


template <typename ValueType, typename IndexType>
template <bool full_fillin, typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void factorize(
const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
const IndexType* __restrict__ storage_offsets,
Expand Down Expand Up @@ -200,9 +202,16 @@ __global__ __launch_bounds__(default_block_size) void factorize(
const auto upper_col = cols[upper_nz];
if (upper_col >= row) {
const auto upper_val = vals[upper_nz];
const auto output_pos =
lookup.lookup_unsafe(upper_col) + row_begin;
vals[output_pos] -= scale * upper_val;
if (!full_fillin) {
const auto pos = lookup[upper_col];
if (pos != invalid_index<IndexType>()) {
vals[row_begin + pos] -= scale * upper_val;
}
} else {
const auto output_pos =
lookup.lookup_unsafe(upper_col) + row_begin;
vals[output_pos] -= scale * upper_val;
}
}
}
}
Expand Down Expand Up @@ -355,20 +364,29 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
const int32* lookup_storage, const IndexType* diag_idxs,
const IndexType* transpose_idxs,
const factorization::elimination_forest<IndexType>& forest,
matrix::Csr<ValueType, IndexType>* factors,
matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
array<int>& tmp_storage)
{
const auto num_rows = factors->get_size()[0];
if (num_rows > 0) {
syncfree_storage storage(exec, tmp_storage, num_rows);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::factorize<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
transpose_idxs, as_device_type(factors->get_values()), storage,
num_rows);
if (!full_fillin) {
kernel::factorize<false>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
factors->get_const_row_ptrs(),
factors->get_const_col_idxs(), lookup_offsets,
lookup_storage, lookup_descs, diag_idxs, transpose_idxs,
as_device_type(factors->get_values()), storage, num_rows);
} else {
kernel::factorize<true>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
factors->get_const_row_ptrs(),
factors->get_const_col_idxs(), lookup_offsets,
lookup_storage, lookup_descs, diag_idxs, transpose_idxs,
as_device_type(factors->get_values()), storage, num_rows);
}
}
}

Expand Down
6 changes: 3 additions & 3 deletions common/cuda_hip/factorization/ilu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ namespace ilu_factorization {


template <typename ValueType, typename IndexType>
void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* m)
void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* m)
{
const auto id = exec->get_device_id();
auto handle = exec->get_sparselib_handle();
Expand Down Expand Up @@ -55,7 +55,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);


} // namespace ilu_factorization
Expand Down
36 changes: 27 additions & 9 deletions common/cuda_hip/factorization/lu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ __global__ __launch_bounds__(default_block_size) void initialize(
}


template <typename ValueType, typename IndexType>
template <bool full_fillin, typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void factorize(
const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
const IndexType* __restrict__ storage_offsets,
Expand Down Expand Up @@ -130,8 +130,16 @@ __global__ __launch_bounds__(default_block_size) void factorize(
upper_nz += config::warp_size) {
const auto upper_col = cols[upper_nz];
const auto upper_val = vals[upper_nz];
const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin;
vals[output_pos] -= scale * upper_val;
if (!full_fillin) {
const auto pos = lookup[upper_col];
if (pos != invalid_index<IndexType>()) {
vals[row_begin + pos] -= scale * upper_val;
}
} else {
const auto output_pos =
lookup.lookup_unsafe(upper_col) + row_begin;
vals[output_pos] -= scale * upper_val;
}
}
}
scheduler.mark_ready();
Expand Down Expand Up @@ -252,19 +260,29 @@ template <typename ValueType, typename IndexType>
void factorize(std::shared_ptr<const DefaultExecutor> exec,
const IndexType* lookup_offsets, const int64* lookup_descs,
const int32* lookup_storage, const IndexType* diag_idxs,
matrix::Csr<ValueType, IndexType>* factors,
matrix::Csr<ValueType, IndexType>* factors, bool full_fillin,
array<int>& tmp_storage)
{
const auto num_rows = factors->get_size()[0];
if (num_rows > 0) {
syncfree_storage storage(exec, tmp_storage, num_rows);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::factorize<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
as_device_type(factors->get_values()), storage, num_rows);
if (!full_fillin) {
kernel::factorize<false>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
factors->get_const_row_ptrs(),
factors->get_const_col_idxs(), lookup_offsets,
lookup_storage, lookup_descs, diag_idxs,
as_device_type(factors->get_values()), storage, num_rows);
} else {
kernel::factorize<true>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
factors->get_const_row_ptrs(),
factors->get_const_col_idxs(), lookup_offsets,
lookup_storage, lookup_descs, diag_idxs,
as_device_type(factors->get_values()), storage, num_rows);
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
namespace ilu_factorization {


GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);


} // namespace ilu_factorization
Expand Down
2 changes: 1 addition & 1 deletion core/factorization/cholesky.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ std::unique_ptr<LinOp> Cholesky<ValueType, IndexType>::generate_impl(
exec->run(make_factorize(
storage_offsets.get_const_data(), row_descs.get_const_data(),
storage.get_const_data(), diag_idxs.get_const_data(),
transpose_idxs.get_const_data(), *forest, factors.get(), tmp));
transpose_idxs.get_const_data(), *forest, factors.get(), true, tmp));
return factorization_type::create_from_combined_cholesky(
std::move(factors));
}
Expand Down
3 changes: 2 additions & 1 deletion core/factorization/cholesky_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ namespace kernels {
const int32* lookup_storage, const IndexType* diag_idxs, \
const IndexType* transpose_idxs, \
const gko::factorization::elimination_forest<IndexType>& forest, \
matrix::Csr<ValueType, IndexType>* factors, array<int>& tmp_storage)
matrix::Csr<ValueType, IndexType>* factors, bool full_fillin, \
array<int>& tmp_storage)


#define GKO_DECLARE_ALL_AS_TEMPLATES \
Expand Down
96 changes: 90 additions & 6 deletions core/factorization/ic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,17 @@
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/config/config.hpp>
#include <ginkgo/core/config/registry.hpp>
#include <ginkgo/core/factorization/cholesky.hpp>

#include "core/base/array_access.hpp"
#include "core/components/fill_array_kernels.hpp"
#include "core/config/config_helper.hpp"
#include "core/factorization/cholesky_kernels.hpp"
#include "core/factorization/elimination_forest.hpp"
#include "core/factorization/factorization_kernels.hpp"
#include "core/factorization/ic_kernels.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/matrix/csr_lookup.hpp"


namespace gko {
Expand All @@ -30,6 +36,13 @@ GKO_REGISTER_OPERATION(add_diagonal_elements,
GKO_REGISTER_OPERATION(initialize_row_ptrs_l,
factorization::initialize_row_ptrs_l);
GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l);
// for gko syncfree implementation
GKO_REGISTER_OPERATION(fill_array, components::fill_array);
GKO_REGISTER_OPERATION(build_lookup_offsets, csr::build_lookup_offsets);
GKO_REGISTER_OPERATION(build_lookup, csr::build_lookup);
GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor);
GKO_REGISTER_OPERATION(initialize, cholesky::initialize);
GKO_REGISTER_OPERATION(factorize, cholesky::factorize);


} // anonymous namespace
Expand All @@ -52,6 +65,17 @@ Ic<ValueType, IndexType>::parse(const config::pnode& config,
if (auto& obj = config.get("both_factors")) {
params.with_both_factors(config::get_value<bool>(obj));
}
if (auto& obj = config.get("algorithm")) {
using gko::factorization::factorize_algorithm;
auto str = obj.get_string();
if (str == "sparselib") {
params.with_algorithm(factorize_algorithm::sparselib);
} else if (str == "syncfree") {
params.with_algorithm(factorize_algorithm::syncfree);
} else {
GKO_INVALID_CONFIG_VALUE("algorithm", str);
}
}
return params;
}

Expand All @@ -67,7 +91,7 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(

// Converts the system matrix to CSR.
// Throws an exception if it is not convertible.
auto local_system_matrix = matrix_type::create(exec);
auto local_system_matrix = share(matrix_type::create(exec));
as<ConvertibleTo<matrix_type>>(system_matrix.get())
->convert_to(local_system_matrix);

Expand All @@ -79,15 +103,75 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
exec->run(ic_factorization::make_add_diagonal_elements(
local_system_matrix.get(), false));

std::shared_ptr<const matrix_type> ic;
// Compute LC factorization
exec->run(ic_factorization::make_compute(local_system_matrix.get()));
if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
parameters_.algorithm == factorize_algorithm::syncfree) {
std::unique_ptr<gko::factorization::elimination_forest<IndexType>>
forest;
const auto nnz = local_system_matrix->get_num_stored_elements();
const auto num_rows = local_system_matrix->get_size()[0];
auto factors = share(
matrix_type::create(exec, local_system_matrix->get_size(), nnz));
exec->copy_from(exec, nnz, local_system_matrix->get_const_col_idxs(),
factors->get_col_idxs());
exec->copy_from(exec, num_rows + 1,
local_system_matrix->get_const_row_ptrs(),
factors->get_row_ptrs());
// update srow to be safe
factors->set_strategy(factors->get_strategy());
forest =
std::make_unique<gko::factorization::elimination_forest<IndexType>>(
exec, num_rows);
exec->run(
ic_factorization::make_forest_from_factor(factors.get(), *forest));

// setup lookup structure on factors
array<IndexType> storage_offsets{exec, num_rows + 1};
array<int64> row_descs{exec, num_rows};
array<IndexType> diag_idxs{exec, num_rows};
array<IndexType> transpose_idxs{exec,
factors->get_num_stored_elements()};
const auto allowed_sparsity = gko::matrix::csr::sparsity_type::bitmap |
gko::matrix::csr::sparsity_type::full |
gko::matrix::csr::sparsity_type::hash;
exec->run(ic_factorization::make_build_lookup_offsets(
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
num_rows, allowed_sparsity, storage_offsets.get_data()));
const auto storage_size =
static_cast<size_type>(get_element(storage_offsets, num_rows));
array<int32> storage{exec, storage_size};
exec->run(ic_factorization::make_build_lookup(
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
num_rows, allowed_sparsity, storage_offsets.get_const_data(),
row_descs.get_data(), storage.get_data()));
// initialize factors
exec->run(ic_factorization::make_fill_array(
factors->get_values(), factors->get_num_stored_elements(),
zero<ValueType>()));
exec->run(ic_factorization::make_initialize(
local_system_matrix.get(), storage_offsets.get_const_data(),
row_descs.get_const_data(), storage.get_const_data(),
diag_idxs.get_data(), transpose_idxs.get_data(), factors.get()));
// run numerical factorization
array<int> tmp{exec};
exec->run(ic_factorization::make_factorize(
storage_offsets.get_const_data(), row_descs.get_const_data(),
storage.get_const_data(), diag_idxs.get_const_data(),
transpose_idxs.get_const_data(), *forest, factors.get(), false,
tmp));
ic = factors;
} else {
exec->run(ic_factorization::make_compute(local_system_matrix.get()));
ic = local_system_matrix;
}

// Extract lower factor: compute non-zeros
const auto matrix_size = local_system_matrix->get_size();
const auto matrix_size = ic->get_size();
const auto num_rows = matrix_size[0];
array<IndexType> l_row_ptrs{exec, num_rows + 1};
exec->run(ic_factorization::make_initialize_row_ptrs_l(
local_system_matrix.get(), l_row_ptrs.get_data()));
ic.get(), l_row_ptrs.get_data()));

// Get nnz from device memory
auto l_nnz = static_cast<size_type>(get_element(l_row_ptrs, num_rows));
Expand All @@ -100,8 +184,8 @@ std::unique_ptr<Composition<ValueType>> Ic<ValueType, IndexType>::generate(
std::move(l_row_ptrs), parameters_.l_strategy);

// Extract lower factor: columns and values
exec->run(ic_factorization::make_initialize_l(local_system_matrix.get(),
l_factor.get(), false));
exec->run(
ic_factorization::make_initialize_l(ic.get(), l_factor.get(), false));

if (both_factors) {
auto lh_factor = l_factor->conj_transpose();
Expand Down
Loading
Loading