Add support for host operations to executor #1232

Merged
merged 3 commits on Dec 18, 2022

12 changes: 12 additions & 0 deletions benchmark/preconditioner/preconditioner.cpp
@@ -201,10 +201,16 @@ void run_preconditioner(const char* precond_name,
         auto gen_logger =
             std::make_shared<OperationLogger>(FLAGS_nested_names);
         exec->add_logger(gen_logger);
+        if (exec->get_master() != exec) {
+            exec->get_master()->add_logger(gen_logger);
+        }
         std::unique_ptr<gko::LinOp> precond_op;
         for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) {
             precond_op = precond->generate(system_matrix);
         }
+        if (exec->get_master() != exec) {
+            exec->get_master()->remove_logger(gko::lend(gen_logger));
+        }
         exec->remove_logger(gko::lend(gen_logger));
 
         gen_logger->write_data(this_precond_data["generate"]["components"],
@@ -213,9 +219,15 @@ void run_preconditioner(const char* precond_name,
         auto apply_logger =
             std::make_shared<OperationLogger>(FLAGS_nested_names);
         exec->add_logger(apply_logger);
+        if (exec->get_master() != exec) {
+            exec->get_master()->add_logger(apply_logger);
+        }
         for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) {
             precond_op->apply(lend(b), lend(x_clone));
         }
+        if (exec->get_master() != exec) {
+            exec->get_master()->remove_logger(gko::lend(apply_logger));
+        }
         exec->remove_logger(gko::lend(apply_logger));
 
         apply_logger->write_data(this_precond_data["apply"]["components"],
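Note on the benchmark changes above (and the analogous solver changes below): with this PR, host operations are logged on the master (host) executor, so the benchmarks attach each OperationLogger to both the device executor and its master while timing. A minimal sketch of that pattern, using only the Ginkgo calls visible in the diff; the helper name and its Work parameter are illustrative, not part of the PR:

    // Attach the logger to the device executor and, if different, to its
    // master executor, so host operations are captured as well.
    template <typename Work>
    void run_with_operation_logger(std::shared_ptr<gko::Executor> exec,
                                   std::shared_ptr<OperationLogger> logger,
                                   Work&& work)
    {
        exec->add_logger(logger);
        if (exec->get_master() != exec) {
            exec->get_master()->add_logger(logger);
        }
        work();  // e.g. precond->generate(...) or solver->apply(...)
        if (exec->get_master() != exec) {
            exec->get_master()->remove_logger(gko::lend(logger));
        }
        exec->remove_logger(gko::lend(logger));
    }
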
12 changes: 12 additions & 0 deletions benchmark/solver/solver.cpp
@@ -435,12 +435,18 @@ void solve_system(const std::string& solver_name,
         auto gen_logger =
             std::make_shared<OperationLogger>(FLAGS_nested_names);
         exec->add_logger(gen_logger);
+        if (exec->get_master() != exec) {
+            exec->get_master()->add_logger(gen_logger);
+        }
 
         auto precond = precond_factory.at(precond_name)(exec);
         solver = generate_solver(exec, give(precond), solver_name,
                                  FLAGS_max_iters)
                      ->generate(system_matrix);
 
+        if (exec->get_master() != exec) {
+            exec->get_master()->remove_logger(gko::lend(gen_logger));
+        }
         exec->remove_logger(gko::lend(gen_logger));
         gen_logger->write_data(solver_json["generate"]["components"],
                                allocator, 1);
@@ -459,9 +465,15 @@ void solve_system(const std::string& solver_name,
         auto apply_logger =
             std::make_shared<OperationLogger>(FLAGS_nested_names);
         exec->add_logger(apply_logger);
+        if (exec->get_master() != exec) {
+            exec->get_master()->add_logger(apply_logger);
+        }
 
         solver->apply(lend(b), lend(x_clone));
 
+        if (exec->get_master() != exec) {
+            exec->get_master()->remove_logger(gko::lend(apply_logger));
+        }
         exec->remove_logger(gko::lend(apply_logger));
         apply_logger->write_data(solver_json["apply"]["components"],
                                  allocator, 1);
35 changes: 19 additions & 16 deletions core/factorization/elimination_forest.cpp
@@ -181,35 +181,38 @@ void elimination_forest<IndexType>::set_executor(
 
 
 template <typename ValueType, typename IndexType>
-elimination_forest<IndexType> compute_elim_forest(
-    const matrix::Csr<ValueType, IndexType>* mtx)
+void compute_elim_forest(const matrix::Csr<ValueType, IndexType>* mtx,
+                         std::unique_ptr<elimination_forest<IndexType>>& forest)
 {
     const auto host_exec = mtx->get_executor()->get_master();
     const auto host_mtx = make_temporary_clone(host_exec, mtx);
     const auto num_rows = static_cast<IndexType>(host_mtx->get_size()[0]);
-    elimination_forest<IndexType> forest{host_exec, num_rows};
+    forest =
+        std::make_unique<elimination_forest<IndexType>>(host_exec, num_rows);
     compute_elim_forest_parent_impl(host_exec, host_mtx->get_const_row_ptrs(),
                                     host_mtx->get_const_col_idxs(), num_rows,
-                                    forest.parents.get_data());
-    compute_elim_forest_children_impl(forest.parents.get_const_data(), num_rows,
-                                      forest.child_ptrs.get_data(),
-                                      forest.children.get_data());
+                                    forest->parents.get_data());
+    compute_elim_forest_children_impl(forest->parents.get_const_data(),
+                                      num_rows, forest->child_ptrs.get_data(),
+                                      forest->children.get_data());
     compute_elim_forest_postorder_impl(
-        host_exec, forest.parents.get_const_data(),
-        forest.child_ptrs.get_const_data(), forest.children.get_const_data(),
-        num_rows, forest.postorder.get_data(), forest.inv_postorder.get_data());
+        host_exec, forest->parents.get_const_data(),
+        forest->child_ptrs.get_const_data(), forest->children.get_const_data(),
+        num_rows, forest->postorder.get_data(),
+        forest->inv_postorder.get_data());
     compute_elim_forest_postorder_parent_impl(
-        forest.parents.get_const_data(), forest.inv_postorder.get_const_data(),
-        num_rows, forest.postorder_parents.get_data());
+        forest->parents.get_const_data(),
+        forest->inv_postorder.get_const_data(), num_rows,
+        forest->postorder_parents.get_data());
 
-    forest.set_executor(mtx->get_executor());
-    return forest;
+    forest->set_executor(mtx->get_executor());
 }
 
 
 #define GKO_DECLARE_COMPUTE_ELIM_FOREST(ValueType, IndexType) \
-    elimination_forest<IndexType> compute_elim_forest( \
-        const matrix::Csr<ValueType, IndexType>* mtx)
+    void compute_elim_forest( \
+        const matrix::Csr<ValueType, IndexType>* mtx, \
+        std::unique_ptr<elimination_forest<IndexType>>& forest)
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST);
 
5 changes: 3 additions & 2 deletions core/factorization/elimination_forest.hpp
@@ -63,8 +63,9 @@ struct elimination_forest {
 
 
 template <typename ValueType, typename IndexType>
-elimination_forest<IndexType> compute_elim_forest(
-    const matrix::Csr<ValueType, IndexType>* mtx);
+void compute_elim_forest(
+    const matrix::Csr<ValueType, IndexType>* mtx,
+    std::unique_ptr<elimination_forest<IndexType>>& forest);
 
 
 }  // namespace factorization
7 changes: 5 additions & 2 deletions core/factorization/lu.cpp
@@ -56,6 +56,9 @@ GKO_REGISTER_OPERATION(build_lookup_offsets, csr::build_lookup_offsets);
 GKO_REGISTER_OPERATION(build_lookup, csr::build_lookup);
 GKO_REGISTER_OPERATION(initialize, lu_factorization::initialize);
 GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
+GKO_REGISTER_HOST_OPERATION(symbolic_cholesky,
+                            gko::factorization::symbolic_cholesky);
+GKO_REGISTER_HOST_OPERATION(symbolic_lu, gko::factorization::symbolic_lu);
 
 
 }  // namespace
@@ -93,9 +96,9 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
     std::unique_ptr<matrix_type> factors;
     if (!parameters_.symbolic_factorization) {
         if (parameters_.symmetric_sparsity) {
-            factors = gko::factorization::symbolic_cholesky(mtx.get());
+            exec->run(make_symbolic_cholesky(mtx.get(), factors));
         } else {
-            factors = gko::factorization::symbolic_lu(mtx.get());
+            exec->run(make_symbolic_lu(mtx.get(), factors));
         }
     } else {
         const auto& symbolic = parameters_.symbolic_factorization;
43 changes: 23 additions & 20 deletions core/factorization/symbolic.cpp
@@ -58,45 +58,48 @@ GKO_REGISTER_OPERATION(cholesky_symbolic,
 GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum);
 GKO_REGISTER_OPERATION(initialize, lu_factorization::initialize);
 GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
+GKO_REGISTER_HOST_OPERATION(compute_elim_forest, compute_elim_forest);
 
 
 }  // namespace
 
 
 /** Computes the symbolic Cholesky factorization of the given matrix. */
 template <typename ValueType, typename IndexType>
-std::unique_ptr<matrix::Csr<ValueType, IndexType>> symbolic_cholesky(
-    const matrix::Csr<ValueType, IndexType>* mtx)
+void symbolic_cholesky(
+    const matrix::Csr<ValueType, IndexType>* mtx,
+    std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 {
     using matrix_type = matrix::Csr<ValueType, IndexType>;
     const auto exec = mtx->get_executor();
     const auto host_exec = exec->get_master();
-    const auto forest = compute_elim_forest(mtx);
+    std::unique_ptr<elimination_forest<IndexType>> forest;
+    exec->run(make_compute_elim_forest(mtx, forest));
Review discussion on this change:

Member: Is there some drawback in doing exec->get_master()->run(...) instead of having a separate call? Then you don't need GKO_REGISTER_HOST_OPERATION at all, right?

Member Author (@upsj, Dec 8, 2022): It's not being logged with the executor the object lives on, which is also the reason we need the workaround in the benchmarks. With the suggested approach, we'd need to pass an additional Executor parameter, provide overloads for both OmpExecutor and ReferenceExecutor, and put them inside the kernels namespace, which IMO isn't a good place for core functionality.

Member: In my opinion, anything computationally expensive shouldn't happen on the host; it should be on the kernel side. That is also in line with the Ginkgo philosophy of kernels doing the hard work and the host orchestrating the calls. It is perfectly fine if these kernels are serial (reference, or OpenMP forced onto a single thread), but I don't think we should do anything intensive on the host.

Member Author (@upsj): The operations we are looking at here (symbolic factorizations, elimination tree, later on MC64, AMD, METIS, ...) are either inherently sequential or very hard to parallelize (see e.g. RCM), let alone implement on GPUs. So either we duplicate the same implementation in OpenMP and Reference and call the kernels for host data only, or we implement them in a single place, in this case core. I think the latter is the more sensible choice here.

Member: I agree that the operations are sequential and probably not suited for GPUs. My point is that they perform computations which are potentially expensive (as the problem size scales up) and hence should not live on the core side. Regarding the duplication, we can have a common kernel file (like we do for HIP and CUDA) and use the serial implementation for OpenMP (forcing it onto one thread) and reference alike.

Performing computation on the host side can also have implications for future design. For example, if we want to add stream/asynchronous tasking for these operations, keeping them on the kernel side would let us keep the asynchronous tasking/streaming kernel interface and keep the core side clean.

IMO avoiding this also keeps our logging and profiling interface clean, without fragmenting it into host operations and kernel operations.

Member Author (@upsj, Dec 9, 2022): I think we need to unpack the meaning of core and kernel a bit more here: core is everything in libginkgo.so, kernels are everything in libginkgo_<backend>.so. You are talking about a conceptual separation that IMO doesn't apply here; we already have some expensive operations in core, mainly factorization-related. The only thing this PR does is make them visible to profilers regardless of the executor the objects live on. The logging interface is not impacted by this change, since to the loggers, host and kernel operations look the same.

If we were to make them kernel operations, we would have to do a lot more work: kernels can't create Ginkgo objects, so we would need to operate on arrays and combine them into Ginkgo objects on the core side again. As a minor point, it would also double the compilation time and binary size of the resulting functions.

Member: Yes, I would advocate for making them kernel operations, so this might be something we discuss before merging this PR.

Allocations should not happen inside an operation, and I think that is all the more reason to have the allocations on the host side and the operations on the kernel side. We log and profile the allocations separately anyway, so having allocations inside the host operation makes the profiling and logging muddled.

Member Author (@upsj): We allocate a lot inside kernels, e.g. SpGEMM/SpGEAM, sparselib bindings, and all kernels that use temporary memory; removing these allocations or moving them into core would complicate the control flow significantly.

MarcelKoch marked this conversation as resolved.
     const auto num_rows = mtx->get_size()[0];
     array<IndexType> row_ptrs{exec, num_rows + 1};
     array<IndexType> tmp{exec};
     exec->run(
-        make_cholesky_symbolic_count(mtx, forest, row_ptrs.get_data(), tmp));
+        make_cholesky_symbolic_count(mtx, *forest, row_ptrs.get_data(), tmp));
     exec->run(make_prefix_sum(row_ptrs.get_data(), num_rows + 1));
     const auto factor_nnz = static_cast<size_type>(
         exec->copy_val_to_host(row_ptrs.get_const_data() + num_rows));
-    auto factor = matrix_type::create(
+    factors = matrix_type::create(
         exec, mtx->get_size(), array<ValueType>{exec, factor_nnz},
         array<IndexType>{exec, factor_nnz}, std::move(row_ptrs));
-    exec->run(make_cholesky_symbolic(mtx, forest, factor.get(), tmp));
-    factor->sort_by_column_index();
-    auto lt_factor = as<matrix_type>(factor->transpose());
+    exec->run(make_cholesky_symbolic(mtx, *forest, factors.get(), tmp));
+    factors->sort_by_column_index();
+    auto lt_factor = as<matrix_type>(factors->transpose());
     const auto scalar =
         initialize<matrix::Dense<ValueType>>({one<ValueType>()}, exec);
     const auto id = matrix::Identity<ValueType>::create(exec, num_rows);
-    lt_factor->apply(scalar.get(), id.get(), scalar.get(), factor.get());
-    return factor;
+    lt_factor->apply(scalar.get(), id.get(), scalar.get(), factors.get());
 }
 
 
-#define GKO_DECLARE_SYMBOLIC_CHOLESKY(ValueType, IndexType) \
-    std::unique_ptr<matrix::Csr<ValueType, IndexType>> symbolic_cholesky( \
-        const matrix::Csr<ValueType, IndexType>* mtx)
+#define GKO_DECLARE_SYMBOLIC_CHOLESKY(ValueType, IndexType) \
+    void symbolic_cholesky( \
+        const matrix::Csr<ValueType, IndexType>* mtx, \
+        std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY);
 
@@ -109,8 +112,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY);
  * "GSoFa: Scalable Sparse Symbolic LU Factorization on GPUs," arXiv 2021
  */
 template <typename ValueType, typename IndexType>
-std::unique_ptr<matrix::Csr<ValueType, IndexType>> symbolic_lu(
-    const matrix::Csr<ValueType, IndexType>* mtx)
+void symbolic_lu(const matrix::Csr<ValueType, IndexType>* mtx,
+                 std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 {
     using matrix_type = matrix::Csr<ValueType, IndexType>;
     const auto exec = mtx->get_executor();
@@ -179,16 +182,16 @@ std::unique_ptr<matrix::Csr<ValueType, IndexType>> symbolic_lu(
     array<ValueType> out_val_array{exec, out_nnz};
     exec->copy_from(host_exec.get(), out_nnz, out_col_idxs.data(),
                     out_col_idx_array.get_data());
-    auto result = matrix_type::create(
+    factors = matrix_type::create(
         exec, mtx->get_size(), std::move(out_val_array),
         std::move(out_col_idx_array), std::move(out_row_ptr_array));
-    return result;
 }
 
 
-#define GKO_DECLARE_SYMBOLIC_LU(ValueType, IndexType) \
-    std::unique_ptr<matrix::Csr<ValueType, IndexType>> symbolic_lu( \
-        const matrix::Csr<ValueType, IndexType>* mtx)
+#define GKO_DECLARE_SYMBOLIC_LU(ValueType, IndexType) \
+    void symbolic_lu( \
+        const matrix::Csr<ValueType, IndexType>* mtx, \
+        std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_LU);
 
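To connect this diff to the review discussion above, here is a minimal, non-self-contained sketch of the pattern the PR introduces, using only names that appear in the diff; the exact expansion of GKO_REGISTER_HOST_OPERATION (presumably generating the make_* helper consumed by exec->run) is an assumption and is not shown:

    // Device kernel: implemented per backend and dispatched by the executor.
    GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
    // Host operation: implemented once in core, but still invoked through
    // exec->run(...) so loggers and profilers see it on the same executor.
    GKO_REGISTER_HOST_OPERATION(symbolic_lu, gko::factorization::symbolic_lu);

    // inside Lu<ValueType, IndexType>::generate_impl (cf. core/factorization/lu.cpp):
    std::unique_ptr<matrix_type> factors;
    exec->run(make_symbolic_lu(mtx.get(), factors));

The switch from a return value to an output parameter is presumably what allows the wrapped function to be registered and run like any other operation, since exec->run(...) itself does not return the result.
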
25 changes: 18 additions & 7 deletions core/factorization/symbolic.hpp
@@ -37,15 +37,26 @@ namespace gko {
 namespace factorization {
 
 
-/** Computes the symbolic Cholesky factorization of the given matrix. */
+/**
+ * Computes the symbolic Cholesky factorization of the given matrix.
+ *
+ * @param mtx the input matrix
+ * @param factors the output factors stored in a combined pattern
+ */
 template <typename ValueType, typename IndexType>
-std::unique_ptr<matrix::Csr<ValueType, IndexType>> symbolic_cholesky(
-    const matrix::Csr<ValueType, IndexType>*);
-
-/** Computes the symbolic LU factorization of the given matrix. */
+void symbolic_cholesky(
+    const matrix::Csr<ValueType, IndexType>* mtx,
+    std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors);
+
+/**
+ * Computes the symbolic LU factorization of the given matrix.
+ *
+ * @param mtx the input matrix
+ * @param factors the output factors stored in a combined pattern
+ */
 template <typename ValueType, typename IndexType>
-std::unique_ptr<matrix::Csr<ValueType, IndexType>> symbolic_lu(
-    const matrix::Csr<ValueType, IndexType>*);
+void symbolic_lu(const matrix::Csr<ValueType, IndexType>* mtx,
+                 std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors);
 
 
 }  // namespace factorization
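For callers of the symbolic factorization routines, the visible effect of this PR is the switch from a return value to an output parameter. A minimal usage sketch under that assumption; the value/index types and the pre-built CSR matrix mtx are placeholders, not part of the PR:

    #include "core/factorization/symbolic.hpp"

    // mtx is assumed to be a gko::matrix::Csr<double, int>* (e.g. the .get()
    // of a smart pointer) that already lives on some executor.
    std::unique_ptr<gko::matrix::Csr<double, int>> factors;
    gko::factorization::symbolic_lu(mtx, factors);
    // factors now holds the combined symbolic factor pattern on mtx's executor;
    // before this PR the call was written as: auto factors = symbolic_lu(mtx);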