diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 6439dcdc1ed..3457dfb99ed 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -106,7 +106,6 @@ Matrix<ValueType, LocalIndexType, GlobalIndexType>::Matrix(
       recv_offsets_(comm.size() + 1),
       recv_sizes_(comm.size()),
       gather_idxs_{exec},
-      non_local_to_global_{exec},
       one_scalar_{},
       local_mtx_{local_matrix_template->clone(exec)},
       non_local_mtx_{non_local_matrix_template->clone(exec)}
@@ -134,7 +133,6 @@ Matrix<ValueType, LocalIndexType, GlobalIndexType>::Matrix(
       recv_offsets_(comm.size() + 1),
       recv_sizes_(comm.size()),
       gather_idxs_{exec},
-      non_local_to_global_{exec},
       one_scalar_{},
       non_local_mtx_(::gko::matrix::Coo<ValueType, LocalIndexType>::create(
           exec, dim<2>{local_linop->get_size()[0], 0}))
@@ -159,7 +157,6 @@ Matrix<ValueType, LocalIndexType, GlobalIndexType>::Matrix(
       recv_offsets_(comm.size() + 1),
       recv_sizes_(comm.size()),
       gather_idxs_{exec},
-      non_local_to_global_{exec},
       one_scalar_{}
 {
     this->set_size({imap_.get_global_size(), imap_.get_global_size()});
@@ -243,7 +240,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::convert_to(
     result->recv_offsets_ = this->recv_offsets_;
     result->recv_sizes_ = this->recv_sizes_;
     result->send_sizes_ = this->send_sizes_;
-    result->non_local_to_global_ = this->non_local_to_global_;
     result->set_size(this->get_size());
 }
 
@@ -263,7 +259,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::move_to(
     result->recv_offsets_ = std::move(this->recv_offsets_);
     result->recv_sizes_ = std::move(this->recv_sizes_);
     result->send_sizes_ = std::move(this->send_sizes_);
-    result->non_local_to_global_ = std::move(this->non_local_to_global_);
     result->set_size(this->get_size());
     this->set_size({});
 }
@@ -315,11 +310,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 
     auto non_local_col_idxs =
         imap_.map_to_local(global_non_local_col_idxs, index_space::non_local);
-    non_local_to_global_ =
-        make_const_array_view(
-            imap_.get_executor(), imap_.get_remote_global_idxs().get_size(),
-            imap_.get_remote_global_idxs().get_const_flat_data())
-            .copy_to_array();
 
     // read the local matrix data
     const auto num_local_rows =
@@ -537,7 +527,6 @@ Matrix<ValueType, LocalIndexType, GlobalIndexType>::operator=(
         recv_offsets_ = other.recv_offsets_;
         send_sizes_ = other.send_sizes_;
         recv_sizes_ = other.recv_sizes_;
-        non_local_to_global_ = other.non_local_to_global_;
         one_scalar_.init(this->get_executor(), dim<2>{1, 1});
         one_scalar_->fill(one<value_type>());
     }
@@ -562,7 +551,6 @@ Matrix<ValueType, LocalIndexType, GlobalIndexType>::operator=(Matrix&& other)
        recv_offsets_ = std::move(other.recv_offsets_);
         send_sizes_ = std::move(other.send_sizes_);
         recv_sizes_ = std::move(other.recv_sizes_);
-        non_local_to_global_ = std::move(other.non_local_to_global_);
         one_scalar_.init(this->get_executor(), dim<2>{1, 1});
         one_scalar_->fill(one<value_type>());
     }
diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp
index 9f1f5b50ba6..6234f072dd5 100644
--- a/core/multigrid/pgm.cpp
+++ b/core/multigrid/pgm.cpp
@@ -13,6 +13,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -247,13 +249,18 @@ Pgm<ValueType, IndexType>::generate_local(
 
 
 #if GINKGO_BUILD_MPI
+
+
 template <typename ValueType, typename IndexType>
 template <typename GlobalIndexType>
-void Pgm<ValueType, IndexType>::communicate(
+array<GlobalIndexType> Pgm<ValueType, IndexType>::communicate_non_local_agg(
     std::shared_ptr<const experimental::distributed::Matrix<
         ValueType, IndexType, GlobalIndexType>>
         matrix,
-    const array<IndexType>& local_agg, array<IndexType>& non_local_agg)
+    std::shared_ptr<
+        experimental::distributed::Partition<IndexType, GlobalIndexType>>
+        coarse_partition,
+    const array<IndexType>& local_agg)
 {
     auto exec = gko::as<LinOp>(matrix)->get_executor();
     const auto comm = matrix->get_communicator();
@@ -270,20 +277,29 @@ void Pgm<ValueType, IndexType>::communicate(
         send_agg.get_size(), local_agg.get_const_data(),
         gather_idxs.get_const_data(), send_agg.get_data()));
 
+    // temporary index map that contains no remote connections to map
+    // local indices to global
+    experimental::distributed::index_map<IndexType, GlobalIndexType> imap(
+        exec, coarse_partition, comm.rank(), array<GlobalIndexType>{exec});
+    auto send_global_agg = imap.map_to_global(
+        send_agg, experimental::distributed::index_space::local);
+
+    array<GlobalIndexType> non_local_agg(exec, total_recv_size);
+
     auto use_host_buffer = experimental::mpi::requires_host_buffer(exec, comm);
-    array<IndexType> host_recv_buffer(exec->get_master());
-    array<IndexType> host_send_buffer(exec->get_master());
+    array<GlobalIndexType> host_recv_buffer(exec->get_master());
+    array<GlobalIndexType> host_send_buffer(exec->get_master());
     if (use_host_buffer) {
         host_recv_buffer.resize_and_reset(total_recv_size);
         host_send_buffer.resize_and_reset(total_send_size);
         exec->get_master()->copy_from(exec, total_send_size,
-                                      send_agg.get_data(),
+                                      send_global_agg.get_data(),
                                       host_send_buffer.get_data());
     }
-    auto type = experimental::mpi::type_impl<IndexType>::get_type();
+    auto type = experimental::mpi::type_impl<GlobalIndexType>::get_type();
     const auto send_ptr = use_host_buffer ? host_send_buffer.get_const_data()
-                                          : send_agg.get_const_data();
+                                          : send_global_agg.get_const_data();
     auto recv_ptr = use_host_buffer ? host_recv_buffer.get_data()
                                     : non_local_agg.get_data();
     exec->synchronize();
@@ -294,92 +310,11 @@ void Pgm<ValueType, IndexType>::communicate(
         exec->copy_from(exec->get_master(), total_recv_size, recv_ptr,
                         non_local_agg.get_data());
     }
-}
-#endif
-
-
-#define GKO_ASSERT_HOST_ARRAY(array) \
-    GKO_ASSERT(array.get_executor() == array.get_executor()->get_master())
-
-
-template <typename IndexType>
-void generate_non_local_map(
-    const std::vector<experimental::distributed::comm_index_type>& recv_offsets,
-    array<IndexType>& non_local_agg, array<IndexType>& non_local_col_map,
-    array<IndexType>& renumber)
-{
-    GKO_ASSERT_HOST_ARRAY(non_local_agg);
-    GKO_ASSERT_HOST_ARRAY(non_local_col_map);
-    GKO_ASSERT_HOST_ARRAY(renumber);
-    auto exec = renumber.get_executor();
-    auto non_local_size = non_local_agg.get_size();
-    array<IndexType> part_id(exec, non_local_size);
-    array<IndexType> index(exec, non_local_size);
-
-    for (int i = 0; i + 1 < recv_offsets.size(); i++) {
-        for (auto j = recv_offsets.at(i); j < recv_offsets.at(i + 1); j++) {
-            part_id.get_data()[j] = i;
-            index.get_data()[j] = j;
-        }
-    }
-    // do it in host currently.
-    auto it = detail::make_zip_iterator(
-        part_id.get_data(), non_local_agg.get_data(), index.get_data());
-    // prepare tuple
-    // sort by or did segment sort
-    std::sort(it, it + non_local_size);
-
-    renumber.get_data()[0] = 0;
-    // renumber (prefix_sum) with not equal
-    for (int i = 1; i < non_local_size; i++) {
-        if (part_id.get_const_data()[i] != part_id.get_const_data()[i - 1] ||
-            non_local_agg.get_const_data()[i] !=
-                non_local_agg.get_const_data()[i - 1]) {
-            renumber.get_data()[i] = renumber.get_data()[i - 1] + 1;
-        } else {
-            renumber.get_data()[i] = renumber.get_data()[i - 1];
-        }
-    }
-    renumber.get_data()[non_local_size] =
-        renumber.get_data()[non_local_size - 1] + 1;
-    // create col map
-    // for each thread i, col_map[tuple[i].index] = map[i]
-    for (int i = 0; i < non_local_size; i++) {
-        non_local_col_map.get_data()[index.get_data()[i]] =
-            renumber.get_data()[i];
-    }
-}
-
-
-template <typename IndexType>
-void compute_communication(
-    const std::vector<experimental::distributed::comm_index_type> recv_offsets,
-    const array<IndexType>& non_local_agg, const array<IndexType>& renumber,
-    std::vector<experimental::distributed::comm_index_type>& new_recv_size,
-    std::vector<experimental::distributed::comm_index_type>& new_recv_offsets,
-    array<IndexType>& new_recv_gather_idxs)
-{
-    GKO_ASSERT_HOST_ARRAY(non_local_agg);
-    GKO_ASSERT_HOST_ARRAY(renumber);
-    GKO_ASSERT_HOST_ARRAY(new_recv_gather_idxs);
-    new_recv_offsets.at(0) = 0;
-    for (int i = 0; i < new_recv_size.size(); i++) {
-        new_recv_size.at(i) =
-            renumber.get_const_data()[recv_offsets.at(i + 1)] -
-            renumber.get_const_data()[recv_offsets.at(i)];
-        new_recv_offsets.at(i + 1) =
-            new_recv_offsets.at(i) + new_recv_size.at(i);
-    }
-    IndexType non_local_num_agg = new_recv_offsets.back();
-    new_recv_gather_idxs.resize_and_reset(non_local_num_agg);
-    for (int i = 0; i < non_local_agg.get_size(); i++) {
-        new_recv_gather_idxs.get_data()[renumber.get_const_data()[i]] =
-            non_local_agg.get_const_data()[i];
-    }
+    return non_local_agg;
 }
-#undef GKO_ASSERT_HOST_ARRAY
+#endif
 
 
 template <typename ValueType, typename IndexType>
@@ -440,72 +375,71 @@ void Pgm<ValueType, IndexType>::generate()
     }
 
     auto distributed_setup = [&](auto matrix) {
+        using global_index_type =
+            typename std::decay_t<decltype(*matrix)>::global_index_type;
+
         auto exec = gko::as<LinOp>(matrix)->get_executor();
         auto comm = gko::as<experimental::distributed::DistributedBase>(matrix)
                         ->get_communicator();
-        auto num_rank = comm.size();
         auto pgm_local_op =
             gko::as<const csr_type>(matrix->get_local_matrix());
         auto result = this->generate_local(pgm_local_op);
-        auto non_local_csr =
-            as<const csr_type>(matrix->get_non_local_matrix());
-        auto non_local_size = non_local_csr->get_size()[1];
-        array<IndexType> non_local_agg(exec, non_local_size);
-        // get agg information (prolong_row_gather row idx)
-        communicate(matrix, agg_, non_local_agg);
-        // generate non_local_col_map
-        non_local_agg.set_executor(exec->get_master());
-        array<IndexType> non_local_col_map(exec->get_master(),
-                                           non_local_size);
-        // add additional entry in tail such that the offset easily handle
-        // it.
-        array<IndexType> renumber(exec->get_master(), non_local_size + 1);
-        auto recv_offsets = matrix->recv_offsets_;
-        generate_non_local_map(recv_offsets, non_local_agg,
-                               non_local_col_map, renumber);
-
-        // get new recv_size and recv_offsets
-        std::vector<experimental::distributed::comm_index_type>
-            new_recv_size(num_rank);
-        std::vector<experimental::distributed::comm_index_type>
-            new_recv_offsets(num_rank + 1);
-        array<IndexType> new_recv_gather_idxs(exec->get_master());
-        compute_communication(recv_offsets, non_local_agg, renumber,
-                              new_recv_size, new_recv_offsets,
-                              new_recv_gather_idxs);
-
-        non_local_col_map.set_executor(exec);
-        IndexType non_local_num_agg = new_recv_gather_idxs.get_size();
+        // create the coarse partition
+        // the coarse partition will have only one range per part
+        // and only one part per rank.
+        // The global indices are ordered block-wise by rank, i.e. rank 1
+        // owns [0, ..., N_1), rank 2 [N_1, ..., N_2), ...
+        auto coarse_local_size =
+            static_cast<size_type>(std::get<1>(result)->get_size()[0]);
+        auto coarse_partition = gko::share(
+            experimental::distributed::build_partition_from_local_size<
+                IndexType, global_index_type>(exec, comm,
+                                              coarse_local_size));
+
+        // get the non-local aggregates as coarse global indices
+        auto non_local_agg =
+            communicate_non_local_agg(matrix, coarse_partition, agg_);
+
+        // create a coarse index map based on the connection given by the
+        // non-local aggregates
+        auto coarse_imap =
+            experimental::distributed::index_map<IndexType, global_index_type>(
+                exec, coarse_partition, comm.rank(), non_local_agg);
+
+        // a mapping from the fine non-local indices to the coarse non-local
+        // indices.
+        // non_local_agg already maps the fine non-local indices to coarse
+        // global indices, so mapping it with the coarse index map results
+        // in the coarse non-local indices.
+        auto non_local_map = coarse_imap.map_to_local(
+            non_local_agg,
+            experimental::distributed::index_space::non_local);
+
         // build csr from row and col map
         // unlike non-distributed version, generate_coarse uses different
         // row and col maps.
+        auto non_local_csr =
+            as<const csr_type>(matrix->get_non_local_matrix());
         auto result_non_local_csr = generate_coarse(
             exec, non_local_csr.get(),
             static_cast<IndexType>(std::get<1>(result)->get_size()[0]),
-            agg_, non_local_num_agg, non_local_col_map);
-        // use local and non-local to build coarse matrix
-        // also restriction and prolongation (Local-only-global matrix)
-        auto coarse_size =
-            static_cast<int64>(std::get<1>(result)->get_size()[0]);
-        comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM);
-        new_recv_gather_idxs.set_executor(exec);
+            agg_, static_cast<IndexType>(coarse_imap.get_non_local_size()),
+            non_local_map);
 
         // setup the generated linop.
-        using global_index_type =
-            typename std::decay_t<decltype(*matrix)>::global_index_type;
         auto coarse = share(
             experimental::distributed::
                 Matrix<ValueType, IndexType, global_index_type>::create(
-                    exec, comm, gko::dim<2>(coarse_size, coarse_size),
-                    std::get<1>(result), result_non_local_csr,
-                    new_recv_size, new_recv_offsets, new_recv_gather_idxs));
+                    exec, comm, std::move(coarse_imap), std::get<1>(result),
+                    result_non_local_csr));
         auto restrict_op = share(
             experimental::distributed::
                 Matrix<ValueType, IndexType, global_index_type>::create(
                     exec, comm,
-                    dim<2>(coarse_size,
+                    dim<2>(coarse->get_size()[0],
                            gko::as<LinOp>(matrix)->get_size()[0]),
                     std::get<2>(result)));
         auto prolong_op = share(
@@ -513,7 +447,7 @@ void Pgm<ValueType, IndexType>::generate()
                 Matrix<ValueType, IndexType, global_index_type>::create(
                     exec, comm,
                     dim<2>(gko::as<LinOp>(matrix)->get_size()[0],
-                           coarse_size),
+                           coarse->get_size()[0]),
                     std::get<0>(result)));
         this->set_multigrid_level(prolong_op, coarse, restrict_op);
     };
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index ad0b9d1cce0..77648353a98 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -614,7 +614,6 @@ class Matrix
     std::vector<comm_index_type> recv_offsets_;
     std::vector<comm_index_type> recv_sizes_;
     array<local_index_type> gather_idxs_;
-    array<global_index_type> non_local_to_global_;
     gko::detail::DenseCache<value_type> one_scalar_;
     gko::detail::DenseCache<value_type> host_send_buffer_;
     gko::detail::DenseCache<value_type> host_recv_buffer_;
diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp
index d07001be2f1..47f78cb3d5d 100644
--- a/include/ginkgo/core/multigrid/pgm.hpp
+++ b/include/ginkgo/core/multigrid/pgm.hpp
@@ -194,12 +194,28 @@ class Pgm : public EnableLinOp<Pgm<ValueType, IndexType>>,
         std::shared_ptr<const matrix::Csr<ValueType, IndexType>> local_matrix);
 
 #if GINKGO_BUILD_MPI
+    /**
+     * Communicates the non-local aggregates (as global indices).
+     *
+     * @tparam GlobalIndexType  the global index type
+     *
+     * @param matrix  a distributed matrix
+     * @param coarse_partition  the coarse partition used to compute the new
+     *                          global indices
+     * @param local_agg  the local aggregate indices
+     *
+     * @return the aggregates for the non-local columns. The aggregate indices
+     *         are given in the new (coarse) global indexing.
+     */
     template <typename GlobalIndexType>
-    void communicate(std::shared_ptr<const experimental::distributed::Matrix<
-                         ValueType, IndexType, GlobalIndexType>>
-                         matrix,
-                     const array<IndexType>& local_agg,
-                     array<IndexType>& non_local_agg);
+    array<GlobalIndexType> communicate_non_local_agg(
+        std::shared_ptr<const experimental::distributed::Matrix<
+            ValueType, IndexType, GlobalIndexType>>
+            matrix,
+        std::shared_ptr<
+            experimental::distributed::Partition<IndexType, GlobalIndexType>>
+            coarse_partition,
+        const array<IndexType>& local_agg);
 #endif
 
 private:
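
The mapping chain this patch introduces — local aggregate ids, to coarse global indices ordered block-wise by rank, to deduplicated coarse non-local column indices — can be illustrated with a minimal, self-contained sketch. This is plain C++ and deliberately not the Ginkgo API: the helpers coarse_offsets, to_coarse_global, and compress_non_local are hypothetical stand-ins for what build_partition_from_local_size, communicate_non_local_agg, and index_map::map_to_local(..., index_space::non_local) do in the patch, and the two ranks are simulated with plain vectors instead of MPI.

// Conceptual sketch of the coarse renumbering (not the Ginkgo API).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

using global_index = std::int64_t;

// Block-wise global offsets: rank r owns [offsets[r], offsets[r] + local_sizes[r]).
std::vector<global_index> coarse_offsets(
    const std::vector<global_index>& local_sizes)
{
    std::vector<global_index> offsets(local_sizes.size() + 1, 0);
    std::partial_sum(local_sizes.begin(), local_sizes.end(),
                     offsets.begin() + 1);
    return offsets;
}

// Map the local aggregate ids owned by `rank` to coarse global indices.
std::vector<global_index> to_coarse_global(
    const std::vector<int>& local_agg,
    const std::vector<global_index>& offsets, int rank)
{
    std::vector<global_index> global_agg(local_agg.size());
    std::transform(local_agg.begin(), local_agg.end(), global_agg.begin(),
                   [&](int a) { return offsets[rank] + a; });
    return global_agg;
}

// Deduplicate the received coarse global indices and map every received entry
// to its position in the deduplicated set (its coarse non-local column index).
std::pair<std::vector<global_index>, std::vector<int>> compress_non_local(
    const std::vector<global_index>& recv_global)
{
    std::vector<global_index> unique = recv_global;
    std::sort(unique.begin(), unique.end());
    unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
    std::vector<int> local_cols(recv_global.size());
    std::transform(recv_global.begin(), recv_global.end(), local_cols.begin(),
                   [&](global_index g) {
                       return static_cast<int>(
                           std::lower_bound(unique.begin(), unique.end(), g) -
                           unique.begin());
                   });
    return {unique, local_cols};
}

int main()
{
    // Number of coarse aggregates owned by rank 0 and rank 1.
    std::vector<global_index> local_sizes{3, 2};
    auto offsets = coarse_offsets(local_sizes);  // {0, 3, 5}

    // Rank 0 has fine non-local columns owned by rank 1; their local
    // aggregates on rank 1 are {1, 0, 1}, i.e. {4, 3, 4} in coarse global
    // indexing.
    auto recv_global = to_coarse_global({1, 0, 1}, offsets, /*rank=*/1);

    auto [remote_cols, non_local_map] = compress_non_local(recv_global);
    // remote_cols   = {3, 4}: the coarse global columns rank 0 must receive.
    // non_local_map = {1, 0, 1}: coarse non-local column of each fine entry.
    for (auto c : remote_cols) { std::cout << c << ' '; }
    std::cout << '\n';
    for (auto c : non_local_map) { std::cout << c << ' '; }
    std::cout << '\n';
}

With a single remote rank, the global sort used in the sketch coincides with the per-owner grouping that the removed generate_non_local_map built explicitly from recv_offsets; with several remote ranks the deduplication happens per owning rank, which is exactly the bookkeeping the coarse index_map now takes over.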