From ecdcf5f8929602c347dba4a8218393774a1995a0 Mon Sep 17 00:00:00 2001
From: Markus Hrywniak
Date: Fri, 19 Jul 2019 02:38:14 -0700
Subject: [PATCH] Add isContiguous check for new AmgX API

The new AmgX API allows passing in partition offsets instead of a full
partition vector. Perform this check using the PETSc index set API to
transparently enable the optimization.

* Updated documentation (dependencies)
* Added timing to poisson example to allow verification
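
For illustration, a minimal standalone sketch of how the R+1 partition
offsets can be derived from the per-rank local row counts once the
distribution is known to be contiguous (the helper buildPartitionOffsets
and its parameters are hypothetical and not part of this patch):

    // Hypothetical sketch: gather each rank's local row count and convert
    // the counts into partition offsets (offsets[r] = first global row
    // owned by rank r, offsets[nRanks] = total number of rows).
    #include <mpi.h>
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> buildPartitionOffsets(MPI_Comm comm, int nRanks,
                                               int64_t nLocalRows)
    {
        std::vector<int64_t> offsets(nRanks + 1, 0);
        // gather the local row counts into offsets[1..nRanks]
        MPI_Allgather(&nLocalRows, 1, MPI_INT64_T,
                      offsets.data() + 1, 1, MPI_INT64_T, comm);
        // prefix sum turns the counts into offsets
        for (int r = 1; r <= nRanks; ++r)
            offsets[r] += offsets[r - 1];
        return offsets;
    }

For example, three ranks owning 4, 4, and 2 rows of a 10-row matrix yield
offsets [0, 4, 8, 10], which replace the 10-entry partition vector
[0, 0, 0, 0, 1, 1, 1, 1, 2, 2].
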
---
 doc/dependencies.md          |  4 +-
 example/poisson/src/main.cpp | 10 +++-
 src/AmgXSolver.hpp           | 27 ++++++++--
 src/setA.cpp                 | 99 +++++++++++++++++++++++++++---------
 4 files changed, 107 insertions(+), 33 deletions(-)

diff --git a/doc/dependencies.md b/doc/dependencies.md
index b447c73..07ade58 100644
--- a/doc/dependencies.md
+++ b/doc/dependencies.md
@@ -3,12 +3,12 @@
 So far the, the following sets of dependencies and versions have been tested
 and worked with AmgXWrapper:
 
-#### Using AmgX GitHub Repository -- commit 3049527e0c396424df4582e837f9dd89a20f50df
+#### Using AmgX GitHub Repository -- commit aba9132119fd9efde679f41369628c04e3452a14
 
 * [OpenMPI v4.0.0](https://www.open-mpi.org/software/ompi/v4.0/)
 * [CUDA v10.0.130](https://developer.nvidia.com/cuda-10.0-download-archive)
 * [PETSc v3.10.4](https://www.mcs.anl.gov/petsc/download/index.html)
-* [AmgX GitHub commit 3049527](https://github.com/NVIDIA/AMGX/tree/3049527e0c396424df4582e837f9dd89a20f50df)
+* [AmgX GitHub commit aba9132](https://github.com/NVIDIA/AMGX/commit/aba9132119fd9efde679f41369628c04e3452a14)
 
 The C and C++ compilers for this set of dependencies are GCC 7.
 
diff --git a/example/poisson/src/main.cpp b/example/poisson/src/main.cpp
index 5bd8656..3355a11 100644
--- a/example/poisson/src/main.cpp
+++ b/example/poisson/src/main.cpp
@@ -85,10 +85,12 @@ int main(int argc, char **argv)
                         myRank; // rank of current process
 
     PetscClassId        solvingID,
-                        warmUpID;
+                        warmUpID,
+                        setAID;
 
     PetscLogEvent       solvingEvent,
-                        warmUpEvent;
+                        warmUpEvent,
+                        setAEvent;
 
 
@@ -208,8 +210,10 @@ int main(int argc, char **argv)
     // register a PETSc event for warm-up and solving
     ierr = PetscClassIdRegister("SolvingClass", &solvingID); CHKERRQ(ierr);
     ierr = PetscClassIdRegister("WarmUpClass", &warmUpID); CHKERRQ(ierr);
+    ierr = PetscClassIdRegister("SetAClass", &setAID); CHKERRQ(ierr);
     ierr = PetscLogEventRegister("Solving", solvingID, &solvingEvent); CHKERRQ(ierr);
     ierr = PetscLogEventRegister("WarmUp", warmUpID, &warmUpEvent); CHKERRQ(ierr);
+    ierr = PetscLogEventRegister("setA", setAID, &setAEvent); CHKERRQ(ierr);
 
 
@@ -240,7 +244,9 @@ int main(int argc, char **argv)
     ierr = MPI_Barrier(PETSC_COMM_WORLD); CHKERRQ(ierr);
 
+    PetscLogEventBegin(setAEvent, 0, 0, 0, 0);
     ierr = amgx.setA(A); CHKERRQ(ierr);
+    PetscLogEventEnd(setAEvent, 0, 0, 0, 0);
 
     ierr = solve(amgx, A, lhs, rhs, u_exact, err, args, warmUpEvent, solvingEvent); CHKERRQ(ierr);
 
diff --git a/src/AmgXSolver.hpp b/src/AmgXSolver.hpp
index ce8b8fe..5e2e06e 100644
--- a/src/AmgXSolver.hpp
+++ b/src/AmgXSolver.hpp
@@ -4,6 +4,7 @@
  * \author Pi-Yueh Chuang (pychuang@gwu.edu)
  * \date 2015-09-01
  * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *            This project is released under MIT License.
  */
 
@@ -403,16 +404,34 @@ class AmgXSolver
          */
         PetscErrorCode destroyLocalA(const Mat &A, Mat &localA);
 
+        /** \brief Check whether the global matrix distribution is contiguous.
+         *
+         * If the global matrix is distributed such that contiguous chunks of rows are
+         * distributed over the individual ranks in ascending rank order, the partition vector
+         * has a trivial structure (i.e. [0, ..., 0, 1, ..., 1, ..., R-1, ..., R-1] for R ranks)
+         * and its calculation can be skipped, since all information is available to AmgX through
+         * the number of ranks and the partition *offsets* (i.e. how many rows are on each rank).
+         *
+         * \param devIS [in] PETSc IS representing redistributed row indices.
+         * \param isContiguous [out] Whether the global matrix is contiguously distributed.
+         * \param partOffsets [out] If contiguous, holds the partition offsets for all R ranks.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode checkForContiguousPartitioning(
+            const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets);
 
-        /** \brief Get a partition vector required by AmgX.
+        /** \brief Get partition data required by AmgX.
          *
          * \param devIS [in] PETSc IS representing redistributed row indices.
          * \param N [in] Total number of rows in global matrix.
-         * \param partVec [out] Partition vector.
+         * \param partData [out] Partition data, either an explicit partition vector or offsets.
+         * \param usesOffsets [out] If PETSC_TRUE, the partitioning is contiguous and partData
+         *        contains partition offsets (see checkForContiguousPartitioning()); otherwise
+         *        it contains an explicit partition vector.
          * \return PetscErrorCode.
          */
-        PetscErrorCode getPartVec(const IS &devIS,
-                const PetscInt &N, std::vector<PetscInt> &partVec);
+        PetscErrorCode getPartData(const IS &devIS,
+                const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets);
 
 
         /** \brief Function that actually solves the system.
diff --git a/src/setA.cpp b/src/setA.cpp
index 325dc55..59572da 100644
--- a/src/setA.cpp
+++ b/src/setA.cpp
@@ -4,12 +4,14 @@
  * \author Pi-Yueh Chuang (pychuang@gwu.edu)
  * \date 2016-01-08
  * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *            This project is released under MIT License.
  */
 
 
 // STD
 # include <cstring>
+# include <algorithm>
 
 // AmgXSolver
 # include "AmgXSolver.hpp"
@@ -28,11 +30,12 @@ PetscErrorCode AmgXSolver::setA(const Mat &A)
 
     PetscInt            nGlobalRows,
                         nLocalRows;
+    PetscBool           usesOffsets;
 
     std::vector<PetscInt>       row;
     std::vector<PetscInt64>     col;
     std::vector<PetscScalar>    data;
-    std::vector<PetscInt>       partVec;
+    std::vector<PetscInt>       partData;
 
     // get number of rows in global matrix
 
@@ -51,18 +54,29 @@ PetscErrorCode AmgXSolver::setA(const Mat &A)
     ierr = destroyLocalA(A, localA); CHK;
 
     // get a partition vector required by AmgX
-    ierr = getPartVec(devIS, nGlobalRows, partVec); CHK;
+    ierr = getPartData(devIS, nGlobalRows, partData, usesOffsets); CHK;
 
     // upload matrix A to AmgX
     if (gpuWorld != MPI_COMM_NULL)
     {
         ierr = MPI_Barrier(gpuWorld); CHK;
-
-        AMGX_matrix_upload_all_global(
+        // offsets need to be 64 bit, since we use 64 bit column indices
+        std::vector<int64_t> offsets;
+
+        AMGX_distribution_handle dist;
+        AMGX_distribution_create(&dist, cfg);
+        if (usesOffsets) {
+            offsets.assign(partData.begin(), partData.end());
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_OFFSETS, offsets.data());
+        } else {
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_VECTOR, partData.data());
+        }
+        AMGX_matrix_upload_distributed(
                 AmgXA, nGlobalRows, nLocalRows, row[nLocalRows],
                 1, 1, row.data(), col.data(), data.data(),
-                nullptr, ring, ring, partVec.data());
+                nullptr, dist);
+        AMGX_distribution_destroy(dist);
 
         // bind the matrix A to the solver
         ierr = MPI_Barrier(gpuWorld); CHK;
@@ -298,10 +312,41 @@ PetscErrorCode AmgXSolver::destroyLocalA(const Mat &A, Mat &localA)
     PetscFunctionReturn(0);
 }
 
+/* \implements AmgXSolver::checkForContiguousPartitioning */
+PetscErrorCode AmgXSolver::checkForContiguousPartitioning(
+    const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets)
+{
+    PetscFunctionBeginUser;
+    PetscErrorCode ierr;
+    PetscBool sorted;
+    PetscInt ismax = -2; // marker for "unsorted"; allows checking after the global gather
 
-/* \implements AmgXSolver::getPartVec */
-PetscErrorCode AmgXSolver::getPartVec(
-        const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partVec)
+    ierr = ISSorted(devIS, &sorted); CHK;
+    if (sorted)
+    {
+        ierr = ISGetMinMax(devIS, NULL, &ismax); CHK;
+    }
+    partOffsets.resize(gpuWorldSize);
+    ++ismax; // add 1 to allow reusing the gathered ismax values as partition offsets for AmgX
+    MPI_Allgather(&ismax, 1, MPIU_INT, &partOffsets[0], 1, MPIU_INT, gpuWorld);
+    bool all_sorted = std::is_sorted(partOffsets.begin(), partOffsets.end())
+        && partOffsets[0] != -1;
+    if (all_sorted)
+    {
+        partOffsets.insert(partOffsets.begin(), 0); // partition 0 always starts at 0
+        isContiguous = PETSC_TRUE;
+    }
+    else
+    {
+        isContiguous = PETSC_FALSE;
+    }
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getPartData */
+PetscErrorCode AmgXSolver::getPartData(
+        const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets)
 {
     PetscFunctionBeginUser;
 
@@ -312,35 +357,39 @@ PetscErrorCode AmgXSolver::getPartVec(
                         tempSEQ;
 
     PetscInt            n;
-
     PetscScalar         *tempPartVec;
 
     ierr = ISGetLocalSize(devIS, &n); CHK;
 
     if (gpuWorld != MPI_COMM_NULL)
     {
-        ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;
+        // check if sorted/contiguous, then we can skip expensive scatters
+        checkForContiguousPartitioning(devIS, usesOffsets, partData);
+        if (!usesOffsets)
+        {
+            ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;
 
-        IS      is;
-        ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
-        ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
-        ierr = ISDestroy(&is); CHK;
+            IS      is;
+            ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
+            ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
+            ierr = ISDestroy(&is); CHK;
 
-        ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
-        ierr = VecScatterBegin(scatter,
-                tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
-        ierr = VecScatterEnd(scatter,
-                tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
-        ierr = VecScatterDestroy(&scatter); CHK;
-        ierr = VecDestroy(&tempMPI); CHK;
+            ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
+            ierr = VecScatterBegin(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterEnd(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterDestroy(&scatter); CHK;
+            ierr = VecDestroy(&tempMPI); CHK;
 
-        ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;
+            ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;
 
-        partVec.assign(tempPartVec, tempPartVec+N);
+            partData.assign(tempPartVec, tempPartVec+N);
 
-        ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;
+            ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;
 
-        ierr = VecDestroy(&tempSEQ); CHK;
+            ierr = VecDestroy(&tempSEQ); CHK;
+        }
     }
 
     ierr = MPI_Barrier(globalCpuWorld); CHK;