Merge pull request #26 from mhrywniak/contiguous_opt
Add isContiguous check for new AmgX API
piyueh authored Jul 23, 2019
2 parents 7044182 + ecdcf5f commit 1a93865
Showing 4 changed files with 107 additions and 33 deletions.
4 changes: 2 additions & 2 deletions doc/dependencies.md
@@ -3,12 +3,12 @@
So far, the following sets of dependencies and versions have been tested and
confirmed to work with AmgXWrapper:

#### Using AmgX GitHub Repository -- commit 3049527e0c396424df4582e837f9dd89a20f50df
#### Using AmgX GitHub Repository -- commit aba9132119fd9efde679f41369628c04e3452a14

* [OpenMPI v4.0.0](https://www.open-mpi.org/software/ompi/v4.0/)
* [CUDA v10.0.130](https://developer.nvidia.com/cuda-10.0-download-archive)
* [PETSc v3.10.4](https://www.mcs.anl.gov/petsc/download/index.html)
* [AmgX GitHub commit 3049527](https://github.com/NVIDIA/AMGX/tree/3049527e0c396424df4582e837f9dd89a20f50df)
* [AmgX GitHub commit aba9132](https://github.com/NVIDIA/AMGX/commit/aba9132119fd9efde679f41369628c04e3452a14)

The C and C++ compilers for this set of dependencies are GCC 7.

10 changes: 8 additions & 2 deletions example/poisson/src/main.cpp
@@ -85,10 +85,12 @@ int main(int argc, char **argv)
myRank; // rank of current process

PetscClassId solvingID,
warmUpID;
warmUpID,
setAID;

PetscLogEvent solvingEvent,
warmUpEvent;
warmUpEvent,
setAEvent;



@@ -208,8 +210,10 @@ int main(int argc, char **argv)
// register PETSc events for warm-up, solving, and setA
ierr = PetscClassIdRegister("SolvingClass", &solvingID); CHKERRQ(ierr);
ierr = PetscClassIdRegister("WarmUpClass", &warmUpID); CHKERRQ(ierr);
ierr = PetscClassIdRegister("SetAClass", &setAID); CHKERRQ(ierr);
ierr = PetscLogEventRegister("Solving", solvingID, &solvingEvent); CHKERRQ(ierr);
ierr = PetscLogEventRegister("WarmUp", warmUpID, &warmUpEvent); CHKERRQ(ierr);
ierr = PetscLogEventRegister("setA", setAID, &setAEvent); CHKERRQ(ierr);



@@ -240,7 +244,9 @@ int main(int argc, char **argv)


ierr = MPI_Barrier(PETSC_COMM_WORLD); CHKERRQ(ierr);
PetscLogEventBegin(setAEvent, 0, 0, 0, 0);
ierr = amgx.setA(A); CHKERRQ(ierr);
PetscLogEventEnd(setAEvent, 0, 0, 0, 0);

ierr = solve(amgx, A, lhs, rhs, u_exact, err,
args, warmUpEvent, solvingEvent); CHKERRQ(ierr);
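As a side note, the event-logging pattern used above can be exercised on its own. Below is a minimal sketch, assuming only a standard PETSc installation; the class and event names mirror this diff, but the program itself is hypothetical and not part of this repository. Per-event timings appear in the summary printed when the program is run with the -log_view option.

```cpp
// Minimal sketch of the PETSc event-logging pattern added in this commit.
// Assumption: standalone example, not part of this repository.
#include <petscsys.h>

int main(int argc, char **argv)
{
    PetscErrorCode ierr;
    PetscClassId   setAID;
    PetscLogEvent  setAEvent;

    ierr = PetscInitialize(&argc, &argv, nullptr, nullptr); if (ierr) return ierr;

    // Register a logging class and an event under it, as done above for "setA".
    ierr = PetscClassIdRegister("SetAClass", &setAID); CHKERRQ(ierr);
    ierr = PetscLogEventRegister("setA", setAID, &setAEvent); CHKERRQ(ierr);

    // Bracket the region to be timed; the four trailing arguments are
    // optional PetscObjects to associate with the event (none here).
    ierr = PetscLogEventBegin(setAEvent, 0, 0, 0, 0); CHKERRQ(ierr);
    // ... work to be timed, e.g. amgx.setA(A) in the example above ...
    ierr = PetscLogEventEnd(setAEvent, 0, 0, 0, 0); CHKERRQ(ierr);

    ierr = PetscFinalize();
    return ierr;
}
```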
27 changes: 23 additions & 4 deletions src/AmgXSolver.hpp
@@ -4,6 +4,7 @@
* \author Pi-Yueh Chuang (pychuang@gwu.edu)
* \date 2015-09-01
* \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
* \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* This project is released under MIT License.
*/

@@ -403,16 +404,34 @@ class AmgXSolver
*/
PetscErrorCode destroyLocalA(const Mat &A, Mat &localA);

/** \brief Check whether the global matrix distribution is contiguous.
*
* If the global matrix is distributed such that each rank owns a contiguous
* chunk of rows, in ascending rank order, the partition vector has the trivial
* structure [0, ..., 0, 1, ..., 1, ..., R-1, ..., R-1] for R ranks. Its
* calculation can then be skipped, since AmgX can recover all partitioning
* information from the number of ranks and the partition *offsets* (i.e., the
* global row at which each rank's chunk starts).
*
* \param devIS [in] PETSc IS representing redistributed row indices.
* \param isContiguous [out] Whether the global matrix is contiguously distributed.
* \param partOffsets [out] If contiguous, holds the partition offsets delimiting the
* rows owned by each of the R ranks.
* \return PetscErrorCode.
*/
PetscErrorCode checkForContiguousPartitioning(
const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets);
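To make the trivial structure concrete, here is a small illustration with invented numbers (hypothetical, not from this repository): for R = 3 ranks owning rows [0,2], [3,4], and [5,8], the explicit partition vector and the equivalent partition offsets encode the same ownership.

```cpp
// Hypothetical illustration: partition vector vs. partition offsets.
#include <cassert>
#include <vector>

int main()
{
    // Explicit partition vector: one entry per global row, holding the owning rank.
    std::vector<int> partVec = {0, 0, 0, 1, 1, 2, 2, 2, 2};

    // Equivalent contiguous description: R + 1 offsets; rank r owns the
    // half-open row range [offsets[r], offsets[r + 1]).
    std::vector<int> offsets = {0, 3, 5, 9};

    // The two encodings agree on every row's owner.
    for (int r = 0; r < 3; ++r)
        for (int row = offsets[r]; row < offsets[r + 1]; ++row)
            assert(partVec[row] == r);
    return 0;
}
```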

/** \brief Get a partition vector required by AmgX.
/** \brief Get partition data required by AmgX.
*
* \param devIS [in] PETSc IS representing redistributed row indices.
* \param N [in] Total number of rows in global matrix.
* \param partVec [out] Partition vector.
* \param partData [out] Partition data: either an explicit partition vector or offsets.
* \param usesOffsets [out] If PETSC_TRUE, the partitioning is contiguous and partData
* contains partition offsets (see checkForContiguousPartitioning()); otherwise,
* partData contains the explicit partition vector.
* \return PetscErrorCode.
*/
PetscErrorCode getPartVec(const IS &devIS,
const PetscInt &N, std::vector<PetscInt> &partVec);
PetscErrorCode getPartData(const IS &devIS,
const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets);


/** \brief Function that actually solves the system.
99 changes: 74 additions & 25 deletions src/setA.cpp
@@ -4,12 +4,14 @@
* \author Pi-Yueh Chuang (pychuang@gwu.edu)
* \date 2016-01-08
* \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
* \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* This project is released under MIT License.
*/


// STD
# include <cstring>
# include <algorithm>

// AmgXSolver
# include "AmgXSolver.hpp"
@@ -28,11 +30,12 @@ PetscErrorCode AmgXSolver::setA(const Mat &A)

PetscInt nGlobalRows,
nLocalRows;
PetscBool usesOffsets;

std::vector<PetscInt> row;
std::vector<PetscInt64> col;
std::vector<PetscScalar> data;
std::vector<PetscInt> partVec;
std::vector<PetscInt> partData;


// get number of rows in global matrix
@@ -51,18 +54,29 @@ PetscErrorCode AmgXSolver::setA(const Mat &A)
ierr = destroyLocalA(A, localA); CHK;

// get the partition data required by AmgX
ierr = getPartVec(devIS, nGlobalRows, partVec); CHK;
ierr = getPartData(devIS, nGlobalRows, partData, usesOffsets); CHK;


// upload matrix A to AmgX
if (gpuWorld != MPI_COMM_NULL)
{
ierr = MPI_Barrier(gpuWorld); CHK;

AMGX_matrix_upload_all_global(
// offsets need to be 64 bit, since we use 64 bit column indices
std::vector<PetscInt64> offsets;

AMGX_distribution_handle dist;
AMGX_distribution_create(&dist, cfg);
if (usesOffsets) {
offsets.assign(partData.begin(), partData.end());
AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_OFFSETS, offsets.data());
} else {
AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_VECTOR, partData.data());
}
AMGX_matrix_upload_distributed(
AmgXA, nGlobalRows, nLocalRows, row[nLocalRows],
1, 1, row.data(), col.data(), data.data(),
nullptr, ring, ring, partVec.data());
nullptr, dist);
AMGX_distribution_destroy(dist);

// bind the matrix A to the solver
ierr = MPI_Barrier(gpuWorld); CHK;
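Design note: when the distribution is contiguous, this upload path hands AmgX only the R + 1 partition offsets instead of an N-entry partition vector, so the all-ranks scatter in getPartData() and the O(N) per-rank copy of the partition vector are skipped entirely.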
@@ -298,10 +312,41 @@ PetscErrorCode AmgXSolver::destroyLocalA(const Mat &A, Mat &localA)
PetscFunctionReturn(0);
}

/* \implements AmgXSolver::checkForContiguousPartitioning */
PetscErrorCode AmgXSolver::checkForContiguousPartitioning(
const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets)
{
PetscFunctionBeginUser;
PetscErrorCode ierr;
PetscBool sorted;
PetscInt ismax = -2; // marker for "unsorted"; lets the check happen after gathering from all ranks

ierr = ISSorted(devIS, &sorted); CHK;
if (sorted)
{
ierr = ISGetMinMax(devIS, NULL, &ismax); CHK;
}
partOffsets.resize(gpuWorldSize);
++ismax; // add 1 to allow reusing gathered ismax values as partition offsets for AMGX
MPI_Allgather(&ismax, 1, MPIU_INT, &partOffsets[0], 1, MPIU_INT, gpuWorld);
bool all_sorted = std::is_sorted(partOffsets.begin(), partOffsets.end())
&& partOffsets[0] != -1;
if (all_sorted)
{
partOffsets.insert(partOffsets.begin(), 0); // partition 0 always starts at 0
isContiguous = PETSC_TRUE;
}
else
{
isContiguous = PETSC_FALSE;
}
PetscFunctionReturn(0);
}
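To make the sentinel logic above concrete, here is a hypothetical standalone sketch (not from this repository): each rank contributes its maximum row index plus one, or -1 if its index set was unsorted, and contiguity reduces to a sortedness test on the gathered values.

```cpp
// Hypothetical illustration of the contiguity test used above.
#include <algorithm>
#include <vector>

// gathered[r] holds (max row index on rank r) + 1, or -1 if rank r's
// index set was not sorted (the -2 marker incremented once).
bool contiguousFromGatheredMaxima(const std::vector<int> &gathered)
{
    // An unsorted rank's -1 either breaks the ascending order or, if it
    // lands in front, is caught by the explicit check on the first entry.
    return std::is_sorted(gathered.begin(), gathered.end())
           && gathered[0] != -1;
}

int main()
{
    bool ok  = contiguousFromGatheredMaxima({3, 5, 9});  // true: offsets become {0, 3, 5, 9}
    bool bad = contiguousFromGatheredMaxima({5, 3, 9});  // false: not ascending
    return (ok && !bad) ? 0 : 1;
}
```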


/* \implements AmgXSolver::getPartVec */
PetscErrorCode AmgXSolver::getPartVec(
const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partVec)
/* \implements AmgXSolver::getPartData */
PetscErrorCode AmgXSolver::getPartData(
const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets)
{
PetscFunctionBeginUser;

@@ -312,35 +357,39 @@ PetscErrorCode AmgXSolver::getPartVec(
tempSEQ;

PetscInt n;

PetscScalar *tempPartVec;

ierr = ISGetLocalSize(devIS, &n); CHK;

if (gpuWorld != MPI_COMM_NULL)
{
ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;
// check whether the distribution is contiguous; if so, the expensive scatters can be skipped
ierr = checkForContiguousPartitioning(devIS, usesOffsets, partData); CHK;
if (!usesOffsets)
{
ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;

IS is;
ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
ierr = ISDestroy(&is); CHK;
IS is;
ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
ierr = ISDestroy(&is); CHK;

ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
ierr = VecScatterBegin(scatter,
tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
ierr = VecScatterEnd(scatter,
tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
ierr = VecScatterDestroy(&scatter); CHK;
ierr = VecDestroy(&tempMPI); CHK;
ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
ierr = VecScatterBegin(scatter,
tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
ierr = VecScatterEnd(scatter,
tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
ierr = VecScatterDestroy(&scatter); CHK;
ierr = VecDestroy(&tempMPI); CHK;

ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;
ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;

partVec.assign(tempPartVec, tempPartVec+N);
partData.assign(tempPartVec, tempPartVec+N);

ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;
ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;

ierr = VecDestroy(&tempSEQ); CHK;
ierr = VecDestroy(&tempSEQ); CHK;
}
}
ierr = MPI_Barrier(globalCpuWorld); CHK;
