From ecdcf5f8929602c347dba4a8218393774a1995a0 Mon Sep 17 00:00:00 2001
From: Markus Hrywniak
Date: Fri, 19 Jul 2019 02:38:14 -0700
Subject: [PATCH] Add isContiguous check for new AmgX API

The new AmgX API allows passing in partition offsets instead of a full
partition vector. Perform this check using the PETSc index set API to
transparently enable the optimization.

* Updated documentation (dependencies)
* Added timing to poisson example to allow verification
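
For illustration, a minimal standalone sketch of how the R+1 partition
offsets can be derived from the per-rank local row counts once the
distribution is known to be contiguous (the helper buildPartitionOffsets
and its parameters are hypothetical and not part of this patch):

    // Hypothetical sketch: gather each rank's local row count and convert
    // the counts into partition offsets (offsets[r] = first global row
    // owned by rank r, offsets[nRanks] = total number of rows).
    #include <mpi.h>
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> buildPartitionOffsets(MPI_Comm comm, int nRanks,
                                               int64_t nLocalRows)
    {
        std::vector<int64_t> offsets(nRanks + 1, 0);
        // gather the local row counts into offsets[1..nRanks]
        MPI_Allgather(&nLocalRows, 1, MPI_INT64_T,
                      offsets.data() + 1, 1, MPI_INT64_T, comm);
        // prefix sum turns the counts into offsets
        for (int r = 1; r <= nRanks; ++r)
            offsets[r] += offsets[r - 1];
        return offsets;
    }

For example, three ranks owning 4, 4, and 2 rows of a 10-row matrix yield
offsets [0, 4, 8, 10], which replace the 10-entry partition vector
[0, 0, 0, 0, 1, 1, 1, 1, 2, 2].
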
---
 doc/dependencies.md          |  4 +-
 example/poisson/src/main.cpp | 10 +++-
 src/AmgXSolver.hpp           | 27 ++++++++--
 src/setA.cpp                 | 99 +++++++++++++++++++++++++++---------
 4 files changed, 107 insertions(+), 33 deletions(-)

diff --git a/doc/dependencies.md b/doc/dependencies.md
index b447c73..07ade58 100644
--- a/doc/dependencies.md
+++ b/doc/dependencies.md
@@ -3,12 +3,12 @@
 So far the, the following sets of dependencies and versions have been tested
 and worked with AmgXWrapper:
 
-#### Using AmgX GitHub Repository -- commit 3049527e0c396424df4582e837f9dd89a20f50df
+#### Using AmgX GitHub Repository -- commit aba9132119fd9efde679f41369628c04e3452a14
 
 * [OpenMPI v4.0.0](https://www.open-mpi.org/software/ompi/v4.0/)
 * [CUDA v10.0.130](https://developer.nvidia.com/cuda-10.0-download-archive)
 * [PETSc v3.10.4](https://www.mcs.anl.gov/petsc/download/index.html)
-* [AmgX GitHub commit 3049527](https://github.com/NVIDIA/AMGX/tree/3049527e0c396424df4582e837f9dd89a20f50df)
+* [AmgX GitHub commit aba9132](https://github.com/NVIDIA/AMGX/commit/aba9132119fd9efde679f41369628c04e3452a14)
 
 The C and C++ compilers for this set of dependencies are GCC 7.
 
diff --git a/example/poisson/src/main.cpp b/example/poisson/src/main.cpp
index 5bd8656..3355a11 100644
--- a/example/poisson/src/main.cpp
+++ b/example/poisson/src/main.cpp
@@ -85,10 +85,12 @@ int main(int argc, char **argv)
                         myRank; // rank of current process
 
     PetscClassId        solvingID,
-                        warmUpID;
+                        warmUpID,
+                        setAID;
 
     PetscLogEvent       solvingEvent,
-                        warmUpEvent;
+                        warmUpEvent,
+                        setAEvent;
 
 
@@ -208,8 +210,10 @@ int main(int argc, char **argv)
     // register a PETSc event for warm-up and solving
     ierr = PetscClassIdRegister("SolvingClass", &solvingID); CHKERRQ(ierr);
     ierr = PetscClassIdRegister("WarmUpClass", &warmUpID); CHKERRQ(ierr);
+    ierr = PetscClassIdRegister("SetAClass", &setAID); CHKERRQ(ierr);
     ierr = PetscLogEventRegister("Solving", solvingID, &solvingEvent); CHKERRQ(ierr);
     ierr = PetscLogEventRegister("WarmUp", warmUpID, &warmUpEvent); CHKERRQ(ierr);
+    ierr = PetscLogEventRegister("setA", setAID, &setAEvent); CHKERRQ(ierr);
 
 
@@ -240,7 +244,9 @@ int main(int argc, char **argv)
     ierr = MPI_Barrier(PETSC_COMM_WORLD); CHKERRQ(ierr);
 
+    PetscLogEventBegin(setAEvent, 0, 0, 0, 0);
     ierr = amgx.setA(A); CHKERRQ(ierr);
+    PetscLogEventEnd(setAEvent, 0, 0, 0, 0);
 
     ierr = solve(amgx, A, lhs, rhs, u_exact, err, args, warmUpEvent, solvingEvent); CHKERRQ(ierr);
 
diff --git a/src/AmgXSolver.hpp b/src/AmgXSolver.hpp
index ce8b8fe..5e2e06e 100644
--- a/src/AmgXSolver.hpp
+++ b/src/AmgXSolver.hpp
@@ -4,6 +4,7 @@
  * \author Pi-Yueh Chuang (pychuang@gwu.edu)
  * \date 2015-09-01
  * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *            This project is released under MIT License.
  */
 
@@ -403,16 +404,34 @@ class AmgXSolver
          */
         PetscErrorCode destroyLocalA(const Mat &A, Mat &localA);
 
+        /** \brief Check whether the global matrix distribution is contiguous.
+         *
+         * If the global matrix is distributed such that contiguous chunks of rows are
+         * distributed over the individual ranks in ascending rank order, the partition vector
+         * has a trivial structure (i.e. [0, ..., 0, 1, ..., 1, ..., R-1, ..., R-1] for R ranks)
+         * and its calculation can be skipped, since all information is available to AmgX through
+         * the number of ranks and the partition *offsets* (i.e. how many rows are on each rank).
+         *
+         * \param devIS [in] PETSc IS representing redistributed row indices.
+         * \param isContiguous [out] Whether the global matrix is contiguously distributed.
+         * \param partOffsets [out] If contiguous, holds the partition offsets for all R ranks.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode checkForContiguousPartitioning(
+            const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets);
 
-        /** \brief Get a partition vector required by AmgX.
+        /** \brief Get partition data required by AmgX.
          *
          * \param devIS [in] PETSc IS representing redistributed row indices.
          * \param N [in] Total number of rows in global matrix.
-         * \param partVec [out] Partition vector.
+         * \param partData [out] Partition data, either an explicit partition vector or offsets.
+         * \param usesOffsets [out] If PETSC_TRUE, the partitioning is contiguous and partData
+         *        contains partition offsets (see checkForContiguousPartitioning()); otherwise
+         *        it contains an explicit partition vector.
          * \return PetscErrorCode.
          */
-        PetscErrorCode getPartVec(const IS &devIS,
-                const PetscInt &N, std::vector<PetscInt> &partVec);
+        PetscErrorCode getPartData(const IS &devIS,
+                const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets);
 
 
         /** \brief Function that actually solves the system.
diff --git a/src/setA.cpp b/src/setA.cpp
index 325dc55..59572da 100644
--- a/src/setA.cpp
+++ b/src/setA.cpp
@@ -4,12 +4,14 @@
  * \author Pi-Yueh Chuang (pychuang@gwu.edu)
  * \date 2016-01-08
  * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *            This project is released under MIT License.
  */
 
 
 // STD
 # include <cstring>
+# include <algorithm>
 
 // AmgXSolver
 # include "AmgXSolver.hpp"
@@ -28,11 +30,12 @@ PetscErrorCode AmgXSolver::setA(const Mat &A)
 
     PetscInt            nGlobalRows,
                         nLocalRows;
+    PetscBool           usesOffsets;
 
     std::vector<PetscInt>       row;
     std::vector<PetscInt64>     col;
     std::vector<PetscScalar>    data;
-    std::vector<PetscInt>       partVec;
+    std::vector<PetscInt>       partData;
 
     // get number of rows in global matrix
 
@@ -51,18 +54,29 @@ PetscErrorCode AmgXSolver::setA(const Mat &A)
     ierr = destroyLocalA(A, localA); CHK;
 
     // get a partition vector required by AmgX
-    ierr = getPartVec(devIS, nGlobalRows, partVec); CHK;
+    ierr = getPartData(devIS, nGlobalRows, partData, usesOffsets); CHK;
 
     // upload matrix A to AmgX
     if (gpuWorld != MPI_COMM_NULL)
     {
         ierr = MPI_Barrier(gpuWorld); CHK;
-
-        AMGX_matrix_upload_all_global(
+        // offsets need to be 64 bit, since we use 64 bit column indices
+        std::vector<int64_t> offsets;
+
+        AMGX_distribution_handle dist;
+        AMGX_distribution_create(&dist, cfg);
+        if (usesOffsets) {
+            offsets.assign(partData.begin(), partData.end());
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_OFFSETS, offsets.data());
+        } else {
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_VECTOR, partData.data());
+        }
+        AMGX_matrix_upload_distributed(
                 AmgXA, nGlobalRows, nLocalRows, row[nLocalRows],
                 1, 1, row.data(), col.data(), data.data(),
-                nullptr, ring, ring, partVec.data());
+                nullptr, dist);
+        AMGX_distribution_destroy(dist);
 
         // bind the matrix A to the solver
         ierr = MPI_Barrier(gpuWorld); CHK;
@@ -298,10 +312,41 @@ PetscErrorCode AmgXSolver::destroyLocalA(const Mat &A, Mat &localA)
     PetscFunctionReturn(0);
 }
 
+/* \implements AmgXSolver::checkForContiguousPartitioning */
+PetscErrorCode AmgXSolver::checkForContiguousPartitioning(
+    const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets)
+{
+    PetscFunctionBeginUser;
+    PetscErrorCode ierr;
+    PetscBool sorted;
+    PetscInt ismax = -2; // marker for "unsorted"; allows checking after the global gather
 
-/* \implements AmgXSolver::getPartVec */
-PetscErrorCode AmgXSolver::getPartVec(
-        const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partVec)
+    ierr = ISSorted(devIS, &sorted); CHK;
+    if (sorted)
+    {
+        ierr = ISGetMinMax(devIS, NULL, &ismax); CHK;
+    }
+    partOffsets.resize(gpuWorldSize);
+    ++ismax; // add 1 to allow reusing the gathered ismax values as partition offsets for AmgX
+    MPI_Allgather(&ismax, 1, MPIU_INT, &partOffsets[0], 1, MPIU_INT, gpuWorld);
+    bool all_sorted = std::is_sorted(partOffsets.begin(), partOffsets.end())
+        && partOffsets[0] != -1;
+    if (all_sorted)
+    {
+        partOffsets.insert(partOffsets.begin(), 0); // partition 0 always starts at 0
+        isContiguous = PETSC_TRUE;
+    }
+    else
+    {
+        isContiguous = PETSC_FALSE;
+    }
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getPartData */
+PetscErrorCode AmgXSolver::getPartData(
+        const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets)
 {
     PetscFunctionBeginUser;
 
@@ -312,35 +357,39 @@ PetscErrorCode AmgXSolver::getPartVec(
                         tempSEQ;
 
     PetscInt            n;
-
     PetscScalar         *tempPartVec;
 
     ierr = ISGetLocalSize(devIS, &n); CHK;
 
     if (gpuWorld != MPI_COMM_NULL)
     {
-        ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;
+        // check if sorted/contiguous, then we can skip expensive scatters
+        checkForContiguousPartitioning(devIS, usesOffsets, partData);
+        if (!usesOffsets)
+        {
+            ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;
 
-        IS      is;
-        ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
-        ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
-        ierr = ISDestroy(&is); CHK;
+            IS      is;
+            ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
+            ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
+            ierr = ISDestroy(&is); CHK;
 
-        ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
-        ierr = VecScatterBegin(scatter,
-                tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
-        ierr = VecScatterEnd(scatter,
-                tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
-        ierr = VecScatterDestroy(&scatter); CHK;
-        ierr = VecDestroy(&tempMPI); CHK;
+            ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
+            ierr = VecScatterBegin(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterEnd(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterDestroy(&scatter); CHK;
+            ierr = VecDestroy(&tempMPI); CHK;
 
-        ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;
+            ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;
 
-        partVec.assign(tempPartVec, tempPartVec+N);
+            partData.assign(tempPartVec, tempPartVec+N);
 
-        ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;
+            ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;
 
-        ierr = VecDestroy(&tempSEQ); CHK;
+            ierr = VecDestroy(&tempSEQ); CHK;
+        }
     }
 
     ierr = MPI_Barrier(globalCpuWorld); CHK;