From f8a5afe1f252665e5ca9ee99d343d57fe58e6aa0 Mon Sep 17 00:00:00 2001
From: stgeke <stgeke@gmail.com>
Date: Sun, 20 Dec 2020 11:36:19 +0100
Subject: [PATCH] Revert "Enable residual projection for block solver (#189)"

This reverts commit 2eb77a5e17ada55de827a8833c658cb91f25219a.
---
 .travis.yml                                 |  6 --
 examples/ethier/ci.inc                      | 38 ++--------
 okl/elliptic/ellipticResidualProjection.okl | 37 +++------
 src/core/parReader.cpp                      |  2 +
 src/elliptic/ellipticResidualProjection.cpp | 83 +++++++++------------
 src/elliptic/ellipticResidualProjection.h   |  3 +-
 src/elliptic/ellipticSolve.cpp              |  2 +-
 7 files changed, 55 insertions(+), 116 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 862706fc8..e6ba2eb7b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,12 +37,6 @@ jobs:
    - stage: test
      name: "ethier subcycle"
      script: cd $NEKRS_EXAMPLES/ethier && nrsmpi ethier 2 2 
-   - stage: test
-     name: "ethier velocity and pressure projection"
-     script: cd $NEKRS_EXAMPLES/ethier && nrsmpi ethier 2 3 
-   - stage: test
-     name: "ethier (block) velocity and pressure projection with subcycling"
-     script: cd $NEKRS_EXAMPLES/ethier && nrsmpi ethier 2 4 
    - stage: test
      name: "lowMach default"
      script: cd $NEKRS_EXAMPLES/lowMach && nrsmpi lowMach 2 1 
diff --git a/examples/ethier/ci.inc b/examples/ethier/ci.inc
index c89a4861b..9ec77e509 100644
--- a/examples/ethier/ci.inc
+++ b/examples/ethier/ci.inc
@@ -19,7 +19,7 @@ void ciSetup(MPI_Comm comm, setupAide &options)
   options.setArgs("SCALAR00 DENSITY", string("1"));
   options.setArgs("SCALAR01 DIFFUSIVITY", string("0.01"));
   options.setArgs("SCALAR01 DENSITY", string("1"));
-  options.setArgs("END TIME", string("0.06"));
+  options.setArgs("END TIME", string("0.2"));
   options.setArgs("DT", string("2e-3"));
   options.setArgs("SUBCYCLING STEPS", string("0"));
   options.setArgs("PRESSURE RESIDUAL PROJECTION", "FALSE");
@@ -33,17 +33,6 @@ void ciSetup(MPI_Comm comm, setupAide &options)
     options.setArgs("SUBCYCLING STEPS", string("1"));
     options.setArgs("PRESSURE RESIDUAL PROJECTION", "TRUE");
   }
-  if (ciMode == 3) {
-    options.setArgs("PRESSURE RESIDUAL PROJECTION", "TRUE");
-    options.setArgs("VELOCITY RESIDUAL PROJECTION", "TRUE");
-  }
-  if (ciMode == 4) {
-    options.setArgs("END TIME", string("0.2"));
-    options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
-    options.setArgs("SUBCYCLING STEPS", string("1"));
-    options.setArgs("PRESSURE RESIDUAL PROJECTION", "TRUE");
-    options.setArgs("VELOCITY RESIDUAL PROJECTION", "TRUE");
-  }
   options.setArgs("TIME INTEGRATOR", "TOMBO3");
   options.setArgs("ADVECTION TYPE", "CONVECTIVE+CUBATURE");
   options.setArgs("VELOCITY SOLVER TOLERANCE", string("1e-12"));
@@ -64,7 +53,8 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)
 
   double *err = (double *) nek_scPtr(1);
 
-  double vxErr, prErr;
+  const double vxErr = abs((err[0] - 3.65E-10)/err[0]);
+  const double prErr = abs((err[1] - 6.71E-10)/err[1]);
   double s1Err, s2Err;
   
   int pIterErr;
@@ -72,32 +62,14 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)
 
   switch (ciMode) {
     case 1 : velIterErr = abs(nrs->NiterU - 10);
-             s1Err = abs((err[2] - 5.25E-12)/err[2]);
-             s2Err = abs((err[3] - 6.09E-12)/err[3]);
+             s1Err = abs((err[2] - 1.00E-11)/err[2]);
+             s2Err = abs((err[3] - 1.31E-11)/err[3]);
              pIterErr = abs(nrs->NiterP - 4);
-             vxErr = abs((err[0] - 2.78E-10)/err[0]);
-             prErr = abs((err[1] - 7.17E-10)/err[1]);
              break;
     case 2 : velIterErr = abs(nrs->NiterU - 10);
-             s1Err = abs((err[2] - 6.49E-12)/err[2]);
-             s2Err = abs((err[3] - 7.28E-12)/err[3]);
-             pIterErr = abs(nrs->NiterP - 1);
-             vxErr = abs((err[0] - 2.78E-10)/err[0]);
-             prErr = abs((err[1] - 8.38E-10)/err[1]);
-             break;
-    case 3 : velIterErr = abs(nrs->NiterU - 4);
-             s1Err = abs((err[2] - 5.25E-12)/err[2]);
-             s2Err = abs((err[3] - 6.09E-12)/err[3]);
-             pIterErr = abs(nrs->NiterP - 1);
-             vxErr = abs((err[0] - 2.78E-10)/err[0]);
-             prErr = abs((err[1] - 8.34E-10)/err[1]);
-             break;
-    case 4 : velIterErr = abs(nrs->NiterU - 1);
              s1Err = abs((err[2] - 1.71E-11)/err[2]);
              s2Err = abs((err[3] - 2.00E-11)/err[3]);
              pIterErr = abs(nrs->NiterP - 1);
-             vxErr = abs((err[0] - 3.65E-10)/err[0]);
-             prErr = abs((err[1] - 6.71E-10)/err[1]);
              break;
   }
 
diff --git a/okl/elliptic/ellipticResidualProjection.okl b/okl/elliptic/ellipticResidualProjection.okl
index 99545e9f6..b54e39962 100644
--- a/okl/elliptic/ellipticResidualProjection.okl
+++ b/okl/elliptic/ellipticResidualProjection.okl
@@ -25,18 +25,13 @@
  */
 
 @kernel void scalarMultiply(const dlong N,
-                            const dlong fieldOffset,
                             const dlong offset,
                             const dfloat alpha,
                             @restrict dfloat*  x)
 {
   for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
-    if(n < N){
-      #pragma unroll p_Nfields
-      for(dlong field = 0 ; field < p_Nfields; ++field){
-        x[n + offset + field * fieldOffset] = alpha * x[n + offset + field * fieldOffset];
-      }
-    }
+    if(n < N)
+      x[n + offset] = alpha * x[n + offset];
 }
 
 @kernel void multiScaledAddwOffset(const dlong N,
@@ -48,11 +43,9 @@
                                    @restrict dfloat*  x)
 {
   for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
-    if(n < N){
+    if(n < N)
       for(dlong k = 0; k < m - 1; ++k)
-        for(dlong fld = 0; fld < p_Nfields; ++fld)
-          x[n + destOffset + fld * fieldOffset] = -alphas[k] * x[n + p_Nfields * k * fieldOffset + fld * fieldOffset] + beta * x[n + destOffset + fld * fieldOffset];
-    }
+        x[n + destOffset] = -alphas[k] * x[n + k * fieldOffset] + beta * x[n + destOffset];
 }
 
 @kernel void accumulate(const dlong N,
@@ -65,12 +58,10 @@
   for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
     if(n < N) {
       // y = alpha[0] * o_xx[:,0]
-      for(dlong fld = 0 ; fld < p_Nfields; ++fld)
-        y[n + fld * fieldOffset] = alpha[0] * x[n + fld * fieldOffset];
+      y[n] = alpha[0] * x[n];
       for(dlong k = 1; k < m; ++k)
-        for(dlong fld = 0 ; fld < p_Nfields; ++fld)
-          // y += alpha[k] * o_xx[:,k]
-          y[n + fld * fieldOffset] += alpha[k] * x[n + p_Nfields * k * fieldOffset + fld * fieldOffset];
+        // y += alpha[k] * o_xx[:,k]
+        y[n] += alpha[k] * x[n + k * fieldOffset];
     }
 }
 
@@ -87,24 +78,16 @@
   for(dlong b = 0; b < (N + p_threadBlockSize - 1) / p_threadBlockSize; ++b; @outer(0)) {
     @shared volatile dfloat s_wxy[p_threadBlockSize];
     @exclusive dfloat w_loc;
-    @exclusive dfloat y_loc[p_Nfields];
+    @exclusive dfloat y_loc;
 
     for(int v = 0; v < m; ++v) {
       for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) {
         const dlong id = t + p_threadBlockSize * b;
         if(v == 0 && id < N) {
           w_loc = w[id];
-          for(dlong fld = 0; fld < p_Nfields; ++fld){
-            y_loc[fld] = y[id + offset + fld * fieldOffset];
-          }
-        }
-        dfloat res = 0.f;
-        if(id < N){
-          for(dlong fld = 0; fld < p_Nfields; ++fld){
-            res += w_loc * x[id + p_Nfields * fieldOffset * v + fld * fieldOffset] * y_loc[fld];
-          }
+          y_loc = y[id + offset];
         }
-        s_wxy[t] = res;
+        s_wxy[t] = (id < N) ? w_loc * x[id + fieldOffset * v] * y_loc : 0.f;
       }
 
       @barrier("local");
diff --git a/src/core/parReader.cpp b/src/core/parReader.cpp
index 7539d381d..dad1ac845 100644
--- a/src/core/parReader.cpp
+++ b/src/core/parReader.cpp
@@ -503,6 +503,8 @@ setupAide parRead(std::string &setupFile, MPI_Comm comm)
       options.setArgs("VELOCITY BLOCK SOLVER", "FALSE");
       if(std::strstr(vsolver.c_str(), "block")) {
         options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
+        if(options.compareArgs("VELOCITY RESIDUAL PROJECTION","TRUE"))
+          exit("Residual projection is not enabled for the velocity block solver!", EXIT_FAILURE);
       }
     }
 
diff --git a/src/elliptic/ellipticResidualProjection.cpp b/src/elliptic/ellipticResidualProjection.cpp
index 641eab748..97dbdafd8 100644
--- a/src/elliptic/ellipticResidualProjection.cpp
+++ b/src/elliptic/ellipticResidualProjection.cpp
@@ -33,30 +33,31 @@ void ResidualProjection::matvec(occa::memory& o_Ax,
                                 occa::memory& o_x,
                                 const dlong x_offset)
 {
-  occa::memory o_xtmp = o_x + Nfields * fieldOffset * x_offset * sizeof(dfloat);
-  occa::memory o_Axtmp = o_Ax + Nfields * fieldOffset * Ax_offset * sizeof(dfloat);
+  occa::memory o_xtmp = o_x + fieldOffset * x_offset * sizeof(dfloat);
+  occa::memory o_Axtmp = o_Ax + fieldOffset * Ax_offset * sizeof(dfloat);
   matvecOperator(o_xtmp, o_Axtmp);
 }
 
 void ResidualProjection::updateProjectionSpace()
 {
-  if(numVecsProjection <= 0) return;
+  dlong m = numVecsProjection;
+  if(m <= 0) return;
 
-  multiWeightedInnerProduct(o_xx, o_bb, numVecsProjection - 1);
-  const dfloat norm_orig = alpha[numVecsProjection - 1];
+  multiWeightedInnerProduct(o_xx, m, o_bb, m - 1);
+  const dfloat norm_orig = alpha[m - 1];
   dfloat norm_new = norm_orig;
   const dfloat one = 1.0;
-  multiScaledAddwOffsetKernel(Nlocal, numVecsProjection, Nfields * (numVecsProjection - 1) * fieldOffset, fieldOffset, o_alpha, one, o_xx);
-  multiScaledAddwOffsetKernel(Nlocal, numVecsProjection, Nfields * (numVecsProjection - 1) * fieldOffset, fieldOffset, o_alpha, one, o_bb);
-  for(int k = 0; k < numVecsProjection - 1; ++k)
+  multiScaledAddwOffsetKernel(Nlocal, m, (m - 1) * fieldOffset, fieldOffset, o_alpha, one, o_xx);
+  multiScaledAddwOffsetKernel(Nlocal, m, (m - 1) * fieldOffset, fieldOffset, o_alpha, one, o_bb);
+  for(int k = 0; k < m - 1; ++k)
     norm_new = norm_new - alpha[k] * alpha[k];
   norm_new = sqrt(norm_new);
   dfloat tol = 1e-7;
   const dfloat test = norm_new / norm_orig;
   if(test > tol) {
     const dfloat scale = 1.0 / norm_new;
-    scalarMultiplyKernel(Nlocal, fieldOffset, Nfields * (numVecsProjection - 1) * fieldOffset, scale, o_xx);
-    scalarMultiplyKernel(Nlocal, fieldOffset, Nfields * (numVecsProjection - 1) * fieldOffset, scale, o_bb);
+    scalarMultiplyKernel(Nlocal, (m - 1) * fieldOffset, scale, o_xx);
+    scalarMultiplyKernel(Nlocal, (m - 1) * fieldOffset, scale, o_bb);
   } else {
     if(verbose && rank == 0) {
       std::cout << "Detected rank deficiency: " << test << ".\n";
@@ -71,16 +72,13 @@ void ResidualProjection::computePreProjection(occa::memory& o_r)
   dfloat one = 1.0;
   dfloat zero = 0.0;
   dfloat mone = -1.0;
-  if(numVecsProjection <= 0) return;
-  multiWeightedInnerProduct(o_xx,o_r,0);
+  const int m = numVecsProjection;
+  if(m <= 0) return;
+  multiWeightedInnerProduct(o_xx,m,o_r,0);
 
-  accumulateKernel(Nlocal, numVecsProjection, fieldOffset, o_alpha, o_xx, o_xbar);
-  accumulateKernel(Nlocal, numVecsProjection, fieldOffset, o_alpha, o_bb, o_rtmp);
-  if(blockSolver){
-    scaledAddKernel(Nlocal, fieldOffset, mone, o_rtmp, one, o_r);
-  } else {
-    scaledAddKernel(Nlocal, mone, o_rtmp, one, o_r);
-  }
+  accumulateKernel(Nlocal, m, fieldOffset, o_alpha, o_xx, o_xbar);
+  accumulateKernel(Nlocal, m, fieldOffset, o_alpha, o_bb, o_rtmp);
+  scaledAddKernel(Nlocal, mone, o_rtmp, one, o_r);
 }
 
 void ResidualProjection::computePostProjection(occa::memory & o_x)
@@ -91,33 +89,25 @@ void ResidualProjection::computePostProjection(occa::memory & o_x)
   if(numVecsProjection == 0) {
     // reset bases
     numVecsProjection = 1;
-    o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat));
+    o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat));
   } else if(numVecsProjection == maxNumVecsProjection) {
     numVecsProjection = 1;
-    if(blockSolver){
-      scaledAddKernel(Nlocal, fieldOffset, one, o_xbar, one, o_x);
-    } else {
-      scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
-    }
-    o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat));
+    scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
+    o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat));
   } else {
     numVecsProjection++;
     // xx[m-1] = x
-    o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat), Nfields * (numVecsProjection - 1) * fieldOffset * sizeof(dfloat), 0);
+    o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat), (numVecsProjection - 1) * fieldOffset * sizeof(dfloat), 0);
     // x = x + xbar
-    if(blockSolver){
-      scaledAddKernel(Nlocal, fieldOffset, one, o_xbar, one, o_x);
-    } else {
-      scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
-    }
+    scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
   }
-  const dlong previousNumVecsProjection = numVecsProjection;
+  const dlong m_save = numVecsProjection;
   matvec(o_bb,numVecsProjection - 1,o_xx,numVecsProjection - 1);
 
   updateProjectionSpace();
-  if (numVecsProjection < previousNumVecsProjection) { // Last vector was linearly dependent, reset space
+  if (numVecsProjection < m_save) { // Last vector was linearly dependent, reset space
     numVecsProjection = 1;
-    o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat)); // writes first n words of o_xx, first approximation vector
+    o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat)); // writes first n words of o_xx, first approximation vector
     matvec(o_bb,0,o_xx,0);
     updateProjectionSpace();
   }
@@ -131,14 +121,12 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic,
   numTimeSteps(_numTimeSteps),
   Nlocal(elliptic.mesh->Np * elliptic.mesh->Nelements),
   fieldOffset(elliptic.Ntotal),
-  Nfields(elliptic.Nfields),
   Nblock(elliptic.Nblock),
   Nblock2(elliptic.Nblock2),
   resNormFactor(elliptic.resNormFactor),
   rank(elliptic.mesh->rank),
   size(elliptic.mesh->size),
   comm(elliptic.mesh->comm),
-  blockSolver(elliptic.blockSolver),
   o_tmp(elliptic.o_tmp),
   o_tmp2(elliptic.o_tmp2),
   o_wrk(elliptic.o_wrk),
@@ -155,9 +143,9 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic,
   work = (dfloat*) calloc(maxNumVecsProjection, sizeof(dfloat));
   multiwork = (dfloat*) calloc(Nblock * maxNumVecsProjection, sizeof(dfloat));
   o_alpha = elliptic.mesh->device.malloc(maxNumVecsProjection * sizeof(dfloat));
-  o_xbar = elliptic.mesh->device.malloc(Nfields * fieldOffset * sizeof(dfloat));
-  o_xx = elliptic.mesh->device.malloc(Nfields * fieldOffset * maxNumVecsProjection * sizeof(dfloat));
-  o_bb = elliptic.mesh->device.malloc(Nfields * fieldOffset * maxNumVecsProjection * sizeof(dfloat));
+  o_xbar = elliptic.mesh->device.malloc(Nlocal * sizeof(dfloat));
+  o_xx = elliptic.mesh->device.malloc(fieldOffset * maxNumVecsProjection * sizeof(dfloat));
+  o_bb = elliptic.mesh->device.malloc(fieldOffset * maxNumVecsProjection * sizeof(dfloat));
 
   string install_dir;
   install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
@@ -172,7 +160,6 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic,
       properties["defines/p_blockSize"] = BLOCKSIZE;
       properties["defines/dfloat"] = dfloatString;
       properties["defines/dlong"] = dlongString;
-      properties["defines/p_Nfields"] = Nfields;
 
       filename = oklpath + "ellipticResidualProjection.okl";
       scalarMultiplyKernel = elliptic.mesh->device.buildKernel(filename.c_str(),
@@ -206,7 +193,8 @@ void ResidualProjection::pre(occa::memory& o_r)
   if(timestep < numTimeSteps)
     return;
 
-  if(numVecsProjection <= 0) return;
+  const int m = numVecsProjection;
+  if(m <= 0) return;
   dfloat priorResidualNorm = 0.0;
   if(verbose) priorResidualNorm =
       sqrt(weightedNorm(o_r) * resNormFactor);
@@ -234,23 +222,24 @@ void ResidualProjection::post(occa::memory& o_x)
 
 void ResidualProjection::multiWeightedInnerProduct(
   occa::memory &o_a,
+  const dlong m,
   occa::memory &o_b,
   const dlong offset)
 {
 #ifdef ELLIPTIC_ENABLE_TIMER
   timer::tic("dotp",1);
 #endif
-  multiWeightedInnerProduct2Kernel(Nlocal, fieldOffset, Nblock, numVecsProjection, Nfields * offset * fieldOffset, o_invDegree, o_a, o_b, o_wrk);
+  multiWeightedInnerProduct2Kernel(Nlocal, fieldOffset, Nblock, m, offset * fieldOffset, o_invDegree, o_a, o_b, o_wrk);
 
-  o_wrk.copyTo(multiwork, sizeof(dfloat) * numVecsProjection * Nblock);
-  for(dlong k = 0; k < numVecsProjection; ++k) {
+  o_wrk.copyTo(multiwork, sizeof(dfloat) * m * Nblock);
+  for(dlong k = 0; k < m; ++k) {
     dfloat accum = 0.0;
     for(dlong n = 0; n < Nblock; ++n)
       accum += multiwork[n + k * Nblock];
     alpha[k] = accum;
   }
-  MPI_Allreduce(MPI_IN_PLACE, alpha, numVecsProjection, MPI_DFLOAT, MPI_SUM, comm);
-  o_alpha.copyFrom(alpha,sizeof(dfloat) * numVecsProjection);
+  MPI_Allreduce(MPI_IN_PLACE, alpha, m, MPI_DFLOAT, MPI_SUM, comm);
+  o_alpha.copyFrom(alpha,sizeof(dfloat) * m);
 #ifdef ELLIPTIC_ENABLE_TIMER
   timer::toc("dotp");
 #endif
diff --git a/src/elliptic/ellipticResidualProjection.h b/src/elliptic/ellipticResidualProjection.h
index 2e414d848..625419e94 100644
--- a/src/elliptic/ellipticResidualProjection.h
+++ b/src/elliptic/ellipticResidualProjection.h
@@ -46,6 +46,7 @@ class ResidualProjection final
   void matvec(occa::memory& o_Ax, const dlong Ax_offset, occa::memory& o_x, const dlong x_offset);
   void multiWeightedInnerProduct(
     occa::memory& o_a,
+    const dlong m,
     occa::memory& o_b,
     const dlong offset);
   const dlong maxNumVecsProjection;
@@ -80,13 +81,11 @@ class ResidualProjection final
   dlong numVecsProjection;
   const dlong Nlocal; // vector size
   const dlong fieldOffset; // offset
-  const dlong Nfields;
   const dlong Nblock;
   const dlong Nblock2;
   const dfloat resNormFactor;
   const int rank;
   const int size;
-  const int blockSolver;
   MPI_Comm comm;
 
   std::function<void(occa::memory&,occa::memory&)> matvecOperator;
diff --git a/src/elliptic/ellipticSolve.cpp b/src/elliptic/ellipticSolve.cpp
index 9dac1beac..1cb888870 100644
--- a/src/elliptic/ellipticSolve.cpp
+++ b/src/elliptic/ellipticSolve.cpp
@@ -44,7 +44,7 @@ int ellipticSolve(elliptic_t* elliptic,
     ellipticUpdateJacobi(elliptic);
 
   if(options.compareArgs("RESIDUAL PROJECTION","TRUE"))
-    elliptic->o_x0.copyFrom(o_x, elliptic->Nfields * elliptic->Ntotal * sizeof(dfloat));
+    elliptic->o_x0.copyFrom(o_x, elliptic->Ntotal * sizeof(dfloat));
 
   // compute initial residual
   ellipticOperator(elliptic, o_x, elliptic->o_Ap, dfloatString);