Performance improvements for the residual projection scheme (#115) (#120

)
Nek5000 · Jul 24, 2020 · 1462cd1 · 1462cd1
1 parent ef4a051
commit 1462cd1
Show file tree

Hide file tree

Showing 3 changed files with 140 additions and 13 deletions.
diff --git a/src/libP/solvers/elliptic/ellipticResidualProjection.h b/src/libP/solvers/elliptic/ellipticResidualProjection.h
@@ -57,6 +57,11 @@ class ResidualProjection final
                               const dlong a_offset,
                               occa::memory& o_b,
                               const dlong b_offset);
+  void multiWeightedInnerProduct(occa::memory& o_w,
+                              occa::memory& o_a,
+                              const dlong m,
+                              occa::memory& o_b,
+                              const dlong offset);
   elliptic_t& elliptic;
   const dlong maxNumVecsProjection;
   const dlong numTimeSteps;
@@ -72,14 +77,18 @@ class ResidualProjection final
   occa::kernel scalarMultiplyKernel;
   occa::kernel scaledAddwOffsetTwoVecKernel;
   occa::kernel scaledAddwOffsetKernel;
+  occa::kernel multiScaledAddwOffsetKernel;
+  occa::kernel subtractedMultiScaledAddwOffsetKernel;
   occa::kernel placeVectorKernel;
   occa::kernel extractVectorKernel;
   occa::kernel weightedInnerProduct2Kernel;
+  occa::kernel multiWeightedInnerProduct2Kernel;
   occa::kernel innerProductKernel;
   occa::kernel accumulateKernel;
 
   std::vector<dfloat> alpha; // host shadow
   std::vector<dfloat> work; // O(m) work array
+  std::vector<dfloat> multiwork; // O(Nblock*m) work array
 
   dlong numVecsProjection;
   dlong Ntotal; // vector size

diff --git a/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl b/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl
@@ -62,6 +62,36 @@
     if(n < N)
       x[n + destOffset * N] = alpha * x[n + sourceOffset * N] + beta * x[n + destOffset * N];
 }
+@kernel void multiScaledAddwOffset(const dlong N,
+                              const dlong m,
+                              @restrict const dfloat * alphas,
+                              @restrict dfloat*  x,
+                              const dfloat beta,
+                              const dlong destOffset)
+{
+  for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner)){
+    if(n < N){
+      for(dlong k = 0; k < m-1; ++k){
+        x[n + destOffset * N] = -alphas[k] * x[n + k * N] + beta * x[n + destOffset * N];
+      }
+    }
+  }
+}
+@kernel void subtractedMultiScaledAddwOffset(const dlong N,
+                              const dlong m,
+                              @restrict const dfloat * alphas,
+                              @restrict dfloat*  x,
+                              const dfloat beta,
+                              const dlong k)
+{
+  for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner)){
+    if(n < N){
+      for(dlong j = m-1; j >= k + 1; j--){
+        x[n + k * N] = -alphas[j] * x[n + j * N] + beta * x[n + k * N];
+      }
+    }
+  }
+}
 @kernel void scaledAddwOffsetTwoVec(const dlong N,
                                     const dfloat alpha,
                                     @restrict const dfloat*  x,
@@ -143,7 +173,6 @@
   }
 }
 
-// TODO: add offset
 @kernel void weightedInnerProduct2(const dlong N,
                                    @restrict const dfloat*  w,
                                    @restrict const dfloat*  x,
@@ -184,4 +213,55 @@
 
     for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  1) wxy[b] = s_wxy[0] + s_wxy[1];
   }
+}
+@kernel void multiWeightedInnerProduct2(const dlong N,
+                                   const dlong Nblock,
+                                   const dlong m,
+                                   @restrict const dfloat*  w,
+                                   @restrict const dfloat*  x,
+                                   @restrict const dfloat*  y,
+                                   const dlong offset,
+                                   @restrict dfloat*  wxy)
+{
+  for(dlong b = 0; b < (N + p_threadBlockSize - 1) / p_threadBlockSize; ++b; @outer(0)) {
+    @shared volatile dfloat s_wxy[p_threadBlockSize];
+    @exclusive dfloat w_loc;
+    @exclusive dfloat y_loc;
+
+    for(int v = 0 ; v < m; ++v){
+
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) {
+      const dlong id = t + p_threadBlockSize * b;
+      if(v == 0 && id < N){
+        w_loc = w[id];
+        y_loc = y[id+N*offset];
+      }
+      s_wxy[t] = (id < N) ? w_loc * x[id + N * v] * y_loc : 0.f;
+    }
+
+    @barrier("local");
+#if p_threadBlockSize > 512
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 512) s_wxy[t] += s_wxy[t + 512];
+    @barrier("local");
+#endif
+#if p_threadBlockSize > 256
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 256) s_wxy[t] += s_wxy[t + 256];
+    @barrier("local");
+#endif
+
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 128) s_wxy[t] += s_wxy[t + 128];
+    @barrier("local");
+
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 64) s_wxy[t] += s_wxy[t + 64];
+    @barrier("local");
+
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 32) s_wxy[t] += s_wxy[t + 32];
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 16) s_wxy[t] += s_wxy[t + 16];
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  8) s_wxy[t] += s_wxy[t + 8];
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  4) s_wxy[t] += s_wxy[t + 4];
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  2) s_wxy[t] += s_wxy[t + 2];
+
+    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  1) wxy[b+v*Nblock] = s_wxy[0] + s_wxy[1];
+  }
+  }
 }
diff --git a/src/libP/solvers/elliptic/src/ellipticResidualProjection.c b/src/libP/solvers/elliptic/src/ellipticResidualProjection.c
@@ -64,10 +64,9 @@ void ResidualProjection::reOrthogonalize()
                             + computeInnerProduct(o_bb,j,o_xx,k));
       }
       gop(alpha.data() + k,work.data(),(m - k) + 1);
-      for(int j = m - 1; j >= k + 1; j--) {
-        scaledAddwOffsetKernel(Ntotal, -alpha[j], o_xx,j, one, k);
-        scaledAddwOffsetKernel(Ntotal, -alpha[j], o_bb,j, one, k);
-      }
+      o_alpha.copyFrom(alpha.data(),sizeof(dfloat)*((m-k)+1));
+      subtractedMultiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_xx, one, k);
+      subtractedMultiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_bb, one, k);
       dfloat normp = sqrt(alpha[k]);
       dfloat normk = 0.0;
       if(useWeightedFormulation) {
@@ -114,20 +113,19 @@ void ResidualProjection::updateProjectionSpace()
   dlong m = numVecsProjection;
   if(m <= 0) return;
   if(useWeightedFormulation) {
-    for(int k = 0; k < m; ++k)
-      alpha[k] = weightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx,k,o_bb,m - 1);
+    multiWeightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx, m, o_bb, m-1);
   } else {
     for(int k = 0; k < m; ++k)
       alpha[k] = computeInnerProduct(o_xx,k,o_bb,m - 1);
   }
   gop(alpha.data(),work.data(),m);
+  o_alpha.copyFrom(alpha.data(),sizeof(dfloat)*m);
   const dfloat norm_orig = alpha[m - 1];
   dfloat norm_new = norm_orig;
   const dfloat one = 1.0;
+  multiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_xx, one, m-1);
+  multiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_bb, one, m-1);
   for(int k = 0; k < m - 1; ++k) {
-    const dfloat scale = -alpha[k];
-    scaledAddwOffsetKernel(Ntotal, scale, o_xx,k, one, m - 1);
-    scaledAddwOffsetKernel(Ntotal, scale, o_bb,k, one, m - 1);
     norm_new = norm_new - alpha[k] * alpha[k];
   }
   norm_new = sqrt(norm_new);
@@ -153,8 +151,7 @@ void ResidualProjection::computePreProjection(occa::memory& o_r)
   const int m = numVecsProjection;
   if(m <= 0) return;
   if(useWeightedFormulation) {
-    for(int k = 0; k < m; ++k)
-      alpha[k] = weightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_r,0,o_xx,k);
+    multiWeightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx,m,o_r,0);
   } else {
     for(int k = 0; k < m; ++k)
       alpha[k] = computeInnerProduct(o_r,0,o_xx,k);
@@ -211,6 +208,7 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic,
   verbose = elliptic.options.compareArgs("VERBOSE","TRUE");
   alpha.resize(maxNumVecsProjection);
   work.resize(maxNumVecsProjection);
+  multiwork.resize(Nblock*maxNumVecsProjection);
   o_alpha = elliptic.mesh->device.malloc < dfloat > (maxNumVecsProjection);
   o_xbar = elliptic.mesh->device.malloc < dfloat > (Ntotal);
   o_xx = elliptic.mesh->device.malloc < dfloat > (Ntotal * maxNumVecsProjection);
@@ -222,7 +220,9 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic,
     if ((r == 0 && elliptic.mesh->rank == 0) || (r == 1 && elliptic.mesh->rank > 0)) {
       occa::properties properties;
       properties += elliptic.mesh->device.properties();
-      properties["defines/p_threadBlockSize"] = 256;
+      properties["defines/p_threadBlockSize"] = blockSize;
+      properties["defines/p_blockSize"] = blockSize;
+      properties["defines/p_maxMultiVectors"] = maxNumVecsProjection;
       properties["defines/dfloat"] = dfloatString;
       properties["defines/dlong"] = dlongString;
 
@@ -238,10 +238,19 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic,
       scaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName,
                                                                  "scaledAddwOffset",
                                                                  properties);
+      multiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName,
+                                                                 "multiScaledAddwOffset",
+                                                                 properties);
+      subtractedMultiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName,
+                                                                 "subtractedMultiScaledAddwOffset",
+                                                                 properties);
       placeVectorKernel = elliptic.mesh->device.buildKernel(fileName, "placeVector", properties);
       weightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(fileName,
                                                                       "weightedInnerProduct2",
                                                                       properties);
+      multiWeightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(fileName,
+                                                                      "multiWeightedInnerProduct2",
+                                                                      properties);
       innerProductKernel = elliptic.mesh->device.buildKernel(fileName, "innerProduct", properties);
       accumulateKernel = elliptic.mesh->device.buildKernel(fileName, "accumulate", properties);
     }
@@ -354,3 +363,32 @@ dfloat ResidualProjection::weightedInnerProduct(occa::memory &o_w,
 
   return wab;
 }
+void ResidualProjection::multiWeightedInnerProduct(occa::memory &o_w,
+                                                occa::memory &o_a,
+                                                const dlong m,
+                                                occa::memory &o_b,
+                                                const dlong offset)
+{
+  setupAide &options = elliptic.options;
+
+  const int continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS");
+  const int serial = options.compareArgs("THREAD MODEL", "SERIAL");
+  int enableReductions = 1;
+  options.getArgs("DEBUG ENABLE REDUCTIONS", enableReductions);
+
+  mesh_t* mesh = elliptic.mesh;
+
+  const dlong Nlocal = mesh->Np * mesh->Nelements;
+  const dlong Nblock = elliptic.Nblock;
+
+  multiWeightedInnerProduct2Kernel(Nlocal, Nblock, m, o_w, o_a, o_b, offset, elliptic.o_wrk);
+  elliptic.o_wrk.copyTo(multiwork.data(), sizeof(dfloat)*m*Nblock);
+  for(dlong k = 0 ; k < m; ++k){
+    dfloat accum = 0.0;
+    for(dlong n = 0; n < Nblock; ++n){
+      accum += multiwork[n+k*Nblock];
+    }
+    alpha[k] = accum;
+  }
+  o_alpha.copyFrom(alpha.data(),sizeof(double)*m);
+}