diff --git a/src/libP/solvers/elliptic/ellipticResidualProjection.h b/src/libP/solvers/elliptic/ellipticResidualProjection.h index 7028ce27b..68a6ab96b 100644 --- a/src/libP/solvers/elliptic/ellipticResidualProjection.h +++ b/src/libP/solvers/elliptic/ellipticResidualProjection.h @@ -57,6 +57,11 @@ class ResidualProjection final const dlong a_offset, occa::memory& o_b, const dlong b_offset); + void multiWeightedInnerProduct(occa::memory& o_w, + occa::memory& o_a, + const dlong m, + occa::memory& o_b, + const dlong offset); elliptic_t& elliptic; const dlong maxNumVecsProjection; const dlong numTimeSteps; @@ -72,14 +77,18 @@ class ResidualProjection final occa::kernel scalarMultiplyKernel; occa::kernel scaledAddwOffsetTwoVecKernel; occa::kernel scaledAddwOffsetKernel; + occa::kernel multiScaledAddwOffsetKernel; + occa::kernel subtractedMultiScaledAddwOffsetKernel; occa::kernel placeVectorKernel; occa::kernel extractVectorKernel; occa::kernel weightedInnerProduct2Kernel; + occa::kernel multiWeightedInnerProduct2Kernel; occa::kernel innerProductKernel; occa::kernel accumulateKernel; std::vector alpha; // host shadow std::vector work; // O(m) work array + std::vector multiwork; // O(Nblock*m) work array dlong numVecsProjection; dlong Ntotal; // vector size diff --git a/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl b/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl index 85e80d173..edb40993d 100644 --- a/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl +++ b/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl @@ -62,6 +62,36 @@ if(n < N) x[n + destOffset * N] = alpha * x[n + sourceOffset * N] + beta * x[n + destOffset * N]; } +@kernel void multiScaledAddwOffset(const dlong N, + const dlong m, + @restrict const dfloat * alphas, + @restrict dfloat* x, + const dfloat beta, + const dlong destOffset) +{ + for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner)){ + if(n < N){ + for(dlong k = 0; k < m-1; ++k){ + x[n + destOffset * N] = -alphas[k] * x[n + k * N] + beta * x[n + destOffset * N]; + } + } + } +} +@kernel void subtractedMultiScaledAddwOffset(const dlong N, + const dlong m, + @restrict const dfloat * alphas, + @restrict dfloat* x, + const dfloat beta, + const dlong k) +{ + for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner)){ + if(n < N){ + for(dlong j = m-1; j >= k + 1; j--){ + x[n + k * N] = -alphas[j] * x[n + j * N] + beta * x[n + k * N]; + } + } + } +} @kernel void scaledAddwOffsetTwoVec(const dlong N, const dfloat alpha, @restrict const dfloat* x, @@ -143,7 +173,6 @@ } } -// TODO: add offset @kernel void weightedInnerProduct2(const dlong N, @restrict const dfloat* w, @restrict const dfloat* x, @@ -184,4 +213,55 @@ for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 1) wxy[b] = s_wxy[0] + s_wxy[1]; } +} +@kernel void multiWeightedInnerProduct2(const dlong N, + const dlong Nblock, + const dlong m, + @restrict const dfloat* w, + @restrict const dfloat* x, + @restrict const dfloat* y, + const dlong offset, + @restrict dfloat* wxy) +{ + for(dlong b = 0; b < (N + p_threadBlockSize - 1) / p_threadBlockSize; ++b; @outer(0)) { + @shared volatile dfloat s_wxy[p_threadBlockSize]; + @exclusive dfloat w_loc; + @exclusive dfloat y_loc; + + for(int v = 0 ; v < m; ++v){ + + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) { + const dlong id = t + p_threadBlockSize * b; + if(v == 0 && id < N){ + w_loc = w[id]; + y_loc = y[id+N*offset]; + } + s_wxy[t] = (id < N) ? w_loc * x[id + N * v] * y_loc : 0.f; + } + + @barrier("local"); +#if p_threadBlockSize > 512 + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 512) s_wxy[t] += s_wxy[t + 512]; + @barrier("local"); +#endif +#if p_threadBlockSize > 256 + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 256) s_wxy[t] += s_wxy[t + 256]; + @barrier("local"); +#endif + + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 128) s_wxy[t] += s_wxy[t + 128]; + @barrier("local"); + + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 64) s_wxy[t] += s_wxy[t + 64]; + @barrier("local"); + + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 32) s_wxy[t] += s_wxy[t + 32]; + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 16) s_wxy[t] += s_wxy[t + 16]; + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 8) s_wxy[t] += s_wxy[t + 8]; + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 4) s_wxy[t] += s_wxy[t + 4]; + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 2) s_wxy[t] += s_wxy[t + 2]; + + for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 1) wxy[b+v*Nblock] = s_wxy[0] + s_wxy[1]; + } + } } \ No newline at end of file diff --git a/src/libP/solvers/elliptic/src/ellipticResidualProjection.c b/src/libP/solvers/elliptic/src/ellipticResidualProjection.c index e639ac6a0..4717c7259 100644 --- a/src/libP/solvers/elliptic/src/ellipticResidualProjection.c +++ b/src/libP/solvers/elliptic/src/ellipticResidualProjection.c @@ -64,10 +64,9 @@ void ResidualProjection::reOrthogonalize() + computeInnerProduct(o_bb,j,o_xx,k)); } gop(alpha.data() + k,work.data(),(m - k) + 1); - for(int j = m - 1; j >= k + 1; j--) { - scaledAddwOffsetKernel(Ntotal, -alpha[j], o_xx,j, one, k); - scaledAddwOffsetKernel(Ntotal, -alpha[j], o_bb,j, one, k); - } + o_alpha.copyFrom(alpha.data(),sizeof(dfloat)*((m-k)+1)); + subtractedMultiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_xx, one, k); + subtractedMultiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_bb, one, k); dfloat normp = sqrt(alpha[k]); dfloat normk = 0.0; if(useWeightedFormulation) { @@ -114,20 +113,19 @@ void ResidualProjection::updateProjectionSpace() dlong m = numVecsProjection; if(m <= 0) return; if(useWeightedFormulation) { - for(int k = 0; k < m; ++k) - alpha[k] = weightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx,k,o_bb,m - 1); + multiWeightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx, m, o_bb, m-1); } else { for(int k = 0; k < m; ++k) alpha[k] = computeInnerProduct(o_xx,k,o_bb,m - 1); } gop(alpha.data(),work.data(),m); + o_alpha.copyFrom(alpha.data(),sizeof(dfloat)*m); const dfloat norm_orig = alpha[m - 1]; dfloat norm_new = norm_orig; const dfloat one = 1.0; + multiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_xx, one, m-1); + multiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_bb, one, m-1); for(int k = 0; k < m - 1; ++k) { - const dfloat scale = -alpha[k]; - scaledAddwOffsetKernel(Ntotal, scale, o_xx,k, one, m - 1); - scaledAddwOffsetKernel(Ntotal, scale, o_bb,k, one, m - 1); norm_new = norm_new - alpha[k] * alpha[k]; } norm_new = sqrt(norm_new); @@ -153,8 +151,7 @@ void ResidualProjection::computePreProjection(occa::memory& o_r) const int m = numVecsProjection; if(m <= 0) return; if(useWeightedFormulation) { - for(int k = 0; k < m; ++k) - alpha[k] = weightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_r,0,o_xx,k); + multiWeightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx,m,o_r,0); } else { for(int k = 0; k < m; ++k) alpha[k] = computeInnerProduct(o_r,0,o_xx,k); @@ -211,6 +208,7 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic, verbose = elliptic.options.compareArgs("VERBOSE","TRUE"); alpha.resize(maxNumVecsProjection); work.resize(maxNumVecsProjection); + multiwork.resize(Nblock*maxNumVecsProjection); o_alpha = elliptic.mesh->device.malloc < dfloat > (maxNumVecsProjection); o_xbar = elliptic.mesh->device.malloc < dfloat > (Ntotal); o_xx = elliptic.mesh->device.malloc < dfloat > (Ntotal * maxNumVecsProjection); @@ -222,7 +220,9 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic, if ((r == 0 && elliptic.mesh->rank == 0) || (r == 1 && elliptic.mesh->rank > 0)) { occa::properties properties; properties += elliptic.mesh->device.properties(); - properties["defines/p_threadBlockSize"] = 256; + properties["defines/p_threadBlockSize"] = blockSize; + properties["defines/p_blockSize"] = blockSize; + properties["defines/p_maxMultiVectors"] = maxNumVecsProjection; properties["defines/dfloat"] = dfloatString; properties["defines/dlong"] = dlongString; @@ -238,10 +238,19 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic, scaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName, "scaledAddwOffset", properties); + multiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName, + "multiScaledAddwOffset", + properties); + subtractedMultiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName, + "subtractedMultiScaledAddwOffset", + properties); placeVectorKernel = elliptic.mesh->device.buildKernel(fileName, "placeVector", properties); weightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(fileName, "weightedInnerProduct2", properties); + multiWeightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(fileName, + "multiWeightedInnerProduct2", + properties); innerProductKernel = elliptic.mesh->device.buildKernel(fileName, "innerProduct", properties); accumulateKernel = elliptic.mesh->device.buildKernel(fileName, "accumulate", properties); } @@ -354,3 +363,32 @@ dfloat ResidualProjection::weightedInnerProduct(occa::memory &o_w, return wab; } +void ResidualProjection::multiWeightedInnerProduct(occa::memory &o_w, + occa::memory &o_a, + const dlong m, + occa::memory &o_b, + const dlong offset) +{ + setupAide &options = elliptic.options; + + const int continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS"); + const int serial = options.compareArgs("THREAD MODEL", "SERIAL"); + int enableReductions = 1; + options.getArgs("DEBUG ENABLE REDUCTIONS", enableReductions); + + mesh_t* mesh = elliptic.mesh; + + const dlong Nlocal = mesh->Np * mesh->Nelements; + const dlong Nblock = elliptic.Nblock; + + multiWeightedInnerProduct2Kernel(Nlocal, Nblock, m, o_w, o_a, o_b, offset, elliptic.o_wrk); + elliptic.o_wrk.copyTo(multiwork.data(), sizeof(dfloat)*m*Nblock); + for(dlong k = 0 ; k < m; ++k){ + dfloat accum = 0.0; + for(dlong n = 0; n < Nblock; ++n){ + accum += multiwork[n+k*Nblock]; + } + alpha[k] = accum; + } + o_alpha.copyFrom(alpha.data(),sizeof(double)*m); +}