Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable residual projection for block solver #189

Merged
merged 6 commits into from
Dec 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ jobs:
- stage: test
name: "ethier subcycle"
script: cd $NEKRS_EXAMPLES/ethier && nrsmpi ethier 2 2
- stage: test
name: "ethier velocity and pressure projection"
script: cd $NEKRS_EXAMPLES/ethier && nrsmpi ethier 2 3
- stage: test
name: "ethier (block) velocity and pressure projection with subcycling"
script: cd $NEKRS_EXAMPLES/ethier && nrsmpi ethier 2 4
- stage: test
name: "lowMach default"
script: cd $NEKRS_EXAMPLES/lowMach && nrsmpi lowMach 2 1
Expand Down
38 changes: 33 additions & 5 deletions examples/ethier/ci.inc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void ciSetup(MPI_Comm comm, setupAide &options)
options.setArgs("SCALAR00 DENSITY", string("1"));
options.setArgs("SCALAR01 DIFFUSIVITY", string("0.01"));
options.setArgs("SCALAR01 DENSITY", string("1"));
options.setArgs("END TIME", string("0.2"));
options.setArgs("END TIME", string("0.06"));
options.setArgs("DT", string("2e-3"));
options.setArgs("SUBCYCLING STEPS", string("0"));
options.setArgs("PRESSURE RESIDUAL PROJECTION", "FALSE");
Expand All @@ -33,6 +33,17 @@ void ciSetup(MPI_Comm comm, setupAide &options)
options.setArgs("SUBCYCLING STEPS", string("1"));
options.setArgs("PRESSURE RESIDUAL PROJECTION", "TRUE");
}
if (ciMode == 3) {
options.setArgs("PRESSURE RESIDUAL PROJECTION", "TRUE");
options.setArgs("VELOCITY RESIDUAL PROJECTION", "TRUE");
}
if (ciMode == 4) {
options.setArgs("END TIME", string("0.2"));
options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
options.setArgs("SUBCYCLING STEPS", string("1"));
options.setArgs("PRESSURE RESIDUAL PROJECTION", "TRUE");
options.setArgs("VELOCITY RESIDUAL PROJECTION", "TRUE");
}
options.setArgs("TIME INTEGRATOR", "TOMBO3");
options.setArgs("ADVECTION TYPE", "CONVECTIVE+CUBATURE");
options.setArgs("VELOCITY SOLVER TOLERANCE", string("1e-12"));
Expand All @@ -53,23 +64,40 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)

double *err = (double *) nek_scPtr(1);

const double vxErr = abs((err[0] - 3.65E-10)/err[0]);
const double prErr = abs((err[1] - 6.71E-10)/err[1]);
double vxErr, prErr;
double s1Err, s2Err;

int pIterErr;
int velIterErr;

switch (ciMode) {
case 1 : velIterErr = abs(nrs->NiterU - 10);
s1Err = abs((err[2] - 1.00E-11)/err[2]);
s2Err = abs((err[3] - 1.31E-11)/err[3]);
s1Err = abs((err[2] - 5.25E-12)/err[2]);
s2Err = abs((err[3] - 6.09E-12)/err[3]);
pIterErr = abs(nrs->NiterP - 4);
vxErr = abs((err[0] - 2.78E-10)/err[0]);
prErr = abs((err[1] - 7.17E-10)/err[1]);
break;
case 2 : velIterErr = abs(nrs->NiterU - 10);
s1Err = abs((err[2] - 6.49E-12)/err[2]);
s2Err = abs((err[3] - 7.28E-12)/err[3]);
pIterErr = abs(nrs->NiterP - 1);
vxErr = abs((err[0] - 2.78E-10)/err[0]);
prErr = abs((err[1] - 8.38E-10)/err[1]);
break;
case 3 : velIterErr = abs(nrs->NiterU - 4);
s1Err = abs((err[2] - 5.25E-12)/err[2]);
s2Err = abs((err[3] - 6.09E-12)/err[3]);
pIterErr = abs(nrs->NiterP - 1);
vxErr = abs((err[0] - 2.78E-10)/err[0]);
prErr = abs((err[1] - 8.34E-10)/err[1]);
break;
case 4 : velIterErr = abs(nrs->NiterU - 1);
s1Err = abs((err[2] - 1.71E-11)/err[2]);
s2Err = abs((err[3] - 2.00E-11)/err[3]);
pIterErr = abs(nrs->NiterP - 1);
vxErr = abs((err[0] - 3.65E-10)/err[0]);
prErr = abs((err[1] - 6.71E-10)/err[1]);
break;
}

Expand Down
37 changes: 27 additions & 10 deletions okl/elliptic/ellipticResidualProjection.okl
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,18 @@
*/

@kernel void scalarMultiply(const dlong N,
const dlong fieldOffset,
const dlong offset,
const dfloat alpha,
@restrict dfloat* x)
{
for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
if(n < N)
x[n + offset] = alpha * x[n + offset];
if(n < N){
#pragma unroll p_Nfields
for(dlong field = 0 ; field < p_Nfields; ++field){
x[n + offset + field * fieldOffset] = alpha * x[n + offset + field * fieldOffset];
}
}
}

@kernel void multiScaledAddwOffset(const dlong N,
Expand All @@ -43,9 +48,11 @@
@restrict dfloat* x)
{
for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
if(n < N)
if(n < N){
for(dlong k = 0; k < m - 1; ++k)
x[n + destOffset] = -alphas[k] * x[n + k * fieldOffset] + beta * x[n + destOffset];
for(dlong fld = 0; fld < p_Nfields; ++fld)
x[n + destOffset + fld * fieldOffset] = -alphas[k] * x[n + p_Nfields * k * fieldOffset + fld * fieldOffset] + beta * x[n + destOffset + fld * fieldOffset];
}
}

@kernel void accumulate(const dlong N,
Expand All @@ -58,10 +65,12 @@
for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
if(n < N) {
// y = alpha[0] * o_xx[:,0]
y[n] = alpha[0] * x[n];
for(dlong fld = 0 ; fld < p_Nfields; ++fld)
y[n + fld * fieldOffset] = alpha[0] * x[n + fld * fieldOffset];
for(dlong k = 1; k < m; ++k)
// y += alpha[k] * o_xx[:,k]
y[n] += alpha[k] * x[n + k * fieldOffset];
for(dlong fld = 0 ; fld < p_Nfields; ++fld)
// y += alpha[k] * o_xx[:,k]
y[n + fld * fieldOffset] += alpha[k] * x[n + p_Nfields * k * fieldOffset + fld * fieldOffset];
}
}

Expand All @@ -78,16 +87,24 @@
for(dlong b = 0; b < (N + p_threadBlockSize - 1) / p_threadBlockSize; ++b; @outer(0)) {
@shared volatile dfloat s_wxy[p_threadBlockSize];
@exclusive dfloat w_loc;
@exclusive dfloat y_loc;
@exclusive dfloat y_loc[p_Nfields];

for(int v = 0; v < m; ++v) {
for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) {
const dlong id = t + p_threadBlockSize * b;
if(v == 0 && id < N) {
w_loc = w[id];
y_loc = y[id + offset];
for(dlong fld = 0; fld < p_Nfields; ++fld){
y_loc[fld] = y[id + offset + fld * fieldOffset];
}
}
dfloat res = 0.f;
if(id < N){
for(dlong fld = 0; fld < p_Nfields; ++fld){
res += w_loc * x[id + p_Nfields * fieldOffset * v + fld * fieldOffset] * y_loc[fld];
}
}
s_wxy[t] = (id < N) ? w_loc * x[id + fieldOffset * v] * y_loc : 0.f;
s_wxy[t] = res;
}

@barrier("local");
Expand Down
2 changes: 0 additions & 2 deletions src/core/parReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,8 +503,6 @@ setupAide parRead(std::string &setupFile, MPI_Comm comm)
options.setArgs("VELOCITY BLOCK SOLVER", "FALSE");
if(std::strstr(vsolver.c_str(), "block")) {
options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
if(options.compareArgs("VELOCITY RESIDUAL PROJECTION","TRUE"))
exit("Residual projection is not enabled for the velocity block solver!", EXIT_FAILURE);
}
}

Expand Down
83 changes: 47 additions & 36 deletions src/elliptic/ellipticResidualProjection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,31 +33,30 @@ void ResidualProjection::matvec(occa::memory& o_Ax,
occa::memory& o_x,
const dlong x_offset)
{
occa::memory o_xtmp = o_x + fieldOffset * x_offset * sizeof(dfloat);
occa::memory o_Axtmp = o_Ax + fieldOffset * Ax_offset * sizeof(dfloat);
occa::memory o_xtmp = o_x + Nfields * fieldOffset * x_offset * sizeof(dfloat);
occa::memory o_Axtmp = o_Ax + Nfields * fieldOffset * Ax_offset * sizeof(dfloat);
matvecOperator(o_xtmp, o_Axtmp);
}

void ResidualProjection::updateProjectionSpace()
{
dlong m = numVecsProjection;
if(m <= 0) return;
if(numVecsProjection <= 0) return;

multiWeightedInnerProduct(o_xx, m, o_bb, m - 1);
const dfloat norm_orig = alpha[m - 1];
multiWeightedInnerProduct(o_xx, o_bb, numVecsProjection - 1);
const dfloat norm_orig = alpha[numVecsProjection - 1];
dfloat norm_new = norm_orig;
const dfloat one = 1.0;
multiScaledAddwOffsetKernel(Nlocal, m, (m - 1) * fieldOffset, fieldOffset, o_alpha, one, o_xx);
multiScaledAddwOffsetKernel(Nlocal, m, (m - 1) * fieldOffset, fieldOffset, o_alpha, one, o_bb);
for(int k = 0; k < m - 1; ++k)
multiScaledAddwOffsetKernel(Nlocal, numVecsProjection, Nfields * (numVecsProjection - 1) * fieldOffset, fieldOffset, o_alpha, one, o_xx);
multiScaledAddwOffsetKernel(Nlocal, numVecsProjection, Nfields * (numVecsProjection - 1) * fieldOffset, fieldOffset, o_alpha, one, o_bb);
for(int k = 0; k < numVecsProjection - 1; ++k)
norm_new = norm_new - alpha[k] * alpha[k];
norm_new = sqrt(norm_new);
dfloat tol = 1e-7;
const dfloat test = norm_new / norm_orig;
if(test > tol) {
const dfloat scale = 1.0 / norm_new;
scalarMultiplyKernel(Nlocal, (m - 1) * fieldOffset, scale, o_xx);
scalarMultiplyKernel(Nlocal, (m - 1) * fieldOffset, scale, o_bb);
scalarMultiplyKernel(Nlocal, fieldOffset, Nfields * (numVecsProjection - 1) * fieldOffset, scale, o_xx);
scalarMultiplyKernel(Nlocal, fieldOffset, Nfields * (numVecsProjection - 1) * fieldOffset, scale, o_bb);
} else {
if(verbose && rank == 0) {
std::cout << "Detected rank deficiency: " << test << ".\n";
Expand All @@ -72,13 +71,16 @@ void ResidualProjection::computePreProjection(occa::memory& o_r)
dfloat one = 1.0;
dfloat zero = 0.0;
dfloat mone = -1.0;
const int m = numVecsProjection;
if(m <= 0) return;
multiWeightedInnerProduct(o_xx,m,o_r,0);
if(numVecsProjection <= 0) return;
multiWeightedInnerProduct(o_xx,o_r,0);

accumulateKernel(Nlocal, m, fieldOffset, o_alpha, o_xx, o_xbar);
accumulateKernel(Nlocal, m, fieldOffset, o_alpha, o_bb, o_rtmp);
scaledAddKernel(Nlocal, mone, o_rtmp, one, o_r);
accumulateKernel(Nlocal, numVecsProjection, fieldOffset, o_alpha, o_xx, o_xbar);
accumulateKernel(Nlocal, numVecsProjection, fieldOffset, o_alpha, o_bb, o_rtmp);
if(blockSolver){
scaledAddKernel(Nlocal, fieldOffset, mone, o_rtmp, one, o_r);
} else {
scaledAddKernel(Nlocal, mone, o_rtmp, one, o_r);
}
}

void ResidualProjection::computePostProjection(occa::memory & o_x)
Expand All @@ -89,25 +91,33 @@ void ResidualProjection::computePostProjection(occa::memory & o_x)
if(numVecsProjection == 0) {
// reset bases
numVecsProjection = 1;
o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat));
o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat));
} else if(numVecsProjection == maxNumVecsProjection) {
numVecsProjection = 1;
scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat));
if(blockSolver){
scaledAddKernel(Nlocal, fieldOffset, one, o_xbar, one, o_x);
} else {
scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
}
o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat));
} else {
numVecsProjection++;
// xx[m-1] = x
o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat), (numVecsProjection - 1) * fieldOffset * sizeof(dfloat), 0);
o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat), Nfields * (numVecsProjection - 1) * fieldOffset * sizeof(dfloat), 0);
// x = x + xbar
scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
if(blockSolver){
scaledAddKernel(Nlocal, fieldOffset, one, o_xbar, one, o_x);
} else {
scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
}
}
const dlong m_save = numVecsProjection;
const dlong previousNumVecsProjection = numVecsProjection;
matvec(o_bb,numVecsProjection - 1,o_xx,numVecsProjection - 1);

updateProjectionSpace();
if (numVecsProjection < m_save) { // Last vector was linearly dependent, reset space
if (numVecsProjection < previousNumVecsProjection) { // Last vector was linearly dependent, reset space
numVecsProjection = 1;
o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat)); // writes first n words of o_xx, first approximation vector
o_xx.copyFrom(o_x, Nfields * fieldOffset * sizeof(dfloat)); // writes first n words of o_xx, first approximation vector
matvec(o_bb,0,o_xx,0);
updateProjectionSpace();
}
Expand All @@ -121,12 +131,14 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic,
numTimeSteps(_numTimeSteps),
Nlocal(elliptic.mesh->Np * elliptic.mesh->Nelements),
fieldOffset(elliptic.Ntotal),
Nfields(elliptic.Nfields),
Nblock(elliptic.Nblock),
Nblock2(elliptic.Nblock2),
resNormFactor(elliptic.resNormFactor),
rank(elliptic.mesh->rank),
size(elliptic.mesh->size),
comm(elliptic.mesh->comm),
blockSolver(elliptic.blockSolver),
o_tmp(elliptic.o_tmp),
o_tmp2(elliptic.o_tmp2),
o_wrk(elliptic.o_wrk),
Expand All @@ -143,9 +155,9 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic,
work = (dfloat*) calloc(maxNumVecsProjection, sizeof(dfloat));
multiwork = (dfloat*) calloc(Nblock * maxNumVecsProjection, sizeof(dfloat));
o_alpha = elliptic.mesh->device.malloc(maxNumVecsProjection * sizeof(dfloat));
o_xbar = elliptic.mesh->device.malloc(Nlocal * sizeof(dfloat));
o_xx = elliptic.mesh->device.malloc(fieldOffset * maxNumVecsProjection * sizeof(dfloat));
o_bb = elliptic.mesh->device.malloc(fieldOffset * maxNumVecsProjection * sizeof(dfloat));
o_xbar = elliptic.mesh->device.malloc(Nfields * fieldOffset * sizeof(dfloat));
o_xx = elliptic.mesh->device.malloc(Nfields * fieldOffset * maxNumVecsProjection * sizeof(dfloat));
o_bb = elliptic.mesh->device.malloc(Nfields * fieldOffset * maxNumVecsProjection * sizeof(dfloat));

string install_dir;
install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
Expand All @@ -160,6 +172,7 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic,
properties["defines/p_blockSize"] = BLOCKSIZE;
properties["defines/dfloat"] = dfloatString;
properties["defines/dlong"] = dlongString;
properties["defines/p_Nfields"] = Nfields;

filename = oklpath + "ellipticResidualProjection.okl";
scalarMultiplyKernel = elliptic.mesh->device.buildKernel(filename.c_str(),
Expand Down Expand Up @@ -193,8 +206,7 @@ void ResidualProjection::pre(occa::memory& o_r)
if(timestep < numTimeSteps)
return;

const int m = numVecsProjection;
if(m <= 0) return;
if(numVecsProjection <= 0) return;
dfloat priorResidualNorm = 0.0;
if(verbose) priorResidualNorm =
sqrt(weightedNorm(o_r) * resNormFactor);
Expand Down Expand Up @@ -222,24 +234,23 @@ void ResidualProjection::post(occa::memory& o_x)

void ResidualProjection::multiWeightedInnerProduct(
occa::memory &o_a,
const dlong m,
occa::memory &o_b,
const dlong offset)
{
#ifdef ELLIPTIC_ENABLE_TIMER
timer::tic("dotp",1);
#endif
multiWeightedInnerProduct2Kernel(Nlocal, fieldOffset, Nblock, m, offset * fieldOffset, o_invDegree, o_a, o_b, o_wrk);
multiWeightedInnerProduct2Kernel(Nlocal, fieldOffset, Nblock, numVecsProjection, Nfields * offset * fieldOffset, o_invDegree, o_a, o_b, o_wrk);

o_wrk.copyTo(multiwork, sizeof(dfloat) * m * Nblock);
for(dlong k = 0; k < m; ++k) {
o_wrk.copyTo(multiwork, sizeof(dfloat) * numVecsProjection * Nblock);
for(dlong k = 0; k < numVecsProjection; ++k) {
dfloat accum = 0.0;
for(dlong n = 0; n < Nblock; ++n)
accum += multiwork[n + k * Nblock];
alpha[k] = accum;
}
MPI_Allreduce(MPI_IN_PLACE, alpha, m, MPI_DFLOAT, MPI_SUM, comm);
o_alpha.copyFrom(alpha,sizeof(dfloat) * m);
MPI_Allreduce(MPI_IN_PLACE, alpha, numVecsProjection, MPI_DFLOAT, MPI_SUM, comm);
o_alpha.copyFrom(alpha,sizeof(dfloat) * numVecsProjection);
#ifdef ELLIPTIC_ENABLE_TIMER
timer::toc("dotp");
#endif
Expand Down
3 changes: 2 additions & 1 deletion src/elliptic/ellipticResidualProjection.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ class ResidualProjection final
void matvec(occa::memory& o_Ax, const dlong Ax_offset, occa::memory& o_x, const dlong x_offset);
void multiWeightedInnerProduct(
occa::memory& o_a,
const dlong m,
occa::memory& o_b,
const dlong offset);
const dlong maxNumVecsProjection;
Expand Down Expand Up @@ -81,11 +80,13 @@ class ResidualProjection final
dlong numVecsProjection;
const dlong Nlocal; // vector size
const dlong fieldOffset; // offset
const dlong Nfields;
const dlong Nblock;
const dlong Nblock2;
const dfloat resNormFactor;
const int rank;
const int size;
const int blockSolver;
MPI_Comm comm;

std::function<void(occa::memory&,occa::memory&)> matvecOperator;
Expand Down
2 changes: 1 addition & 1 deletion src/elliptic/ellipticSolve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ int ellipticSolve(elliptic_t* elliptic,
ellipticUpdateJacobi(elliptic);

if(options.compareArgs("RESIDUAL PROJECTION","TRUE"))
elliptic->o_x0.copyFrom(o_x, elliptic->Ntotal * sizeof(dfloat));
elliptic->o_x0.copyFrom(o_x, elliptic->Nfields * elliptic->Ntotal * sizeof(dfloat));

// compute initial residual
ellipticOperator(elliptic, o_x, elliptic->o_Ap, dfloatString);
Expand Down