Skip to content

Commit

Permalink
Performance improvements for the residual projection scheme (#115) (#120
Browse files Browse the repository at this point in the history
)
  • Loading branch information
MalachiTimothyPhillips authored Jul 24, 2020
1 parent ef4a051 commit 1462cd1
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 13 deletions.
9 changes: 9 additions & 0 deletions src/libP/solvers/elliptic/ellipticResidualProjection.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ class ResidualProjection final
const dlong a_offset,
occa::memory& o_b,
const dlong b_offset);
void multiWeightedInnerProduct(occa::memory& o_w,
occa::memory& o_a,
const dlong m,
occa::memory& o_b,
const dlong offset);
elliptic_t& elliptic;
const dlong maxNumVecsProjection;
const dlong numTimeSteps;
Expand All @@ -72,14 +77,18 @@ class ResidualProjection final
occa::kernel scalarMultiplyKernel;
occa::kernel scaledAddwOffsetTwoVecKernel;
occa::kernel scaledAddwOffsetKernel;
occa::kernel multiScaledAddwOffsetKernel;
occa::kernel subtractedMultiScaledAddwOffsetKernel;
occa::kernel placeVectorKernel;
occa::kernel extractVectorKernel;
occa::kernel weightedInnerProduct2Kernel;
occa::kernel multiWeightedInnerProduct2Kernel;
occa::kernel innerProductKernel;
occa::kernel accumulateKernel;

std::vector<dfloat> alpha; // host shadow
std::vector<dfloat> work; // O(m) work array
std::vector<dfloat> multiwork; // O(Nblock*m) work array

dlong numVecsProjection;
dlong Ntotal; // vector size
Expand Down
82 changes: 81 additions & 1 deletion src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,36 @@
if(n < N)
x[n + destOffset * N] = alpha * x[n + sourceOffset * N] + beta * x[n + destOffset * N];
}
@kernel void multiScaledAddwOffset(const dlong N,
const dlong m,
@restrict const dfloat * alphas,
@restrict dfloat* x,
const dfloat beta,
const dlong destOffset)
{
for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner)){
if(n < N){
for(dlong k = 0; k < m-1; ++k){
x[n + destOffset * N] = -alphas[k] * x[n + k * N] + beta * x[n + destOffset * N];
}
}
}
}
@kernel void subtractedMultiScaledAddwOffset(const dlong N,
const dlong m,
@restrict const dfloat * alphas,
@restrict dfloat* x,
const dfloat beta,
const dlong k)
{
for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner)){
if(n < N){
for(dlong j = m-1; j >= k + 1; j--){
x[n + k * N] = -alphas[j] * x[n + j * N] + beta * x[n + k * N];
}
}
}
}
@kernel void scaledAddwOffsetTwoVec(const dlong N,
const dfloat alpha,
@restrict const dfloat* x,
Expand Down Expand Up @@ -143,7 +173,6 @@
}
}

// TODO: add offset
@kernel void weightedInnerProduct2(const dlong N,
@restrict const dfloat* w,
@restrict const dfloat* x,
Expand Down Expand Up @@ -184,4 +213,55 @@

for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 1) wxy[b] = s_wxy[0] + s_wxy[1];
}
}
@kernel void multiWeightedInnerProduct2(const dlong N,
const dlong Nblock,
const dlong m,
@restrict const dfloat* w,
@restrict const dfloat* x,
@restrict const dfloat* y,
const dlong offset,
@restrict dfloat* wxy)
{
for(dlong b = 0; b < (N + p_threadBlockSize - 1) / p_threadBlockSize; ++b; @outer(0)) {
@shared volatile dfloat s_wxy[p_threadBlockSize];
@exclusive dfloat w_loc;
@exclusive dfloat y_loc;

for(int v = 0 ; v < m; ++v){

for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) {
const dlong id = t + p_threadBlockSize * b;
if(v == 0 && id < N){
w_loc = w[id];
y_loc = y[id+N*offset];
}
s_wxy[t] = (id < N) ? w_loc * x[id + N * v] * y_loc : 0.f;
}

@barrier("local");
#if p_threadBlockSize > 512
for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 512) s_wxy[t] += s_wxy[t + 512];
@barrier("local");
#endif
#if p_threadBlockSize > 256
for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 256) s_wxy[t] += s_wxy[t + 256];
@barrier("local");
#endif

for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 128) s_wxy[t] += s_wxy[t + 128];
@barrier("local");

for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 64) s_wxy[t] += s_wxy[t + 64];
@barrier("local");

for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 32) s_wxy[t] += s_wxy[t + 32];
for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 16) s_wxy[t] += s_wxy[t + 16];
for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 8) s_wxy[t] += s_wxy[t + 8];
for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 4) s_wxy[t] += s_wxy[t + 4];
for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 2) s_wxy[t] += s_wxy[t + 2];

for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 1) wxy[b+v*Nblock] = s_wxy[0] + s_wxy[1];
}
}
}
62 changes: 50 additions & 12 deletions src/libP/solvers/elliptic/src/ellipticResidualProjection.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,9 @@ void ResidualProjection::reOrthogonalize()
+ computeInnerProduct(o_bb,j,o_xx,k));
}
gop(alpha.data() + k,work.data(),(m - k) + 1);
for(int j = m - 1; j >= k + 1; j--) {
scaledAddwOffsetKernel(Ntotal, -alpha[j], o_xx,j, one, k);
scaledAddwOffsetKernel(Ntotal, -alpha[j], o_bb,j, one, k);
}
o_alpha.copyFrom(alpha.data(),sizeof(dfloat)*((m-k)+1));
subtractedMultiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_xx, one, k);
subtractedMultiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_bb, one, k);
dfloat normp = sqrt(alpha[k]);
dfloat normk = 0.0;
if(useWeightedFormulation) {
Expand Down Expand Up @@ -114,20 +113,19 @@ void ResidualProjection::updateProjectionSpace()
dlong m = numVecsProjection;
if(m <= 0) return;
if(useWeightedFormulation) {
for(int k = 0; k < m; ++k)
alpha[k] = weightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx,k,o_bb,m - 1);
multiWeightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx, m, o_bb, m-1);
} else {
for(int k = 0; k < m; ++k)
alpha[k] = computeInnerProduct(o_xx,k,o_bb,m - 1);
}
gop(alpha.data(),work.data(),m);
o_alpha.copyFrom(alpha.data(),sizeof(dfloat)*m);
const dfloat norm_orig = alpha[m - 1];
dfloat norm_new = norm_orig;
const dfloat one = 1.0;
multiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_xx, one, m-1);
multiScaledAddwOffsetKernel(Ntotal, m, o_alpha, o_bb, one, m-1);
for(int k = 0; k < m - 1; ++k) {
const dfloat scale = -alpha[k];
scaledAddwOffsetKernel(Ntotal, scale, o_xx,k, one, m - 1);
scaledAddwOffsetKernel(Ntotal, scale, o_bb,k, one, m - 1);
norm_new = norm_new - alpha[k] * alpha[k];
}
norm_new = sqrt(norm_new);
Expand All @@ -153,8 +151,7 @@ void ResidualProjection::computePreProjection(occa::memory& o_r)
const int m = numVecsProjection;
if(m <= 0) return;
if(useWeightedFormulation) {
for(int k = 0; k < m; ++k)
alpha[k] = weightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_r,0,o_xx,k);
multiWeightedInnerProduct(elliptic.mesh->ogs->o_invDegree, o_xx,m,o_r,0);
} else {
for(int k = 0; k < m; ++k)
alpha[k] = computeInnerProduct(o_r,0,o_xx,k);
Expand Down Expand Up @@ -211,6 +208,7 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic,
verbose = elliptic.options.compareArgs("VERBOSE","TRUE");
alpha.resize(maxNumVecsProjection);
work.resize(maxNumVecsProjection);
multiwork.resize(Nblock*maxNumVecsProjection);
o_alpha = elliptic.mesh->device.malloc < dfloat > (maxNumVecsProjection);
o_xbar = elliptic.mesh->device.malloc < dfloat > (Ntotal);
o_xx = elliptic.mesh->device.malloc < dfloat > (Ntotal * maxNumVecsProjection);
Expand All @@ -222,7 +220,9 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic,
if ((r == 0 && elliptic.mesh->rank == 0) || (r == 1 && elliptic.mesh->rank > 0)) {
occa::properties properties;
properties += elliptic.mesh->device.properties();
properties["defines/p_threadBlockSize"] = 256;
properties["defines/p_threadBlockSize"] = blockSize;
properties["defines/p_blockSize"] = blockSize;
properties["defines/p_maxMultiVectors"] = maxNumVecsProjection;
properties["defines/dfloat"] = dfloatString;
properties["defines/dlong"] = dlongString;

Expand All @@ -238,10 +238,19 @@ ResidualProjection::ResidualProjection(elliptic_t& _elliptic,
scaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName,
"scaledAddwOffset",
properties);
multiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName,
"multiScaledAddwOffset",
properties);
subtractedMultiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName,
"subtractedMultiScaledAddwOffset",
properties);
placeVectorKernel = elliptic.mesh->device.buildKernel(fileName, "placeVector", properties);
weightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(fileName,
"weightedInnerProduct2",
properties);
multiWeightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(fileName,
"multiWeightedInnerProduct2",
properties);
innerProductKernel = elliptic.mesh->device.buildKernel(fileName, "innerProduct", properties);
accumulateKernel = elliptic.mesh->device.buildKernel(fileName, "accumulate", properties);
}
Expand Down Expand Up @@ -354,3 +363,32 @@ dfloat ResidualProjection::weightedInnerProduct(occa::memory &o_w,

return wab;
}
void ResidualProjection::multiWeightedInnerProduct(occa::memory &o_w,
occa::memory &o_a,
const dlong m,
occa::memory &o_b,
const dlong offset)
{
setupAide &options = elliptic.options;

const int continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS");
const int serial = options.compareArgs("THREAD MODEL", "SERIAL");
int enableReductions = 1;
options.getArgs("DEBUG ENABLE REDUCTIONS", enableReductions);

mesh_t* mesh = elliptic.mesh;

const dlong Nlocal = mesh->Np * mesh->Nelements;
const dlong Nblock = elliptic.Nblock;

multiWeightedInnerProduct2Kernel(Nlocal, Nblock, m, o_w, o_a, o_b, offset, elliptic.o_wrk);
elliptic.o_wrk.copyTo(multiwork.data(), sizeof(dfloat)*m*Nblock);
for(dlong k = 0 ; k < m; ++k){
dfloat accum = 0.0;
for(dlong n = 0; n < Nblock; ++n){
accum += multiwork[n+k*Nblock];
}
alpha[k] = accum;
}
o_alpha.copyFrom(alpha.data(),sizeof(double)*m);
}

0 comments on commit 1462cd1

Please sign in to comment.