diff --git a/3rd_party/gslib/ogs/ogs.hpp b/3rd_party/gslib/ogs/ogs.hpp index 0eac7fe3b..83ed72224 100644 --- a/3rd_party/gslib/ogs/ogs.hpp +++ b/3rd_party/gslib/ogs/ogs.hpp @@ -247,8 +247,9 @@ typedef struct { occa::memory o_scatterOffsets, o_gatherOffsets; occa::memory o_scatterIds, o_gatherIds; - occa::kernel packBufDoubleKernel, unpackBufDoubleKernel; - occa::kernel packBufFloatKernel, unpackBufFloatKernel; + occa::kernel packBufDoubleKernel, packBufFloatKernel; + occa::kernel unpackBufDoubleAddKernel, unpackBufDoubleMinKernel, unpackBufDoubleMaxKernel; + occa::kernel unpackBufFloatAddKernel; oogs_mode mode; diff --git a/3rd_party/gslib/ogs/okl/oogs.okl b/3rd_party/gslib/ogs/okl/oogs.okl index f3e5ac1dd..5eaa8e344 100644 --- a/3rd_party/gslib/ogs/okl/oogs.okl +++ b/3rd_party/gslib/ogs/okl/oogs.okl @@ -1,9 +1,9 @@ @kernel void packBuf_float(const dlong Nscatter, - const int Nentries, - @restrict const dlong * scatterStarts, - @restrict const dlong * scatterIds, - @restrict const float * q, - @restrict float * scatterq){ + const int Nentries, + @restrict const dlong * scatterStarts, + @restrict const dlong * scatterIds, + @restrict const float * q, + @restrict float * scatterq){ for(dlong s=0;s gq) ? q[id*Nentries+k] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} diff --git a/3rd_party/gslib/ogs/src/oogs.cpp b/3rd_party/gslib/ogs/src/oogs.cpp index 409f5850e..149ab8dee 100644 --- a/3rd_party/gslib/ogs/src/oogs.cpp +++ b/3rd_party/gslib/ogs/src/oogs.cpp @@ -1,8 +1,3 @@ -/* - * TODO: - * - Support other operations than just add - */ - #include #include #include "ogs.hpp" @@ -63,8 +58,8 @@ struct gs_data { static void convertPwMap(const uint *restrict map, int *restrict starts, - int *restrict ids){ - + int *restrict ids) +{ uint i,j; int n=0, s=0; while((i=*map++)!=UINT_MAX) { @@ -79,19 +74,19 @@ static void convertPwMap(const uint *restrict map, } } -static void pairwiseExchange(occa::memory o_halo, int unit_size, oogs_t *gs) +static void pairwiseExchange(int unit_size, oogs_t *gs) { ogs_t *ogs = gs->ogs; struct gs_data *hgs = (gs_data*) ogs->haloGshSym; const void* execdata = hgs->r.data; const struct pw_data *pwd = (pw_data*) execdata; const struct comm *comm = &hgs->comm; - const unsigned Nhalo = ogs->NhaloGather; // hardwired for now const unsigned transpose = 0; const unsigned recv = 0^transpose, send = 1^transpose; + { // prepost recv comm_req *req = pwd->req; const struct pw_comm_data *c = &pwd->comm[recv]; @@ -141,19 +136,20 @@ oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::f const unsigned recv = 0^transpose, send = 1^transpose; const void* execdata = hgs->r.data; const struct pw_data *pwd = (pw_data*) execdata; - const unsigned Nhalo = ogs->NhaloGather; - const unsigned unit_size = nVec*sizeof(double); // hardwire just need to be big enough + const unsigned unit_size = nVec*sizeof(double); // just need to be big enough to run callcack const struct comm *comm = &hgs->comm; const int rank = comm->id; - if(Nhalo == 0) return gs; + if(ogs->NhaloGather == 0) return gs; for (int r=0;r<2;r++) { if ((r==0 && rank==0) || (r==1 && rank>0)) { gs->packBufDoubleKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_double", ogs::kernelInfo); - gs->unpackBufDoubleKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_double", ogs::kernelInfo); gs->packBufFloatKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_float", ogs::kernelInfo); - gs->unpackBufFloatKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_float", ogs::kernelInfo); + gs->unpackBufDoubleAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_doubleAdd", ogs::kernelInfo); + gs->unpackBufFloatAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_floatAdd", ogs::kernelInfo); + gs->unpackBufDoubleMinKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_doubleMin", ogs::kernelInfo); + gs->unpackBufDoubleMaxKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_doubleMax", ogs::kernelInfo); } MPI_Barrier(comm->c); } @@ -163,24 +159,24 @@ oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::f gs->h_buffSend = ogs->device.malloc(pwd->comm[send].total*unit_size, props); gs->bufSend = (unsigned char*)gs->h_buffSend.ptr(props); - int *scatterOffsets = (int*) calloc((Nhalo+1),sizeof(int)); + int *scatterOffsets = (int*) calloc(ogs->NhaloGather+1,sizeof(int)); int *scatterIds = (int*) calloc(pwd->comm[send].total,sizeof(int)); convertPwMap(pwd->map[send], scatterOffsets, scatterIds); gs->o_bufSend = ogs->device.malloc(pwd->comm[send].total*unit_size); - gs->o_scatterOffsets = ogs->device.malloc((Nhalo+1)*sizeof(int), scatterOffsets); + gs->o_scatterOffsets = ogs->device.malloc((ogs->NhaloGather+1)*sizeof(int), scatterOffsets); gs->o_scatterIds = ogs->device.malloc(pwd->comm[send].total*sizeof(int), scatterIds); free(scatterOffsets); free(scatterIds); gs->h_buffRecv = ogs->device.malloc(pwd->comm[recv].total*unit_size, props); gs->bufRecv = (unsigned char*)gs->h_buffRecv.ptr(props); - int* gatherOffsets = (int*) calloc((Nhalo+1),sizeof(int)); + int* gatherOffsets = (int*) calloc(ogs->NhaloGather+1,sizeof(int)); int *gatherIds = (int*) calloc(pwd->comm[recv].total,sizeof(int)); convertPwMap(pwd->map[recv], gatherOffsets, gatherIds); gs->o_bufRecv = ogs->device.malloc(pwd->comm[recv].total*unit_size); - gs->o_gatherOffsets = ogs->device.malloc((Nhalo+1)*sizeof(int), gatherOffsets); + gs->o_gatherOffsets = ogs->device.malloc((ogs->NhaloGather+1)*sizeof(int), gatherOffsets); gs->o_gatherIds = ogs->device.malloc(pwd->comm[recv].total*sizeof(int), gatherIds); free(gatherOffsets); free(gatherIds); @@ -244,34 +240,101 @@ oogs_t* oogs::setup(dlong N, hlong *ids, int nVec, dlong stride, const char *typ return setup(ogs, nVec, stride, type, callback, gsMode); } +static void packBuf(oogs_t *gs, + const dlong Ngather, + const int k, + occa::memory o_starts, + occa::memory o_ids, + const char* type, + occa::memory o_v, + occa::memory o_gv) +{ + if ((!strcmp(type, "float"))) { + gs->packBufFloatKernel(Ngather, k, o_starts, o_ids, o_v, o_gv); + } else if ((!strcmp(type, "double"))) { + gs->packBufDoubleKernel(Ngather, k, o_starts, o_ids, o_v, o_gv); + } else { + printf("oogs: unsupported datatype %s!\n", type); + exit(1); + } +} + +static void unpackBuf(oogs_t *gs, + const dlong Ngather, + const int k, + occa::memory o_starts, + occa::memory o_ids, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_gv) +{ + if ((!strcmp(type, "float"))&&(!strcmp(op, "add"))) { + gs->unpackBufFloatAddKernel(Ngather, k, o_starts, o_ids, o_v, o_gv); + } else if ((!strcmp(type, "double"))&&(!strcmp(op, "add"))) { + gs->unpackBufDoubleAddKernel(Ngather, k, o_starts, o_ids, o_v, o_gv); + } else if ((!strcmp(type, "double"))&&(!strcmp(op, "min"))) { + gs->unpackBufDoubleMinKernel(Ngather, k, o_starts, o_ids, o_v, o_gv); + } else if ((!strcmp(type, "double"))&&(!strcmp(op, "max"))) { + gs->unpackBufDoubleMaxKernel(Ngather, k, o_starts, o_ids, o_v, o_gv); + } else { + printf("oogs: unsupported datatype %s and operatior %s!\n", type, op); + exit(1); + } +} + +void reallocBuffers(int unit_size, oogs_t *gs) +{ + ogs_t *ogs = gs->ogs; + const unsigned transpose = 0; + struct gs_data *hgs = (gs_data*) ogs->haloGshSym; + const unsigned recv = 0^transpose, send = 1^transpose; + const void* execdata = hgs->r.data; + const struct pw_data *pwd = (pw_data*) execdata; + + if (ogs::o_haloBuf.size() < ogs->NhaloGather*unit_size) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::haloBuf = ogsHostMallocPinned(ogs->device, ogs->NhaloGather*unit_size, NULL, ogs::o_haloBuf, ogs::h_haloBuf); + } + if (gs->o_bufSend.size() < pwd->comm[send].total*unit_size) { + occa::properties props; + props["mapped"] = true; + if(gs->o_bufSend.size()) gs->o_bufSend.free(); + gs->o_bufSend = ogs->device.malloc(pwd->comm[send].total*unit_size); + if(gs->h_buffSend.size()) gs->h_buffSend.free(); + gs->h_buffSend = ogs->device.malloc(pwd->comm[send].total*unit_size, props); + gs->bufSend = (unsigned char*)gs->h_buffSend.ptr(props); + } + if (gs->o_bufRecv.size() < pwd->comm[recv].total*unit_size) { + occa::properties props; + props["mapped"] = true; + if(gs->o_bufRecv.size()) gs->o_bufRecv.free(); + gs->o_bufRecv = ogs->device.malloc(pwd->comm[recv].total*unit_size); + if(gs->h_buffRecv.size()) gs->h_buffRecv.free(); + gs->h_buffRecv = ogs->device.malloc(pwd->comm[recv].total*unit_size, props); + gs->bufRecv = (unsigned char*)gs->h_buffRecv.ptr(props); + } +} + void oogs::start(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *gs) { size_t Nbytes; - occa::kernel packBuf; - if (!strcmp(type, "float")) { + if (!strcmp(type, "float")) Nbytes = sizeof(float); - packBuf = gs->packBufFloatKernel; - } else if (!strcmp(type, "double")) { + else if (!strcmp(type, "double")) Nbytes = sizeof(double); - packBuf = gs->packBufDoubleKernel; - } else if (!strcmp(type, "int")) { + else if (!strcmp(type, "int")) Nbytes = sizeof(int); - } else if (!strcmp(type, "long long int")) { + else if (!strcmp(type, "long long int")) Nbytes = sizeof(long long int); - } ogs_t *ogs = gs->ogs; if (ogs->NhaloGather) { - if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { - if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); - ogs::haloBuf = ogsHostMallocPinned(ogs->device, ogs->NhaloGather*Nbytes*k, NULL, ogs::o_haloBuf, ogs::h_haloBuf); - } - } + reallocBuffers(Nbytes*k, gs); - if (ogs->NhaloGather) { occaGatherMany(ogs->NhaloGather, k, stride, ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); - if(gs->mode != OOGS_DEFAULT) packBuf(ogs->NhaloGather, k, gs->o_scatterOffsets, gs->o_scatterIds, ogs::o_haloBuf, gs->o_bufSend); + if(gs->mode != OOGS_DEFAULT) packBuf(gs, ogs->NhaloGather, k, gs->o_scatterOffsets, gs->o_scatterIds, type, ogs::o_haloBuf, gs->o_bufSend); ogs->device.finish(); if(gs->mode == OOGS_DEFAULT) { @@ -285,22 +348,10 @@ void oogs::start(occa::memory o_v, const int k, const dlong stride, const char * void oogs::finish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *gs) { size_t Nbytes; - occa::kernel unpackBuf; - if (!strcmp(type, "float")) { + if (!strcmp(type, "float")) Nbytes = sizeof(float); - unpackBuf = gs->unpackBufFloatKernel; - } else if (!strcmp(type, "double")) { + else if (!strcmp(type, "double")) Nbytes = sizeof(double); - unpackBuf = gs->unpackBufDoubleKernel; - } else { - printf("oogs: unsupported datatype %s!\n", type); - exit(1); - } - - if (strcmp(op, "add")) { - printf("oogs: unsupported operation %s!\n", op); - exit(1); - } ogs_t *ogs = gs->ogs; @@ -321,7 +372,7 @@ void oogs::finish(occa::memory o_v, const int k, const dlong stride, const char for (int i=0;iNhaloGather*Nbytes; ogsHostGatherScatterMany(H, k, type, op, ogs->haloGshSym); } else { - pairwiseExchange(ogs::o_haloBuf, Nbytes*k, gs); + pairwiseExchange(Nbytes*k, gs); } #ifdef OGS_ENABLE_TIMER timer::toc("gsMPI"); @@ -330,7 +381,7 @@ void oogs::finish(occa::memory o_v, const int k, const dlong stride, const char if(gs->mode == OOGS_DEFAULT) { ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); } else { - unpackBuf(ogs->NhaloGather, k, gs->o_gatherOffsets, gs->o_gatherIds, gs->o_bufRecv, ogs::o_haloBuf); + unpackBuf(gs, ogs->NhaloGather, k, gs->o_gatherOffsets, gs->o_gatherIds, type, op, gs->o_bufRecv, ogs::o_haloBuf); } ogs->device.finish(); @@ -367,9 +418,13 @@ void oogs::destroy(oogs_t *gs) gs->o_bufSend.free(); gs->packBufDoubleKernel.free(); - gs->unpackBufDoubleKernel.free(); gs->packBufFloatKernel.free(); - gs->unpackBufFloatKernel.free(); + + gs->unpackBufDoubleAddKernel.free(); + gs->unpackBufFloatAddKernel.free(); + + gs->unpackBufDoubleMinKernel.free(); + gs->unpackBufDoubleMaxKernel.free(); free(gs); } diff --git a/okl/core/cdsHelmholtzBCHex3D.okl b/okl/core/cdsHelmholtzBCHex3D.okl index 7d9b7d80f..b3f486eda 100644 --- a/okl/core/cdsHelmholtzBCHex3D.okl +++ b/okl/core/cdsHelmholtzBCHex3D.okl @@ -25,33 +25,33 @@ */ #define surfaceTerms(sk,face,i, j) \ - { \ + { \ struct bcData bc; \ bc.idM = vmapM[sk]; \ bc.time = time; \ - bc.id = EToBM[face + p_Nfaces * e]; \ - bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; \ - bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; \ - bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; \ + bc.id = EToBM[face + p_Nfaces * e]; \ + bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; \ + bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; \ + bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; \ bc.x = x[bc.idM]; \ bc.y = y[bc.idM]; \ bc.z = z[bc.idM]; \ bc.fieldOffset = offset; \ bc.wrk = W; \ \ - const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID]; \ + const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID]; \ const dlong bcType = mapB[bc.idM]; \ - bc.sF = 0.f; bc.sP = 0.f; \ + bc.sF = 0.f; bc.sP = 0.f; \ bc.scalarId = scalarId; \ - if(bcType == 1) { \ - cdsDirichletConditions3D(&bc); \ + if(bcType == 1) { \ + bc.sP = U[bc.idM]; \ } \ - if(bcType == 3) { \ + if(bcType == 3) { \ cdsNeumannConditions3D(&bc); \ } \ \ - s_U [j][i] = bc.sP; \ - s_ndU[j][i] = -WsJ * (bc.sF); \ + s_U [j][i] = bc.sP; \ + s_ndU[j][i] = -WsJ * (bc.sF); \ } //RHS contributions for continuous solver @@ -70,6 +70,7 @@ @restrict const dfloat* x, @restrict const dfloat* y, @restrict const dfloat* z, + @restrict const dfloat* U, @restrict const int* mapB, @restrict const dfloat* lambda, @restrict const dfloat* W, @@ -335,19 +336,19 @@ } } -@kernel void cdsHelmholtzAddBCHex3D(const dlong Nelements, - const dlong offset, - const dlong scalarId, - const dfloat time, - @restrict const dfloat* sgeo, - @restrict const dfloat* x, - @restrict const dfloat* y, - @restrict const dfloat* z, - @restrict const dlong* vmapM, - @restrict const int* EToBM, - @restrict const int* EToB, - @restrict const dfloat* W, - @restrict dfloat* S) +@kernel void cdsDirichletBC(const dlong Nelements, + const dlong offset, + const dlong scalarId, + const dfloat time, + @restrict const dfloat* sgeo, + @restrict const dfloat* x, + @restrict const dfloat* y, + @restrict const dfloat* z, + @restrict const dlong* vmapM, + @restrict const int* EToBM, + @restrict const int* EToB, + @restrict const dfloat* W, + @restrict dfloat* S) { for(dlong e = 0; e < Nelements; e++; @outer(0)) for(int f = 0; f < p_Nfaces; f++) { diff --git a/okl/core/cdsSumMakefHex3D.okl b/okl/core/cdsSumMakefHex3D.okl index 15042eb76..cdc1e0986 100644 --- a/okl/core/cdsSumMakefHex3D.okl +++ b/okl/core/cdsSumMakefHex3D.okl @@ -26,11 +26,9 @@ @kernel void cdsSumMakefHex3D(const dlong Nelements, @restrict const dfloat* vgeo, - @restrict const dfloat* MM, const dfloat idt, @restrict const dfloat* extbdfA, @restrict const dfloat* extbdfB, - @restrict const dfloat* extbdfC, const dlong sOffset, const dlong offset, @restrict const dfloat* S, @@ -51,23 +49,23 @@ const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; const dfloat JW = vgeo[gid + p_JWID * p_Np]; const dfloat rhoM = RHO[id + offset]; - dfloat sum = 0; + dfloat sum1 = 0; if (p_SUBCYCLING) { const dfloat NSm = NS[id]; - sum += rhoM * idt * NSm; + sum1 += NSm; }else { for (int s = 0; s < p_Nstages; s++) { const dfloat Sm = S[id + offset + s * sOffset]; - sum += rhoM * idt * extbdfB[s] * Sm; + sum1 += extbdfB[s] * Sm; } } - // already multiplied by rho + dfloat sum2 = 0; for (int s = 0; s < p_Nstages; s++) { const dfloat FSm = FS[id + offset + s * sOffset]; - sum += extbdfA[s] * FSm; + sum2 += extbdfA[s] * FSm; // already multiplied by rho } - BF[id + offset] = JW * sum; + BF[id + offset] = JW*(sum2 + rhoM*idt*sum1); } } } diff --git a/okl/core/insDivergenceHex3D.okl b/okl/core/insDivergenceHex3D.okl index a039b1c89..db7f6c4c1 100644 --- a/okl/core/insDivergenceHex3D.okl +++ b/okl/core/insDivergenceHex3D.okl @@ -120,42 +120,34 @@ // Adding slip BC to Pressure Rhs i.e. dp/dn = 0 =>dp/dn -n \dot F = -n \dot F for slip // or dp/dn -n\dot F = -g0/dt n \dot u^(n+1) for other BC's -#define surfaceTermsTOMBO(sk,face,m, i, j) \ - { \ - struct bcData bc; \ - const dlong idM = vmapM[sk]; \ - \ - bc.time = time; \ - bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; \ - bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; \ - bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; \ - bc.x = x[idM]; \ - bc.y = y[idM]; \ - bc.z = z[idM]; \ - bc.idM = idM; \ - bc.fieldOffset = offset; \ - const dfloat WSJ = sgeo[sk * p_Nsgeo + p_WSJID]; \ - \ - bc.uM = U[idM + 0 * offset]; \ - bc.vM = U[idM + 1 * offset]; \ - bc.wM = U[idM + 2 * offset]; \ - bc.uP = 0.f; bc.vP = 0.f; bc.wP = 0.f; \ - bc.wrk = W; \ - bc.id = EToBM[face + p_Nfaces * e]; \ - const dlong bcType = EToB[face + p_Nfaces * e]; \ - if(bcType > 0) { \ - if(bcType == 1) { \ - bc.uP = 0.f; bc.vP = 0.f; bc.wP = 0.f; \ - }else if(bcType == 2) { \ - insVelocityDirichletConditions3D(&bc); \ - }else if(bcType == 4 || bcType == 5 || bcType == 6) { \ - bc.uP = bc.uM; bc.vP = bc.vM; bc.wP = bc.wM; \ - } \ - } \ - dfloat scale = coef; \ - if(bcType == 4 || bcType == 5 || bcType == 6 ) { \ - scale = -1.f; \ - } \ +#define surfaceTermsTOMBO(sk,face,m, i, j) \ + { \ + struct bcData bc; \ + const dlong idM = vmapM[sk]; \ + \ + bc.time = time; \ + bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; \ + bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; \ + bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; \ + bc.x = x[idM]; \ + bc.y = y[idM]; \ + bc.z = z[idM]; \ + bc.idM = idM; \ + bc.fieldOffset = offset; \ + const dfloat WSJ = sgeo[sk * p_Nsgeo + p_WSJID]; \ + \ + bc.uM = U[idM + 0 * offset]; \ + bc.vM = U[idM + 1 * offset]; \ + bc.wM = U[idM + 2 * offset]; \ + bc.uP = 0.f; bc.vP = 0.f; bc.wP = 0.f; \ + bc.wrk = W; \ + bc.id = EToBM[face + p_Nfaces * e]; \ + const dlong bcType = EToB[face + p_Nfaces * e]; \ + if(bcType > 0 && bcType != 3) { \ + bc.uP = bc.uM; bc.vP = bc.vM; bc.wP = bc.wM; \ + } \ + dfloat scale = coef; \ + if(bcType == 4 || bcType == 5 || bcType == 6 ) scale = -1.f; \ s_fluxDiv[m][j][i] = scale * WSJ * (bc.nx * bc.uP + bc.ny * bc.vP + bc.nz * bc.wP); \ } diff --git a/okl/core/insPressureBCHex3D.okl b/okl/core/insPressureBCHex3D.okl index c9444ff66..01ee05363 100644 --- a/okl/core/insPressureBCHex3D.okl +++ b/okl/core/insPressureBCHex3D.okl @@ -25,21 +25,21 @@ */ // We are solving for Pressure Difference -@kernel void insPressureAddBCTOMBOHex3D(const dlong Nelements, - const dfloat time, - const dfloat dt, - const dlong offset, - @restrict const dfloat* sgeo, - @restrict const dfloat* x, - @restrict const dfloat* y, - @restrict const dfloat* z, - @restrict const dlong* vmapM, - @restrict const int* EToBM, - @restrict const int* EToB, - @restrict const dfloat* W, - @restrict const dfloat* U, - @restrict const dfloat* P, - @restrict dfloat* PI) +@kernel void insPressureDirichletBCHex3D(const dlong Nelements, + const dfloat time, + const dfloat dt, + const dlong offset, + @restrict const dfloat* sgeo, + @restrict const dfloat* x, + @restrict const dfloat* y, + @restrict const dfloat* z, + @restrict const dlong* vmapM, + @restrict const int* EToBM, + @restrict const int* EToB, + @restrict const dfloat* W, + @restrict const dfloat* U, + @restrict const dfloat* P, + @restrict dfloat* PI) { for(dlong e = 0; e < Nelements; e++; @outer(0)) for(int n = 0; n < p_Nfp * p_Nfaces; ++n; @inner(0)) { diff --git a/okl/core/insPressureUpdate.okl b/okl/core/insPressureUpdate.okl index 1a4baa0fc..2f7269a1e 100644 --- a/okl/core/insPressureUpdate.okl +++ b/okl/core/insPressureUpdate.okl @@ -24,12 +24,11 @@ */ -@kernel void insPressureUpdateTOMBO(const dlong Nelements, - const dlong offset, - @restrict const int* mapB, - @restrict const dfloat* PI, - @restrict const dfloat* P, - @restrict dfloat* Pnew) +@kernel void insPressureUpdate(const dlong Nelements, + @restrict const int* mapB, + @restrict const dfloat* PI, + @restrict const dfloat* P, + @restrict dfloat* Pnew) { for(dlong e = 0; e < Nelements; ++e; @outer(0)) for(int n = 0; n < p_Np; ++n; @inner(0)) { @@ -37,12 +36,7 @@ const int bcType = mapB[id]; const dfloat princ = PI[id]; dfloat pr = P[id]; - - if(bcType == 1) // Dirichlet already updated - pr = princ; - else - pr += princ; - + if(bcType != 1) pr += princ; // Dirichlet already updated Pnew[id] = pr; } } diff --git a/okl/core/insVelocityBCHex3D.okl b/okl/core/insVelocityBCHex3D.okl index 4aaa9862f..123d3a87f 100644 --- a/okl/core/insVelocityBCHex3D.okl +++ b/okl/core/insVelocityBCHex3D.okl @@ -428,19 +428,19 @@ } } -@kernel void insVelocityAddBCHex3D(const dlong Nelements, - const dlong offset, - const dfloat time, - @restrict const dfloat* sgeo, - @restrict const dfloat* x, - @restrict const dfloat* y, - @restrict const dfloat* z, - @restrict const dlong* vmapM, - @restrict const int* EToBM, - @restrict const int* EToB, - @restrict const dfloat* W, - @restrict const dfloat* U, - @restrict dfloat* UH) +@kernel void insVelocityDirichletBCHex3D(const dlong Nelements, + const dlong offset, + const dfloat time, + @restrict const dfloat* sgeo, + @restrict const dfloat* x, + @restrict const dfloat* y, + @restrict const dfloat* z, + @restrict const dlong* vmapM, + @restrict const int* EToBM, + @restrict const int* EToB, + @restrict const dfloat* W, + @restrict const dfloat* U, + @restrict dfloat* UH) { for(dlong e = 0; e < Nelements; e++; @outer(0)) for(int f = 0; f < p_Nfaces; f++) { diff --git a/okl/core/insVelocityExt.okl b/okl/core/insVelocityExt.okl index 597a1a1f5..add64c2a7 100644 --- a/okl/core/insVelocityExt.okl +++ b/okl/core/insVelocityExt.okl @@ -31,7 +31,6 @@ @restrict const dfloat* U, @restrict dfloat* Ue) { - // Low storage Runge Kutta time step update for(dlong e = 0; e < Nelements; ++e; @outer(0)) { for(int n = 0; n < p_Np; ++n; @inner(0)) { const dlong id = n + p_Np * e; diff --git a/okl/core/insVelocityRhsHex3D.okl b/okl/core/insVelocityRhsHex3D.okl index f475c56f6..55ac8e84c 100644 --- a/okl/core/insVelocityRhsHex3D.okl +++ b/okl/core/insVelocityRhsHex3D.okl @@ -31,7 +31,6 @@ @restrict const dfloat* extbdfA, @restrict const dfloat* extbdfB, const dlong fieldOffset, - @restrict const dfloat* U, @restrict const dfloat* BF, @restrict const dfloat* FU, @restrict const dfloat* GP, diff --git a/okl/core/math.okl b/okl/core/math.okl index 00a18f533..ac92d3561 100644 --- a/okl/core/math.okl +++ b/okl/core/math.okl @@ -56,3 +56,17 @@ if(n < N) Y[n] = a + b * X[n]; } + +@kernel void maskCopy(const dlong Nmasked, + const dlong offset, + @restrict const dlong* maskIds, + @restrict const dfloat* in, + @restrict dfloat* out){ + + for(dlong n=0;n #include "nrs.hpp" occa::memory cdsSolve(const int is, cds_t* cds, dfloat time) @@ -13,9 +14,34 @@ occa::memory cdsSolve(const int is, cds_t* cds, dfloat time) } elliptic_t* solver = cds->solver[is]; - cds->o_wrk1.copyFrom(cds->o_BF, cds->Ntotal * sizeof(dfloat), 0, - is * cds->fieldOffset * sizeof(dfloat)); + //copy current solution fields as initial guess + cds->o_wrk0.copyFrom(cds->o_S, cds->Ntotal * sizeof(dfloat), 0, is * cds->fieldOffset * sizeof(dfloat)); + + //enforce Dirichlet BCs + cds->setScalarKernel(cds->fieldOffset, std::numeric_limits::min(), cds->o_wrk2); + for (int sweep = 0; sweep < 2; sweep++) { + cds->dirichletBCKernel(mesh->Nelements, + cds->fieldOffset, + is, + time, + mesh->o_sgeo, + mesh->o_x, + mesh->o_y, + mesh->o_z, + mesh->o_vmapM, + mesh->o_EToB, + cds->o_EToB[is], + *(cds->o_usrwrk), + cds->o_wrk2); + //take care of Neumann-Dirichlet shared edges across elements + if(sweep == 0) oogs::startFinish(cds->o_wrk2, 1, cds->fieldOffset, ogsDfloat, ogsMax, gsh); + if(sweep == 1) oogs::startFinish(cds->o_wrk2, 1, cds->fieldOffset, ogsDfloat, ogsMin, gsh); + } + if (solver->Nmasked) cds->maskCopyKernel(solver->Nmasked, 0, solver->o_maskIds, cds->o_wrk2, cds->o_wrk0); + + //build RHS + cds->o_wrk1.copyFrom(cds->o_BF, cds->Ntotal * sizeof(dfloat), 0, is * cds->fieldOffset * sizeof(dfloat)); cds->helmholtzRhsBCKernel(mesh->Nelements, mesh->o_ggeo, mesh->o_sgeo, @@ -31,33 +57,17 @@ occa::memory cdsSolve(const int is, cds_t* cds, dfloat time) mesh->o_x, mesh->o_y, mesh->o_z, + cds->o_wrk0, cds->o_mapB[is], cds->o_ellipticCoeff, *(cds->o_usrwrk), cds->o_wrk1); - oogs::startFinish(cds->o_wrk1, 1, cds->fieldOffset, ogsDfloat, ogsAdd, gsh); if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, cds->o_wrk1); - //copy current solution fields as initial guess - cds->o_wrk0.copyFrom(cds->o_S, cds->Ntotal * sizeof(dfloat), 0, - is * cds->fieldOffset * sizeof(dfloat)); if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, cds->o_wrk0); - cds->Niter[is] = ellipticSolve(solver, cds->TOL, cds->o_wrk1, cds->o_wrk0); + if (solver->Nmasked) cds->maskCopyKernel(solver->Nmasked, 0, solver->o_maskIds, cds->o_wrk2, cds->o_wrk0); - cds->helmholtzAddBCKernel(mesh->Nelements, - cds->fieldOffset, - is, - time, - mesh->o_sgeo, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_vmapM, - mesh->o_EToB, - cds->o_EToB[is], - *(cds->o_usrwrk), - cds->o_wrk0); return cds->o_wrk0; } diff --git a/src/core/cds.h b/src/core/cds.h index c225b12f3..2e2b9b67c 100644 --- a/src/core/cds.h +++ b/src/core/cds.h @@ -164,12 +164,14 @@ typedef struct occa::kernel helmholtzRhsIpdgBCKernel; occa::kernel helmholtzRhsBCKernel; - occa::kernel helmholtzAddBCKernel; + occa::kernel dirichletBCKernel; occa::kernel setEllipticCoeffKernel; occa::kernel invMassMatrixKernel; occa::kernel massMatrixKernel; + occa::kernel maskCopyKernel; + occa::properties* kernelInfo; }cds_t; diff --git a/src/core/ins.h b/src/core/ins.h index 02bcf93da..5b6ccdb27 100644 --- a/src/core/ins.h +++ b/src/core/ins.h @@ -98,7 +98,7 @@ typedef struct occa::memory o_pRecvBuffer,h_pRecvBuffer; occa::memory o_gatherTmpPinned, h_gatherTmpPinned; - occa::memory o_wrk0, o_wrk1, o_wrk2, o_wrk3, o_wrk4, o_wrk5, o_wrk6, + occa::memory o_wrk0, o_wrk1, o_wrk2, o_wrk3, o_wrk4, o_wrk5, o_wrk6, o_wrk7, o_wrk9, o_wrk12, o_wrk15; int Nsubsteps; @@ -195,13 +195,13 @@ typedef struct occa::kernel divergenceStrongVolumeKernel; occa::kernel sumMakefKernel; occa::kernel pressureRhsKernel; - occa::kernel pressureAddBCKernel; + occa::kernel pressureDirichletBCKernel; occa::kernel pressurePenaltyKernel; occa::kernel pressureUpdateKernel; occa::kernel velocityRhsKernel; occa::kernel velocityRhsBCKernel; - occa::kernel velocityAddBCKernel; + occa::kernel velocityDirichletBCKernel; occa::kernel setScalarKernel; @@ -217,6 +217,8 @@ typedef struct occa::kernel invMassMatrixKernel; occa::kernel massMatrixKernel; + occa::kernel maskCopyKernel; + int* EToB; occa::memory o_EToB; diff --git a/src/core/insSetup.cpp b/src/core/insSetup.cpp index 7fe675782..ab5b5884c 100644 --- a/src/core/insSetup.cpp +++ b/src/core/insSetup.cpp @@ -106,7 +106,8 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil ins->finalTime += ins->startTime; } - ins->NtimeSteps = ceil((ins->finalTime - ins->startTime) / ins->dt); + ins->NtimeSteps = (ins->finalTime - ins->startTime) / ins->dt; + if((ins->finalTime - ins->NtimeSteps*ins->dt) > ins->dt) ins->NtimeSteps++; options.setArgs("NUMBER TIMESTEPS", std::to_string(ins->NtimeSteps)); if(ins->Nsubsteps) ins->sdt = ins->dt / ins->Nsubsteps; @@ -142,7 +143,7 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil ins->PI = (dfloat*) calloc(ins->fieldOffset,sizeof(dfloat)); ins->BF = (dfloat*) calloc(ins->NVfields * ins->fieldOffset,sizeof(dfloat)); - ins->FU = (dfloat*) calloc(ins->NVfields * (ins->Nstages + 1) * ins->fieldOffset,sizeof(dfloat)); + ins->FU = (dfloat*) calloc(ins->NVfields * ins->Nstages * ins->fieldOffset,sizeof(dfloat)); if(ins->Nsubsteps) { int Sorder; @@ -181,6 +182,7 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil ins->o_wrk4 = o_scratch.slice( 4 * ins->fieldOffset * sizeof(dfloat)); ins->o_wrk5 = o_scratch.slice( 5 * ins->fieldOffset * sizeof(dfloat)); ins->o_wrk6 = o_scratch.slice( 6 * ins->fieldOffset * sizeof(dfloat)); + ins->o_wrk7 = o_scratch.slice( 7 * ins->fieldOffset * sizeof(dfloat)); ins->o_wrk9 = o_scratch.slice( 9 * ins->fieldOffset * sizeof(dfloat)); ins->o_wrk12 = o_scratch.slice(12 * ins->fieldOffset * sizeof(dfloat)); ins->o_wrk15 = o_scratch.slice(15 * ins->fieldOffset * sizeof(dfloat)); @@ -196,7 +198,7 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil ins->o_PI = mesh->device.malloc(ins->fieldOffset * sizeof(dfloat), ins->PI); ins->o_FU = - mesh->device.malloc(ins->NVfields * (ins->Nstages + 1) * ins->fieldOffset * sizeof(dfloat), + mesh->device.malloc(ins->NVfields * ins->Nstages * ins->fieldOffset * sizeof(dfloat), ins->FU); ins->o_BF = mesh->device.malloc(ins->NVfields * ins->fieldOffset * sizeof(dfloat), ins->BF); @@ -585,8 +587,7 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil // build inverse mass matrix for(hlong e = 0; e < mesh->Nelements; ++e) for(int n = 0; n < mesh->Np; ++n) - lumpedMassMatrix[e * mesh->Np + - n] = mesh->vgeo[e * mesh->Np * mesh->Nvgeo + JWID * mesh->Np + n]; + lumpedMassMatrix[e * mesh->Np + n] = mesh->vgeo[e * mesh->Np * mesh->Nvgeo + JWID * mesh->Np + n]; ogsGatherScatter(lumpedMassMatrix, ogsDfloat, ogsAdd, mesh->ogs); for(int n = 0; n < mesh->Np * mesh->Nelements; ++n) lumpedMassMatrix[n] = 1. / lumpedMassMatrix[n]; @@ -656,12 +657,12 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo); fileName = oklpath + "insPressureBC" + suffix + ".okl"; - kernelName = "insPressureAddBCTOMBO" + suffix; - ins->pressureAddBCKernel = + kernelName = "insPressureDirichletBC" + suffix; + ins->pressureDirichletBCKernel = mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC); fileName = oklpath + "insPressureUpdate" + ".okl"; - kernelName = "insPressureUpdateTOMBO"; + kernelName = "insPressureUpdate"; ins->pressureUpdateKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo); fileName = oklpath + "insVelocityRhs" + suffix + ".okl"; @@ -674,8 +675,8 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil ins->velocityRhsBCKernel = mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC); - kernelName = "insVelocityAddBC" + suffix; - ins->velocityAddBCKernel = + kernelName = "insVelocityDirichletBC" + suffix; + ins->velocityDirichletBCKernel = mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC); fileName = oklpath + "insSubCycle" + suffix + ".okl"; @@ -724,6 +725,10 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil ins->scalarScaledAddKernel = mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo); + kernelName = "maskCopy"; + ins->maskCopyKernel = + mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo); + // =========================================================================== fileName = oklpath + "insFilterRT" + suffix + ".okl"; @@ -865,10 +870,10 @@ cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &k // Solution storage at interpolation nodes cds->U = ins->U; // Point to INS side Velocity cds->S = - (dfloat*) calloc(cds->NSfields * (cds->Nstages + 0) * cds->fieldOffset,sizeof(dfloat)); + (dfloat*) calloc(cds->NSfields * cds->Nstages * cds->fieldOffset,sizeof(dfloat)); cds->BF = (dfloat*) calloc(cds->NSfields * cds->fieldOffset,sizeof(dfloat)); cds->FS = - (dfloat*) calloc(cds->NSfields * (cds->Nstages + 1) * cds->fieldOffset,sizeof(dfloat)); + (dfloat*) calloc(cds->NSfields * cds->Nstages * cds->fieldOffset,sizeof(dfloat)); cds->Nsubsteps = ins->Nsubsteps; if(cds->Nsubsteps) { @@ -918,11 +923,11 @@ cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &k cds->o_U = ins->o_U; cds->o_Ue = ins->o_Ue; cds->o_S = - mesh->device.malloc(cds->NSfields * (cds->Nstages + 0) * cds->fieldOffset * sizeof(dfloat), + mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat), cds->S); cds->o_BF = mesh->device.malloc(cds->NSfields * cds->fieldOffset * sizeof(dfloat), cds->BF); cds->o_FS = - mesh->device.malloc(cds->NSfields * (cds->Nstages + 1) * cds->fieldOffset * sizeof(dfloat), + mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat), cds->FS); cds->options = options; @@ -1089,6 +1094,11 @@ cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &k cds->setScalarKernel = mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo); + fileName = oklpath + "math.okl"; + kernelName = "maskCopy"; + cds->maskCopyKernel = + mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo); + fileName = oklpath + "cdsSumMakef" + suffix + ".okl"; kernelName = "cdsSumMakef" + suffix; cds->sumMakefKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo); @@ -1097,8 +1107,8 @@ cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &k kernelName = "cdsHelmholtzBC" + suffix; cds->helmholtzRhsBCKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfoBC); - kernelName = "cdsHelmholtzAddBC" + suffix; - cds->helmholtzAddBCKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfoBC); + kernelName = "cdsDirichletBC"; + cds->dirichletBCKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfoBC); fileName = oklpath + "setEllipticCoeff.okl"; kernelName = "setEllipticCoeff"; diff --git a/src/core/parReader.cpp b/src/core/parReader.cpp index fa53cdc65..60b2adfbb 100644 --- a/src/core/parReader.cpp +++ b/src/core/parReader.cpp @@ -180,7 +180,7 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm) ini.extract("general", "timestepper", timeStepper); if(timeStepper == "bdf3" || timeStepper == "tombo3") { options.setArgs("TIME INTEGRATOR", "TOMBO3"); - exit("No support for bdf3!", EXIT_FAILURE); + //exit("No support for bdf3!", EXIT_FAILURE); } if(timeStepper == "bdf2" || timeStepper == "tombo2") options.setArgs("TIME INTEGRATOR", "TOMBO2"); diff --git a/src/core/runTime.cpp b/src/core/runTime.cpp index c834c62aa..0956ea90c 100644 --- a/src/core/runTime.cpp +++ b/src/core/runTime.cpp @@ -10,12 +10,12 @@ void extbdfCoefficents(ins_t* ins, int order); -void makef(ins_t* ins, dfloat time, occa::memory &o_BF); +void makef(ins_t* ins, dfloat time, occa::memory o_FU, occa::memory o_BF); occa::memory velocityStrongSubCycle(ins_t* ins, dfloat time, occa::memory o_U); void fluidSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_U); -void makeq(ins_t* ins, dfloat time, occa::memory o_BF); +void makeq(ins_t* ins, dfloat time, occa::memory o_FS, occa::memory o_BF); occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is, occa::memory o_U, occa::memory o_S); void scalarSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_S); @@ -42,14 +42,13 @@ void runStep(ins_t* ins, dfloat time, dfloat dt, int tstep) extbdfCoefficents(ins,tstep); // First extrapolate velocity to t^(n+1) - int velocityExtrapolationOrder = ins->ExplicitOrder; - if(!ins->flow) velocityExtrapolationOrder = 1; - ins->velocityExtKernel(mesh->Nelements, - velocityExtrapolationOrder, - ins->fieldOffset, - ins->o_extbdfC, - ins->o_U, - ins->o_Ue); + if(ins->flow) + ins->velocityExtKernel(mesh->Nelements, + ins->ExplicitOrder, + ins->fieldOffset, + ins->o_extbdfA, + ins->o_U, + ins->o_Ue); if(ins->Nscalar) scalarSolve(ins, time, dt, cds->o_S); @@ -66,7 +65,7 @@ void runStep(ins_t* ins, dfloat time, dfloat dt, int tstep) qthermal(ins, time + dt, ins->o_div); } - if(ins->flow) fluidSolve(ins, time, dt, ins->o_U); + if(ins->flow) fluidSolve(ins, time, dt, ins->o_U); const dfloat cfl = computeCFL(ins, time + dt, tstep); @@ -103,51 +102,43 @@ void extbdfCoefficents(ins_t* ins, int order) { if(order == 1) { ins->g0 = 1.0f; - dfloat extbdfB[3] = {1.0f, 0.0f, 0.0f}; - dfloat extbdfA[3] = {1.0f, 0.0f, 0.0f}; - dfloat extbdfC[3] = {1.0f, 0.0f, 0.0f}; + dfloat extbdfB[] = {1.0f, 0.0f, 0.0f}; + dfloat extbdfA[] = {1.0f, 0.0f, 0.0f}; + dfloat extbdfC[] = {1.0f, 0.0f, 0.0f}; memcpy(ins->extbdfB, extbdfB, 3 * sizeof(dfloat)); memcpy(ins->extbdfA, extbdfA, 3 * sizeof(dfloat)); memcpy(ins->extbdfC, extbdfC, 3 * sizeof(dfloat)); - ins->o_extbdfB.copyFrom(extbdfB); - ins->o_extbdfA.copyFrom(extbdfA); - ins->o_extbdfC.copyFrom(extbdfC); - ins->ExplicitOrder = 1; } else if(order == 2) { ins->g0 = 1.5f; - dfloat extbdfB[3] = {2.0f,-0.5f, 0.0f}; - dfloat extbdfA[3] = {2.0f,-1.0f, 0.0f}; - dfloat extbdfC[3] = {1.0f, 0.0f, 0.0f}; + dfloat extbdfB[] = {2.0f,-0.5f, 0.0f}; + dfloat extbdfA[] = {2.0f,-1.0f, 0.0f}; + dfloat extbdfC[] = {1.0f, 0.0f, 0.0f}; memcpy(ins->extbdfB, extbdfB, 3 * sizeof(dfloat)); memcpy(ins->extbdfA, extbdfA, 3 * sizeof(dfloat)); memcpy(ins->extbdfC, extbdfC, 3 * sizeof(dfloat)); - ins->o_extbdfB.copyFrom(extbdfB); - ins->o_extbdfA.copyFrom(extbdfA); - ins->o_extbdfC.copyFrom(extbdfC); - ins->ExplicitOrder = 2; } else if(order == 3) { ins->g0 = 11.f / 6.f; - dfloat extbdfB[3] = {3.0f,-1.5f, 1.0f / 3.0f}; - dfloat extbdfA[3] = {3.0f,-3.0f, 1.0f}; - dfloat extbdfC[3] = {2.0f,-1.0f, 0.0f}; + dfloat extbdfB[] = {3.0f,-1.5f, 1.0f/3.0f}; + dfloat extbdfA[] = {3.0f,-3.0f, 1.0f}; + dfloat extbdfC[] = {2.0f,-1.0f, 0.0f}; memcpy(ins->extbdfB, extbdfB, 3 * sizeof(dfloat)); memcpy(ins->extbdfA, extbdfA, 3 * sizeof(dfloat)); memcpy(ins->extbdfC, extbdfC, 3 * sizeof(dfloat)); - ins->o_extbdfB.copyFrom(extbdfB); - ins->o_extbdfA.copyFrom(extbdfA); - ins->o_extbdfC.copyFrom(extbdfC); - ins->ExplicitOrder = 3; } + ins->o_extbdfB.copyFrom(ins->extbdfB); // bdf + ins->o_extbdfA.copyFrom(ins->extbdfA); // ext + ins->o_extbdfC.copyFrom(ins->extbdfC); + ins->ig0 = 1.0 / ins->g0; if (ins->Nscalar) { @@ -157,20 +148,19 @@ void extbdfCoefficents(ins_t* ins, int order) } } -void makeq(ins_t* ins, dfloat time, occa::memory o_BF) +void makeq(ins_t* ins, dfloat time, occa::memory o_FS, occa::memory o_BF) { cds_t* cds = ins->cds; mesh_t* mesh = cds->mesh; - cds->setScalarKernel(cds->fieldOffset * cds->NSfields, 0.0, cds->o_FS); - if(udf.sEqnSource) udf.sEqnSource(ins, time, cds->o_S, cds->o_FS); + if(udf.sEqnSource) udf.sEqnSource(ins, time, cds->o_S, o_FS); for(int is = 0; is < cds->NSfields; is++) { if(!cds->compute[is]) continue; mesh_t* mesh; (is) ? mesh = cds->meshV : mesh = cds->mesh; - const dlong sOffset = is * cds->fieldOffset; + const dlong isOffset = is * cds->fieldOffset; occa::memory o_adv = cds->o_wrk0; if(cds->options.compareArgs("FILTER STABILIZATION", "RELAXATION")) @@ -178,10 +168,10 @@ void makeq(ins_t* ins, dfloat time, occa::memory o_BF) cds->meshV->Nelements, ins->o_filterMT, ins->filterS, - sOffset, + isOffset, cds->o_rho, cds->o_S, - cds->o_FS); + o_FS); if(cds->Nsubsteps) { o_adv = scalarStrongSubCycle(cds, time, is, cds->o_U, cds->o_S); @@ -195,8 +185,8 @@ void makeq(ins_t* ins, dfloat time, occa::memory o_BF) mesh->o_cubInterpT, mesh->o_cubProjectT, cds->vFieldOffset, - sOffset, - cds->o_Ue, + isOffset, + cds->o_U, cds->o_S, cds->o_rho, cds->o_wrk0); @@ -206,8 +196,8 @@ void makeq(ins_t* ins, dfloat time, occa::memory o_BF) mesh->o_vgeo, mesh->o_Dmatrices, cds->vFieldOffset, - sOffset, - cds->o_Ue, + isOffset, + cds->o_U, cds->o_S, cds->o_rho, cds->o_wrk0); @@ -218,25 +208,23 @@ void makeq(ins_t* ins, dfloat time, occa::memory o_BF) 0 * cds->fieldOffset, cds->o_wrk0, 1.0, - sOffset, - cds->o_FS); + isOffset, + o_FS); } cds->sumMakefKernel( mesh->Nelements, mesh->o_vgeo, - mesh->o_MM, cds->idt, cds->o_extbdfA, cds->o_extbdfB, - cds->o_extbdfC, cds->fieldOffset * cds->NSfields, - sOffset, + isOffset, cds->o_S, o_adv, - cds->o_FS, + o_FS, cds->o_rho, - cds->o_BF); + o_BF); } } @@ -244,17 +232,17 @@ void scalarSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_S) { cds_t* cds = ins->cds; - for (int s = cds->Nstages; s > 1; s--) - cds->o_FS.copyFrom( - cds->o_FS, - cds->fieldOffset * cds->NSfields * sizeof(dfloat), - (s - 1) * cds->fieldOffset * cds->NSfields * sizeof(dfloat), - (s - 2) * cds->fieldOffset * cds->NSfields * sizeof(dfloat)); - timer::tic("makeq", 1); - makeq(ins, time, cds->o_BF); + cds->setScalarKernel(cds->fieldOffset * cds->NSfields, 0.0, cds->o_FS); + makeq(ins, time, cds->o_FS, cds->o_BF); timer::toc("makeq"); + for (int s = cds->Nstages; s > 1; s--) { + const dlong Nbyte = cds->fieldOffset * cds->NSfields * sizeof(dfloat); + cds->o_FS.copyFrom(cds->o_FS, Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte); + cds->o_S.copyFrom (cds->o_S , Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte); + } + timer::tic("scalarSolve", 1); for (int is = 0; is < cds->NSfields; is++) { if(!cds->compute[is]) continue; @@ -282,31 +270,16 @@ void scalarSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_S) cds->o_ellipticCoeff); occa::memory o_Snew = cdsSolve(is, cds, time + dt); - - for (int s = cds->Nstages; s > 1; s--) - o_S.copyFrom( - o_S, - cds->Ntotal * sizeof(dfloat), - ((s - 1) * (cds->fieldOffset * cds->NSfields) + is * cds->fieldOffset) * sizeof(dfloat), - ((s - 2) * (cds->fieldOffset * cds->NSfields) + is * cds->fieldOffset) * sizeof(dfloat)); o_Snew.copyTo(o_S, cds->Ntotal * sizeof(dfloat), is * cds->fieldOffset * sizeof(dfloat)); } timer::toc("scalarSolve"); } -void makef(ins_t* ins, dfloat time, occa::memory &o_BF) +void makef(ins_t* ins, dfloat time, occa::memory o_FU, occa::memory o_BF) { mesh_t* mesh = ins->mesh; - for (int s = ins->Nstages; s > 1; s--) - ins->o_FU.copyFrom( - ins->o_FU, - ins->fieldOffset * ins->NVfields * sizeof(dfloat), - (s - 1) * ins->fieldOffset * ins->NVfields * sizeof(dfloat), - (s - 2) * ins->fieldOffset * ins->NVfields * sizeof(dfloat)); - - ins->setScalarKernel(ins->fieldOffset * ins->NVfields, 0.0, ins->o_FU); - if(udf.uEqnSource) udf.uEqnSource(ins, time, ins->o_U, ins->o_FU); + if(udf.uEqnSource) udf.uEqnSource(ins, time, ins->o_U, o_FU); if(ins->options.compareArgs("FILTER STABILIZATION", "RELAXATION")) ins->filterRTKernel( @@ -315,7 +288,7 @@ void makef(ins_t* ins, dfloat time, occa::memory &o_BF) ins->filterS, ins->fieldOffset, ins->o_U, - ins->o_FU); + o_FU); occa::memory o_adv = ins->o_wrk0; if(ins->Nsubsteps) { @@ -348,7 +321,7 @@ void makef(ins_t* ins, dfloat time, occa::memory &o_BF) ins->o_wrk0, 1.0, 0, - ins->o_FU); + o_FU); } ins->sumMakefKernel( @@ -361,7 +334,7 @@ void makef(ins_t* ins, dfloat time, occa::memory &o_BF) ins->fieldOffset, ins->o_U, o_adv, - ins->o_FU, + o_FU, o_BF); } @@ -370,9 +343,16 @@ void fluidSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_U) mesh_t* mesh = ins->mesh; timer::tic("makef", 1); - makef(ins, time, ins->o_BF); + ins->setScalarKernel(ins->fieldOffset * ins->NVfields, 0.0, ins->o_FU); + makef(ins, time, ins->o_FU, ins->o_BF); timer::toc("makef"); + for (int s = ins->Nstages; s > 1; s--) { + const dlong Nbyte = ins->fieldOffset * ins->NVfields * sizeof(dfloat); + ins->o_FU.copyFrom(ins->o_FU, Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte); + ins->o_U.copyFrom (ins->o_U , Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte); + } + timer::tic("pressureSolve", 1); ins->setEllipticCoeffPressureKernel( ins->Nlocal, @@ -394,12 +374,6 @@ void fluidSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_U) ins->o_ellipticCoeff); occa::memory o_Unew = tombo::velocitySolve(ins, time + dt); - for (int s = ins->Nstages; s > 1; s--) - o_U.copyFrom( - o_U, - ins->fieldOffset * ins->NVfields * sizeof(dfloat), - (s - 1) * ins->fieldOffset * ins->NVfields * sizeof(dfloat), - (s - 2) * ins->fieldOffset * ins->NVfields * sizeof(dfloat)); o_U.copyFrom(o_Unew, ins->NVfields * ins->fieldOffset * sizeof(dfloat)); timer::toc("velocitySolve"); } @@ -460,32 +434,34 @@ occa::memory velocityStrongSubCycle(ins_t* ins, dfloat time, occa::memory o_U) o_U, ins->o_Ue); - if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - ins->subCycleStrongCubatureVolumeKernel( - mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - mesh->o_vgeo, - mesh->o_cubvgeo, - mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - ins->o_InvM, - ins->fieldOffset, - rk * ins->NVfields * ins->fieldOffset, - ins->o_Ue, - ins->o_wrk0, - ins->o_wrk6); - else - ins->subCycleStrongVolumeKernel( - mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - mesh->o_vgeo, - mesh->o_Dmatrices, - ins->fieldOffset, - rk * ins->NVfields * ins->fieldOffset, - ins->o_Ue, - ins->o_wrk0, - ins->o_wrk6); + if(mesh->NglobalGatherElements) { + if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + ins->subCycleStrongCubatureVolumeKernel( + mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + mesh->o_vgeo, + mesh->o_cubvgeo, + mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + ins->o_InvM, + ins->fieldOffset, + rk * ins->NVfields * ins->fieldOffset, + ins->o_Ue, + ins->o_wrk0, + ins->o_wrk6); + else + ins->subCycleStrongVolumeKernel( + mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + mesh->o_vgeo, + mesh->o_Dmatrices, + ins->fieldOffset, + rk * ins->NVfields * ins->fieldOffset, + ins->o_Ue, + ins->o_wrk0, + ins->o_wrk6); + } occa::memory o_rhs; if(rk == 0) o_rhs = ins->o_wrk6; @@ -495,32 +471,34 @@ occa::memory velocityStrongSubCycle(ins_t* ins, dfloat time, occa::memory o_U) oogs::start(o_rhs, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); - if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - ins->subCycleStrongCubatureVolumeKernel( - mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_vgeo, - mesh->o_cubvgeo, - mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - ins->o_InvM, - ins->fieldOffset, - rk * ins->NVfields * ins->fieldOffset, - ins->o_Ue, - ins->o_wrk0, - ins->o_wrk6); - else - ins->subCycleStrongVolumeKernel( - mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_vgeo, - mesh->o_Dmatrices, - ins->fieldOffset, - rk * ins->NVfields * ins->fieldOffset, - ins->o_Ue, - ins->o_wrk0, - ins->o_wrk6); + if(mesh->NlocalGatherElements) { + if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + ins->subCycleStrongCubatureVolumeKernel( + mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_vgeo, + mesh->o_cubvgeo, + mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + ins->o_InvM, + ins->fieldOffset, + rk * ins->NVfields * ins->fieldOffset, + ins->o_Ue, + ins->o_wrk0, + ins->o_wrk6); + else + ins->subCycleStrongVolumeKernel( + mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_vgeo, + mesh->o_Dmatrices, + ins->fieldOffset, + rk * ins->NVfields * ins->fieldOffset, + ins->o_Ue, + ins->o_wrk0, + ins->o_wrk6); + } oogs::finish(o_rhs, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); ins->invMassMatrixKernel( @@ -605,31 +583,33 @@ occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is, o_U, cds->o_Ue); - if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - cds->subCycleStrongCubatureVolumeKernel( - mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - cds->vFieldOffset, - rk * cds->fieldOffset, - mesh->o_vgeo, - mesh->o_cubvgeo, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - cds->o_Ue, - cds->o_wrk0, - cds->o_wrk2); - else - cds->subCycleStrongVolumeKernel( - mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - cds->vFieldOffset, - rk * cds->fieldOffset, - mesh->o_vgeo, - mesh->o_Dmatrices, - cds->o_Ue, - cds->o_wrk0, - cds->o_wrk2); + if(mesh->NglobalGatherElements) { + if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + cds->subCycleStrongCubatureVolumeKernel( + mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + cds->vFieldOffset, + rk * cds->fieldOffset, + mesh->o_vgeo, + mesh->o_cubvgeo, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + cds->o_Ue, + cds->o_wrk0, + cds->o_wrk2); + else + cds->subCycleStrongVolumeKernel( + mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + cds->vFieldOffset, + rk * cds->fieldOffset, + mesh->o_vgeo, + mesh->o_Dmatrices, + cds->o_Ue, + cds->o_wrk0, + cds->o_wrk2); + } occa::memory o_rhs; if(rk == 0) o_rhs = cds->o_wrk2; @@ -639,31 +619,33 @@ occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is, oogs::start(o_rhs, 1, cds->fieldOffset, ogsDfloat, ogsAdd, cds->gsh); - if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - cds->subCycleStrongCubatureVolumeKernel( - mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - cds->vFieldOffset, - rk * cds->fieldOffset, - mesh->o_vgeo, - mesh->o_cubvgeo, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - cds->o_Ue, - cds->o_wrk0, - cds->o_wrk2); - else - cds->subCycleStrongVolumeKernel( - mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - cds->vFieldOffset, - rk * cds->fieldOffset, - mesh->o_vgeo, - mesh->o_Dmatrices, - cds->o_Ue, - cds->o_wrk0, - cds->o_wrk2); + if(mesh->NlocalGatherElements) { + if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + cds->subCycleStrongCubatureVolumeKernel( + mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + cds->vFieldOffset, + rk * cds->fieldOffset, + mesh->o_vgeo, + mesh->o_cubvgeo, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + cds->o_Ue, + cds->o_wrk0, + cds->o_wrk2); + else + cds->subCycleStrongVolumeKernel( + mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + cds->vFieldOffset, + rk * cds->fieldOffset, + mesh->o_vgeo, + mesh->o_Dmatrices, + cds->o_Ue, + cds->o_wrk0, + cds->o_wrk2); + } oogs::finish(o_rhs, 1, cds->fieldOffset, ogsDfloat, ogsAdd, cds->gsh); cds->invMassMatrixKernel( diff --git a/src/core/tombo.cpp b/src/core/tombo.cpp index 907252d2f..7830814ad 100644 --- a/src/core/tombo.cpp +++ b/src/core/tombo.cpp @@ -62,7 +62,6 @@ occa::memory pressureSolve(ins_t* ins, dfloat time) ins->o_wrk0, ins->o_wrk6); - oogs::startFinish(ins->o_wrk6, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); ins->invMassMatrixKernel( @@ -81,7 +80,59 @@ occa::memory pressureSolve(ins_t* ins, dfloat time) ins->o_wrk6, ins->o_wrk3); - const dfloat lambda = ins->g0 * ins->idt; + //enforce Dirichlet BCs + ins->setScalarKernel((1+ins->NVfields)*ins->fieldOffset, std::numeric_limits::min(), ins->o_wrk6); + for (int sweep = 0; sweep < 2; sweep++) { + ins->pressureDirichletBCKernel(mesh->Nelements, + time, + ins->dt, + ins->fieldOffset, + mesh->o_sgeo, + mesh->o_x, + mesh->o_y, + mesh->o_z, + mesh->o_vmapM, + mesh->o_EToB, + ins->o_EToB, + ins->o_usrwrk, + ins->o_U, + ins->o_P, + ins->o_wrk6); + + ins->velocityDirichletBCKernel(mesh->Nelements, + ins->fieldOffset, + time, + mesh->o_sgeo, + mesh->o_x, + mesh->o_y, + mesh->o_z, + mesh->o_vmapM, + mesh->o_EToB, + ins->o_EToB, + ins->o_usrwrk, + ins->o_U, + ins->o_wrk7); + + //take care of Neumann-Dirichlet shared edges across elements + if (sweep == 0) oogs::startFinish(ins->o_wrk6, 1+ins->NVfields, ins->fieldOffset, ogsDfloat, ogsMax, ins->gsh); + if (sweep == 1) oogs::startFinish(ins->o_wrk6, 1+ins->NVfields, ins->fieldOffset, ogsDfloat, ogsMin, ins->gsh); + } + + if (ins->pSolver->Nmasked) ins->maskCopyKernel(ins->pSolver->Nmasked, 0, ins->pSolver->o_maskIds, + ins->o_wrk6, ins->o_P); + + if (ins->uvwSolver) { + if (ins->uvwSolver->Nmasked) ins->maskCopyKernel(ins->uvwSolver->Nmasked, 0*ins->fieldOffset, ins->uvwSolver->o_maskIds, + ins->o_wrk7, ins->o_U); + } else { + if (ins->uSolver->Nmasked) ins->maskCopyKernel(ins->uSolver->Nmasked, 0*ins->fieldOffset, ins->uSolver->o_maskIds, + ins->o_wrk7, ins->o_U); + if (ins->vSolver->Nmasked) ins->maskCopyKernel(ins->vSolver->Nmasked, 1*ins->fieldOffset, ins->vSolver->o_maskIds, + ins->o_wrk7, ins->o_U); + if (ins->wSolver->Nmasked) ins->maskCopyKernel(ins->wSolver->Nmasked, 2*ins->fieldOffset, ins->wSolver->o_maskIds, + ins->o_wrk7, ins->o_U); + } + ins->divergenceSurfaceKernel( mesh->Nelements, mesh->o_vgeo, @@ -90,7 +141,7 @@ occa::memory pressureSolve(ins_t* ins, dfloat time) mesh->o_EToB, ins->o_EToB, time, - -lambda, + -(ins->g0 * ins->idt), mesh->o_x, mesh->o_y, mesh->o_z, @@ -112,45 +163,21 @@ occa::memory pressureSolve(ins_t* ins, dfloat time) ins->pressureAddQtlKernel( mesh->Nelements, mesh->o_vgeo, - lambda, + ins->g0 * ins->idt, ins->o_div, ins->o_wrk3); - elliptic_t* solver = ins->pSolver; - oogs::startFinish(ins->o_wrk3, 1, 0, ogsDfloat, ogsAdd, ins->gsh); - - if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_wrk3); + if (ins->pSolver->Nmasked) mesh->maskKernel(ins->pSolver->Nmasked, ins->pSolver->o_maskIds, ins->o_wrk3); ins->setScalarKernel(ins->Ntotal, 0.0, ins->o_PI); - //if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_PI); - - ins->NiterP = ellipticSolve(solver, ins->presTOL, ins->o_wrk3, ins->o_PI); - - ins->pressureAddBCKernel(mesh->Nelements, - time, - ins->dt, - ins->fieldOffset, - mesh->o_sgeo, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_vmapM, - mesh->o_EToB, - ins->o_EToB, - ins->o_usrwrk, - ins->o_U, - ins->o_P, - ins->o_PI); - - // update (increment) all points but not Dirichlet + ins->NiterP = ellipticSolve(ins->pSolver, ins->presTOL, ins->o_wrk3, ins->o_PI); ins->pressureUpdateKernel(mesh->Nelements, - ins->fieldOffset, - solver->o_mapB, + ins->pSolver->o_mapB, ins->o_PI, ins->o_P, - ins->o_wrk0); - return ins->o_wrk0; + ins->o_wrk1); + return ins->o_wrk1; } occa::memory velocitySolve(ins_t* ins, dfloat time) @@ -163,6 +190,7 @@ occa::memory velocitySolve(ins_t* ins, dfloat time) ins->o_div, ins->o_P, ins->o_wrk3); + ins->gradientVolumeKernel( mesh->Nelements, mesh->o_vgeo, @@ -179,7 +207,6 @@ occa::memory velocitySolve(ins_t* ins, dfloat time) ins->o_extbdfA, ins->o_extbdfB, ins->fieldOffset, - ins->o_U, ins->o_BF, ins->o_FU, ins->o_wrk0, @@ -220,7 +247,11 @@ occa::memory velocitySolve(ins_t* ins, dfloat time) if (ins->uvwSolver->Nmasked) mesh->maskKernel(ins->uvwSolver->Nmasked, ins->uvwSolver->o_maskIds, ins->o_wrk3); + ins->NiterU = ellipticSolve(ins->uvwSolver, ins->velTOL, ins->o_wrk3, ins->o_wrk0); + + if (ins->uvwSolver->Nmasked) ins->maskCopyKernel(ins->uvwSolver->Nmasked, 0*ins->fieldOffset, ins->uvwSolver->o_maskIds, + ins->o_U, ins->o_wrk0); } else { if (ins->uSolver->Nmasked) mesh->maskKernel(ins->uSolver->Nmasked, ins->uSolver->o_maskIds, @@ -240,25 +271,20 @@ occa::memory velocitySolve(ins_t* ins, dfloat time) if (ins->wSolver->Nmasked) mesh->maskKernel(ins->wSolver->Nmasked, ins->wSolver->o_maskIds, ins->o_wrk5); + ins->NiterU = ellipticSolve(ins->uSolver, ins->velTOL, ins->o_wrk3, ins->o_wrk0); ins->NiterV = ellipticSolve(ins->vSolver, ins->velTOL, ins->o_wrk4, ins->o_wrk1); ins->NiterW = ellipticSolve(ins->wSolver, ins->velTOL, ins->o_wrk5, ins->o_wrk2); - } - ins->velocityAddBCKernel(mesh->Nelements, - ins->fieldOffset, - time, - mesh->o_sgeo, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_vmapM, - mesh->o_EToB, - ins->o_EToB, - ins->o_usrwrk, - ins->o_U, - ins->o_wrk0); + if (ins->uSolver->Nmasked) ins->maskCopyKernel(ins->uSolver->Nmasked, 0*ins->fieldOffset, ins->uSolver->o_maskIds, + ins->o_U, ins->o_wrk0); + if (ins->vSolver->Nmasked) ins->maskCopyKernel(ins->vSolver->Nmasked, 1*ins->fieldOffset, ins->vSolver->o_maskIds, + ins->o_U, ins->o_wrk0); + if (ins->wSolver->Nmasked) ins->maskCopyKernel(ins->wSolver->Nmasked, 2*ins->fieldOffset, ins->wSolver->o_maskIds, + ins->o_U, ins->o_wrk0); + } return ins->o_wrk0; } + } // namespace diff --git a/src/libP/solvers/elliptic/elliptic.h b/src/libP/solvers/elliptic/elliptic.h index f48111edb..42f73b0a2 100644 --- a/src/libP/solvers/elliptic/elliptic.h +++ b/src/libP/solvers/elliptic/elliptic.h @@ -318,6 +318,14 @@ void ellipticOperator(elliptic_t* elliptic, occa::memory &o_Aq, const char* precision); +void ellipticAx(elliptic_t* elliptic, + dlong NelementsList, + occa::memory &o_elementsList, + occa::memory &o_q, + occa::memory &o_Aq, + const char* precision); + + dfloat ellipticWeightedNorm2(elliptic_t* elliptic, occa::memory &o_w, occa::memory &o_a); void ellipticBuildIpdg(elliptic_t* elliptic, int basisNp, dfloat* basis, dfloat lambda, diff --git a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c b/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c index 83b511c38..2d6849b18 100644 --- a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c +++ b/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c @@ -992,19 +992,10 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf oogs_mode oogsMode = OOGS_AUTO; if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT; if(options.compareArgs("THREAD MODEL", "OPENMP")) oogsMode = OOGS_DEFAULT; - auto callback = [&]() // hardwaire to FP64 const coeff + auto callback = [&]() { - if(mesh->NlocalGatherElements == 0) return; - occa::kernel &partialAxKernel = elliptic->partialAxKernel; - partialAxKernel(mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->lambda[0], - elliptic->o_p, - elliptic->o_Ap); + ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, + elliptic->o_p, elliptic->o_Ap, dfloatString); }; elliptic->oogsAx = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, callback, oogsMode); elliptic->oogs = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, NULL, oogsMode); diff --git a/src/libP/solvers/elliptic/src/ellipticOperator.c b/src/libP/solvers/elliptic/src/ellipticOperator.c index e6fb46df1..d5e65eef2 100644 --- a/src/libP/solvers/elliptic/src/ellipticOperator.c +++ b/src/libP/solvers/elliptic/src/ellipticOperator.c @@ -29,122 +29,76 @@ #include "omp.h" -//host gs -void ellipticSerialOperator(elliptic_t* elliptic, - occa::memory &o_q, - occa::memory &o_Aq, - const char* precision) +void ellipticAx(elliptic_t* elliptic, + dlong NelementsList, + occa::memory &o_elementsList, + occa::memory &o_q, + occa::memory &o_Aq, + const char* precision) { mesh_t* mesh = elliptic->mesh; setupAide &options = elliptic->options; int enableGatherScatters = 1; int enableReductions = 1; - int continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS"); - int ipdg = options.compareArgs("DISCRETIZATION", "IPDG"); + const int continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS"); + const int serial = options.compareArgs("THREAD MODEL", "SERIAL"); + const int ipdg = options.compareArgs("DISCRETIZATION", "IPDG"); + const int mapType = (elliptic->elementType == HEXAHEDRA && + options.compareArgs("ELEMENT MAP", "TRILINEAR")) ? 1:0; + const int integrationType = (elliptic->elementType == HEXAHEDRA && + options.compareArgs("ELLIPTIC INTEGRATION", "CUBATURE")) ? 1:0; options.getArgs("DEBUG ENABLE REDUCTIONS", enableReductions); options.getArgs("DEBUG ENABLE OGS", enableGatherScatters); - dfloat* sendBuffer = elliptic->sendBuffer; - dfloat* recvBuffer = elliptic->recvBuffer; - dfloat* gradSendBuffer = elliptic->gradSendBuffer; - dfloat* gradRecvBuffer = elliptic->gradRecvBuffer; - - dfloat alpha = 0., alphaG = 0.; - - if(continuous) { - if(elliptic->var_coeff) { - if(elliptic->blockSolver) - elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, mesh->o_ggeo, - mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, elliptic->o_lambda, - o_q, o_Aq); - else - elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, mesh->o_ggeo, mesh->o_Dmatrices, - mesh->o_Smatrices, mesh->o_MM, elliptic->o_lambda, o_q, o_Aq); - }else{ - const dfloat lambda = elliptic->lambda[0]; - if(elliptic->blockSolver) - elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, mesh->o_ggeo, - mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, elliptic->o_lambda, - o_q, o_Aq); - else - elliptic->AxKernel(mesh->Nelements, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - lambda, - o_q, - o_Aq); - } - - oogs::startFinish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs); - if (elliptic->Nmasked) - mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Aq); - - } else if(ipdg) { - - printf("WARNING: DEBUGGING C0\n"); - MPI_Finalize(); - - exit(-1); - } -} - -void ellipticOperator(elliptic_t* elliptic, - occa::memory &o_q, - occa::memory &o_Aq, - const char* precision) -{ - mesh_t* mesh = elliptic->mesh; - setupAide &options = elliptic->options; - - int enableGatherScatters = 1; - int enableReductions = 1; - int continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS"); - int serial = options.compareArgs("THREAD MODEL", "SERIAL"); - int ipdg = options.compareArgs("DISCRETIZATION", "IPDG"); - - options.getArgs("DEBUG ENABLE REDUCTIONS", enableReductions); - options.getArgs("DEBUG ENABLE OGS", enableGatherScatters); - - dfloat* sendBuffer = elliptic->sendBuffer; - dfloat* recvBuffer = elliptic->recvBuffer; - dfloat* gradSendBuffer = elliptic->gradSendBuffer; - dfloat* gradRecvBuffer = elliptic->gradRecvBuffer; - - dfloat alpha = 0., alphaG = 0.; - dlong Nblock = elliptic->Nblock; - dfloat* tmp = elliptic->tmp; - occa::memory &o_tmp = elliptic->o_tmp; - if(serial) { - ellipticSerialOperator(elliptic, o_q, o_Aq, precision); + if(continuous) { + if(elliptic->var_coeff) { + if(elliptic->blockSolver) + elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, mesh->o_ggeo, + mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, elliptic->o_lambda, + o_q, o_Aq); + else + elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, mesh->o_ggeo, mesh->o_Dmatrices, + mesh->o_Smatrices, mesh->o_MM, elliptic->o_lambda, o_q, o_Aq); + }else{ + const dfloat lambda = elliptic->lambda[0]; + if(elliptic->blockSolver) + elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, mesh->o_ggeo, + mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, elliptic->o_lambda, + o_q, o_Aq); + else + elliptic->AxKernel(mesh->Nelements, + mesh->o_ggeo, + mesh->o_Dmatrices, + mesh->o_Smatrices, + mesh->o_MM, + lambda, + o_q, + o_Aq); + } + } else { + exit(1); + } return; } if(continuous) { oogs_t* oogsAx = elliptic->oogsAx; - int mapType = (elliptic->elementType == HEXAHEDRA && - options.compareArgs("ELEMENT MAP", "TRILINEAR")) ? 1:0; - - int integrationType = (elliptic->elementType == HEXAHEDRA && - options.compareArgs("ELLIPTIC INTEGRATION", "CUBATURE")) ? 1:0; - occa::kernel &partialAxKernel = (strstr(precision, "float")) ? elliptic->partialFloatAxKernel : elliptic->partialAxKernel; - if(mesh->NglobalGatherElements) { + if(NelementsList) { if(integrationType == 0) { // GLL or non-hex if(mapType == 0) { if(elliptic->var_coeff) { if(elliptic->blockSolver) - partialAxKernel(mesh->NglobalGatherElements, + partialAxKernel(NelementsList, elliptic->Ntotal, elliptic->loffset, - mesh->o_globalGatherElementList, + o_elementsList, mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, @@ -153,9 +107,9 @@ void ellipticOperator(elliptic_t* elliptic, o_q, o_Aq); else - partialAxKernel(mesh->NglobalGatherElements, + partialAxKernel(NelementsList, elliptic->Ntotal, - mesh->o_globalGatherElementList, + o_elementsList, mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, @@ -165,10 +119,10 @@ void ellipticOperator(elliptic_t* elliptic, o_Aq); }else{ if(elliptic->blockSolver) - partialAxKernel(mesh->NglobalGatherElements, + partialAxKernel(NelementsList, elliptic->Ntotal, elliptic->loffset, - mesh->o_globalGatherElementList, + o_elementsList, mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, @@ -177,8 +131,8 @@ void ellipticOperator(elliptic_t* elliptic, o_q, o_Aq); else - partialAxKernel(mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, + partialAxKernel(NelementsList, + o_elementsList, mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, @@ -192,98 +146,9 @@ void ellipticOperator(elliptic_t* elliptic, if(elliptic->blockSolver) printf("Trilinear version for block solver is not avalibale yet\n"); else - partialAxKernel(mesh->NglobalGatherElements, - elliptic->Ntotal, - mesh->o_globalGatherElementList, - elliptic->o_EXYZ, - elliptic->o_gllzw, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->o_lambda, - o_q, - o_Aq); - }else{ - if(elliptic->blockSolver) - printf("Trilinear version for block solver is not avalibale yet\n"); - else - partialAxKernel(mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - elliptic->o_EXYZ, - elliptic->o_gllzw, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->lambda[0], - o_q, - o_Aq); - } - } - } - } - - if(enableGatherScatters) - oogs::start(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, oogsAx); - - if(mesh->NlocalGatherElements) { - if(integrationType == 0) { // GLL or non-hex - if(mapType == 0) { - if(elliptic->var_coeff) { - if(elliptic->blockSolver) - partialAxKernel(mesh->NlocalGatherElements, - elliptic->Ntotal, - elliptic->loffset, - mesh->o_localGatherElementList, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->o_lambda, - o_q, - o_Aq); - else - partialAxKernel(mesh->NlocalGatherElements, - elliptic->Ntotal, - mesh->o_localGatherElementList, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->o_lambda, - o_q, - o_Aq); - }else{ - if(elliptic->blockSolver) - partialAxKernel(mesh->NlocalGatherElements, - elliptic->Ntotal, - elliptic->loffset, - mesh->o_localGatherElementList, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->o_lambda, - o_q, - o_Aq); - else - partialAxKernel(mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->lambda[0], - o_q, - o_Aq); - } - }else { - if(elliptic->var_coeff) { - if(elliptic->blockSolver) - printf("Trilinear version for block solver is not avalibale yet\n"); - else - partialAxKernel(mesh->NlocalGatherElements, + partialAxKernel(NelementsList, elliptic->Ntotal, - mesh->o_localGatherElementList, + o_elementsList, elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, @@ -296,8 +161,8 @@ void ellipticOperator(elliptic_t* elliptic, if(elliptic->blockSolver) printf("Trilinear version for block solver is not avalibale yet\n"); else - partialAxKernel(mesh->NlocalGatherElements, - mesh->o_localGatherElementList, + partialAxKernel(NelementsList, + o_elementsList, elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, @@ -310,12 +175,29 @@ void ellipticOperator(elliptic_t* elliptic, } } } + } +} - if(enableGatherScatters) - oogs::finish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, oogsAx); +void ellipticOperator(elliptic_t* elliptic, + occa::memory &o_q, + occa::memory &o_Aq, + const char* precision) +{ + mesh_t* mesh = elliptic->mesh; + setupAide &options = elliptic->options; + oogs_t* oogsAx = elliptic->oogsAx; - if (elliptic->Nmasked) - mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Aq); + int serial = options.compareArgs("THREAD MODEL", "SERIAL"); + if(serial) { + occa::memory o_dummy; + ellipticAx(elliptic, mesh->Nelements, o_dummy, o_q, o_Aq, precision); + oogs::startFinish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs); + } else { + ellipticAx(elliptic, mesh->NglobalGatherElements, mesh->o_globalGatherElementList, o_q, o_Aq, precision); + oogs::start(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, oogsAx); + ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, o_q, o_Aq, precision); + oogs::finish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, oogsAx); } + if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Aq); } diff --git a/src/libP/solvers/elliptic/src/ellipticSolveSetup.c b/src/libP/solvers/elliptic/src/ellipticSolveSetup.c index 83a9455a1..21ec64b5a 100644 --- a/src/libP/solvers/elliptic/src/ellipticSolveSetup.c +++ b/src/libP/solvers/elliptic/src/ellipticSolveSetup.c @@ -1009,31 +1009,8 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties &kernelInfo) if(options.compareArgs("THREAD MODEL", "OPENMP")) oogsMode = OOGS_DEFAULT; auto callback = [&]() // hardwired to FP64 variable coeff { - if(mesh->NlocalGatherElements == 0) return; - occa::kernel &partialAxKernel = elliptic->partialAxKernel; - if(elliptic->blockSolver) - partialAxKernel(mesh->NlocalGatherElements, - elliptic->Ntotal, - elliptic->loffset, - mesh->o_localGatherElementList, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->o_lambda, - elliptic->o_p, - elliptic->o_Ap); - else - partialAxKernel(mesh->NlocalGatherElements, - elliptic->Ntotal, - mesh->o_localGatherElementList, - mesh->o_ggeo, - mesh->o_Dmatrices, - mesh->o_Smatrices, - mesh->o_MM, - elliptic->o_lambda, - elliptic->o_p, - elliptic->o_Ap); + ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, + elliptic->o_p, elliptic->o_Ap, dfloatString); }; elliptic->oogsAx = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, callback, oogsMode); elliptic->oogs = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, NULL, oogsMode); diff --git a/src/main.cpp b/src/main.cpp index a1444c3ae..e5d89baef 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -157,14 +157,12 @@ int main(int argc, char** argv) nekrs::nekOutfld(); } - if (tStep && tStep % runTimeStatFreq == 0) nekrs::printRuntimeStatistics(); + if (tStep && tStep % runTimeStatFreq == 0 || tStep == NtimeSteps) nekrs::printRuntimeStatistics(); ++tStep; } MPI_Pcontrol(0); - if ((tStep-1) % runTimeStatFreq != 0) nekrs::printRuntimeStatistics(); - if(rank == 0) std::cout << "\nEnd." << "\n"; MPI_Finalize();