diff --git a/src/cds.cpp b/src/cds.cpp index 47f87bf9e..a2e64c21a 100644 --- a/src/cds.cpp +++ b/src/cds.cpp @@ -121,118 +121,4 @@ void cdsAdvection(cds_t *cds, dfloat time, occa::memory o_U, occa::memory o_S, o } -void cdsStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd){ - mesh_t *mesh = cds->mesh; - const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs); - - const dfloat tn0 = time - 0*cds->dt; - const dfloat tn1 = time - 1*cds->dt; - const dfloat tn2 = time - 2*cds->dt; - - - dfloat zero = 0.0, one = 1.0; - int izero = 0; - dfloat b, bScale=0; - - // Solve for Each SubProblem - for (int torder=(cds->ExplicitOrder-1); torder>=0; torder--){ - - b=cds->extbdfB[torder]; - bScale += b; - - // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) - dlong toffset = torder*cds->NSfields*cds->Ntotal; - - if (torder==cds->ExplicitOrder-1) { //first substep - cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, zero, izero, o_Sd); - } else { //add the next field - cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, one, izero, o_Sd); - } - - // SubProblem starts from here from t^(n-torder) - const dfloat tsub = time - torder*cds->dt; - // Advance SubProblem to t^(n-torder+1) - for(int ststep = 0; ststepNsubsteps;++ststep){ - - const dfloat tstage = tsub + ststep*cds->sdt; - - for(int rk=0;rkSNrk;++rk){// LSERK4 stages - // Extrapolate velocity to subProblem stage time - dfloat t = tstage + cds->sdt*cds->Srkc[rk]; - - switch(cds->ExplicitOrder){ - case 1: - cds->extC[0] = 1.f; cds->extC[1] = 0.f; cds->extC[2] = 0.f; - break; - case 2: - cds->extC[0] = (t-tn1)/(tn0-tn1); - cds->extC[1] = (t-tn0)/(tn1-tn0); - cds->extC[2] = 0.f; - break; - case 3: - cds->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); - cds->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2)); - cds->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1)); - break; - } - cds->o_extC.copyFrom(cds->extC); - - //compute advective velocity fields at time t - cds->subCycleExtKernel(NtotalElements, - Nstages, - cds->vOffset, - cds->o_extC, - o_U, - cds->o_Ue); - - - // Compute Volume Contribution - if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")){ - cds->subCycleStrongCubatureVolumeKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_cubvgeo, - mesh->o_cubDiffInterpT, // mesh->o_cubDWmatrices, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - cds->vOffset, - cds->sOffset, - cds->o_Ue, - o_Sd, - cds->o_rhsSd); - } else{ - cds->subCycleStrongVolumeKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_Dmatrices, - cds->vOffset, - cds->sOffset, - cds->o_Ue, - o_Sd, - cds->o_rhsSd); - - } - - ogsGatherScatter(cds->o_rhsSd, ogsDfloat, ogsAdd, mesh->ogs); - - // int nfield = ins->dim==2 ? 2:3; - cds->invMassMatrixKernel(mesh->Nelements, - cds->sOffset, - cds->NSfields, - mesh->o_vgeo, - cds->o_InvM, // mesh->o_MM, // should be invMM for tri/tet - cds->o_rhsSd); - - // Update Kernel - cds->subCycleRKUpdateKernel(mesh->Nelements, - cds->sdt, - cds->Srka[rk], - cds->Srkb[rk], - cds->sOffset, - cds->o_rhsSd, - cds->o_resS, - o_Sd); - } - } - } - -} diff --git a/src/cds.h b/src/cds.h index 9c7660afd..3b37ddbe0 100644 --- a/src/cds.h +++ b/src/cds.h @@ -186,7 +186,6 @@ typedef struct { }cds_t; void cdsAdvection(cds_t *cds, dfloat time, occa::memory o_U, occa::memory o_S, occa::memory o_NS); -void cdsStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd); void cdsHelmholtzRhs(cds_t *cds, dfloat time, int stage, occa::memory o_rhsS); void cdsHelmholtzSolve(cds_t *cds, dfloat time, int stage, occa::memory o_rhsS,occa::memory o_rkS); diff --git a/src/cfl.cpp b/src/cfl.cpp index 180053c36..99756743d 100644 --- a/src/cfl.cpp +++ b/src/cfl.cpp @@ -1,6 +1,10 @@ #include "nekrs.hpp" -void getDh(ins_t *ins){ +static int firstTime = 1; +static dfloat *tmp; +static occa::memory o_tmp; + +void setup(ins_t *ins){ mesh_t *mesh = ins->mesh; @@ -23,16 +27,18 @@ void getDh(ins_t *ins){ free(dH); } - ins->computedDh = 1; + tmp = (dfloat *) calloc(ins->Nblock, sizeof(dfloat)); + o_tmp = mesh->device.malloc(ins->Nblock*sizeof(dfloat), tmp); + + firstTime = 0; } dfloat computeCFL(ins_t *ins, dfloat time, int tstep){ mesh_t *mesh = ins->mesh; - // create dH i.e. nodal spacing in reference coordinates - if(!ins->computedDh) - getDh(ins); + if(firstTime) setup(ins); + // Compute cfl factors i.e. dt* U / h ins->cflKernel(mesh->Nelements, ins->dt, @@ -44,8 +50,6 @@ dfloat computeCFL(ins_t *ins, dfloat time, int tstep){ // find the local maximum of CFL number const dlong Ntotal = mesh->Np*mesh->Nelements; - dfloat *tmp = (dfloat *) calloc(ins->Nblock, sizeof(dfloat)); - occa::memory o_tmp = mesh->device.malloc(ins->Nblock*sizeof(dfloat), tmp); ins->maxKernel(Ntotal, ins->o_rhsU, o_tmp); o_tmp.copyTo(tmp); @@ -58,7 +62,6 @@ dfloat computeCFL(ins_t *ins, dfloat time, int tstep){ dfloat gcfl = 0.f; MPI_Allreduce(&cfl, &gcfl, 1, MPI_DFLOAT, MPI_MAX, mesh->comm); - free(tmp); return gcfl; } diff --git a/src/ins.h b/src/ins.h index 6a503a276..4115fab9c 100644 --- a/src/ins.h +++ b/src/ins.h @@ -55,11 +55,10 @@ typedef struct { dfloat idt, inu; // hold some inverses dfloat *U, *P; - dfloat *NU, *LU, *GP; - dfloat *GU; - dfloat *rhsU, *rhsV, *rhsW, *rhsP; - dfloat *rkU, *rkP, *PI; - dfloat *rkNU, *rkLU, *rkGP; + dfloat *NU, *GP; + dfloat *rhsU, *rhsP; + dfloat *PI; + dfloat *rkNU; dfloat *FU; // Additional source terms for explicit contribution @@ -101,9 +100,6 @@ typedef struct { dfloat *Ud, *Ue, *resU, *rhsUd, sdt; occa::memory o_Ud, o_Ue, o_resU, o_rhsUd; - dfloat *cU, *cUd; - occa::memory o_cU, o_cUd; - dfloat *Wrk; occa::memory o_Wrk; @@ -134,16 +130,15 @@ typedef struct { occa::kernel constrainKernel; occa::memory o_U, o_P; - occa::memory o_rhsU, o_rhsV, o_rhsW, o_rhsP; + occa::memory o_rhsU, o_rhsP; - occa::memory o_NU, o_LU, o_GP, o_NC; - occa::memory o_GU; + occa::memory o_NU, o_GP, o_NC; occa::memory o_FU; - occa::memory o_UH, o_VH, o_WH; - occa::memory o_rkU, o_rkP, o_PI; - occa::memory o_rkNU, o_rkLU, o_rkGP; + occa::memory o_UH; + occa::memory o_PI; + occa::memory o_rkNU; occa::memory o_Vort, o_Div; diff --git a/src/insSetup.cpp b/src/insSetup.cpp index c25e9a1ff..034ccf0a3 100644 --- a/src/insSetup.cpp +++ b/src/insSetup.cpp @@ -66,42 +66,29 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options) // compute samples of q at interpolation nodes ins->U = (dfloat*) calloc(ins->NVfields*ins->Nstages*Ntotal,sizeof(dfloat)); - ins->P = (dfloat*) calloc( ins->Nstages*Ntotal,sizeof(dfloat)); + ins->P = (dfloat*) calloc(Ntotal,sizeof(dfloat)); //rhs storage - ins->rhsU = (dfloat*) calloc(Ntotal,sizeof(dfloat)); - ins->rhsV = (dfloat*) calloc(Ntotal,sizeof(dfloat)); - ins->rhsW = (dfloat*) calloc(Ntotal,sizeof(dfloat)); + ins->rhsU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); ins->rhsP = (dfloat*) calloc(Ntotal,sizeof(dfloat)); //additional field storage ins->NU = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat)); - ins->LU = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat)); - ins->GP = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat)); - ins->GU = (dfloat*) calloc(ins->NVfields*Ntotal*4,sizeof(dfloat)); - - ins->rkU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); - ins->rkP = (dfloat*) calloc( Ntotal,sizeof(dfloat)); ins->PI = (dfloat*) calloc( Ntotal,sizeof(dfloat)); ins->rkNU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); - ins->rkLU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); - ins->rkGP = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); ins->FU = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat)); - //extra storage for interpolated fields - ins->cU = (dfloat *) calloc(ins->NVfields*mesh->Nelements*mesh->cubNp,sizeof(dfloat)); + ins->Ue = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); options.getArgs("SUBCYCLING STEPS",ins->Nsubsteps); if(ins->Nsubsteps){ ins->Ud = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); - ins->Ue = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); ins->resU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); ins->rhsUd = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat)); - ins->cUd = (dfloat*) calloc(ins->NVfields*mesh->Nelements*mesh->cubNp,sizeof(dfloat)); // Prepare RK stages for Subcycling Part @@ -188,6 +175,12 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options) else meshOccaSetup2D(mesh, options, kernelInfo); + // free what is not required + mesh->o_cubsgeo.free(); + mesh->o_cubggeo.free(); + mesh->o_cubsgeo = (void *) NULL; + mesh->o_cubggeo = (void *) NULL; + occa::properties kernelInfoV = kernelInfo; occa::properties kernelInfoP = kernelInfo; occa::properties kernelInfoS = kernelInfo; @@ -239,8 +232,9 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options) options.getArgs("DATA FILE", boundaryHeaderFileName); kernelInfo["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL); - ins->o_U = mesh->device.malloc(ins->NVfields*ins->Nstages*Ntotal*sizeof(dfloat), ins->U); - ins->o_P = mesh->device.malloc( ins->Nstages*Ntotal*sizeof(dfloat), ins->P); + ins->o_U = mesh->device.malloc(ins->NVfields*ins->Nstages*Ntotal*sizeof(dfloat), ins->U); + ins->o_Ue = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ue); + ins->o_P = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->P); /* if (mesh->rank==0 && options.compareArgs("VERBOSE","TRUE")) @@ -258,11 +252,9 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options) if(ins->Nsubsteps){ // Note that resU and resV can be replaced with already introduced buffer - ins->o_Ue = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ue); ins->o_Ud = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ud); ins->o_resU = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->resU); ins->o_rhsUd = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rhsUd); - ins->o_cUd = mesh->device.malloc(ins->NVfields*mesh->Nelements*mesh->cubNp*sizeof(dfloat), ins->cUd); } dfloat rkC[4] = {1.0, 0.0, -1.0, -2.0}; @@ -282,28 +274,18 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options) ins->o_Wrk = mesh->device.malloc(1*sizeof(dfloat), ins->Wrk); // MEMORY ALLOCATION - ins->o_rhsU = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsU); - ins->o_rhsV = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsV); - ins->o_rhsW = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsW); + ins->o_rhsU = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rhsU); ins->o_rhsP = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsP); //storage for helmholtz solves - ins->o_UH = mesh->device.malloc(Ntotal*sizeof(dfloat)); - ins->o_VH = mesh->device.malloc(Ntotal*sizeof(dfloat)); - ins->o_WH = mesh->device.malloc(Ntotal*sizeof(dfloat)); + ins->o_UH = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat)); ins->o_FU = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->FU); ins->o_NU = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->NU); - ins->o_GP = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->GP); - ins->o_rkGP = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rkGP); - ins->o_NC = ins->o_GP; // Use GP storage to store curl(curl(u)) history + ins->o_NC = ins->o_UH; - ins->o_rkU = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rkU); - ins->o_rkP = mesh->device.malloc( Ntotal*sizeof(dfloat), ins->rkP); ins->o_PI = mesh->device.malloc( Ntotal*sizeof(dfloat), ins->PI); - ins->o_cU = mesh->device.malloc(ins->NVfields*mesh->Nelements*mesh->cubNp*sizeof(dfloat), ins->cU); - if(ins->options.compareArgs("FILTER STABILIZATION", "RELAXATION")) filterSetup(ins); @@ -744,8 +726,6 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH) // Use Nsubsteps if INS does to prevent stability issues cds->Nsubsteps = ins->Nsubsteps; - cds->Ue = (dfloat*) calloc(cds->NVfields*Ntotal,sizeof(dfloat)); - if(cds->Nsubsteps){ // This memory can be reduced, check later......!!!!!!! cds->Sd = (dfloat*) calloc(cds->NSfields*Ntotal,sizeof(dfloat)); @@ -768,7 +748,9 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH) kernelInfo["defines/" "p_NTSfields"]= (cds->NVfields+cds->NSfields + 1); kernelInfo["defines/" "p_diff"] = cds->diff; - cds->o_U = ins->o_U; + cds->o_U = ins->o_U; + cds->o_Ue = ins->o_Ue; + cds->o_S = mesh->device.malloc(cds->NSfields*(cds->Nstages+0)*Ntotal*sizeof(dfloat), cds->S); string suffix, fileName, kernelName; @@ -973,8 +955,6 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH) kernelName = "cdsInvMassMatrix" + suffix; cds->invMassMatrixKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo); - cds->o_Ue = mesh->device.malloc(cds->NVfields*Ntotal*sizeof(dfloat), cds->Ue); - if(cds->Nsubsteps){ // Note that resU and resV can be replaced with already introduced buffer cds->o_Sd = mesh->device.malloc(cds->NSfields*Ntotal*sizeof(dfloat), cds->Sd); diff --git a/src/okl/insAdvectionHex3D.okl b/src/okl/insAdvectionHex3D.okl index fe90e3b1b..78b0b73a4 100644 --- a/src/okl/insAdvectionHex3D.okl +++ b/src/okl/insAdvectionHex3D.okl @@ -123,7 +123,6 @@ } } -// just for temporary fix, needed to be revised.... AK..... @kernel void insStrongAdvectionCubatureVolumeHex3D(const dlong Nelements, @restrict const dfloat * vgeo, @restrict const dfloat * cubvgeo, @@ -132,7 +131,6 @@ @restrict const dfloat * cubProjectT, const dlong offset, @restrict const dfloat * U, - @restrict dfloat * cU, //storage for interpolated fields @restrict dfloat * NU){ // (phi, U.grad Ud) diff --git a/src/okl/insCurlHex3D.okl b/src/okl/insCurlHex3D.okl index 557565ee9..6e35ec0c0 100644 --- a/src/okl/insCurlHex3D.okl +++ b/src/okl/insCurlHex3D.okl @@ -31,9 +31,7 @@ SOFTWARE. @restrict const dfloat * const M, const dlong offset, @restrict const dfloat * U, - @restrict dfloat * Wx, - @restrict dfloat * Wy, - @restrict dfloat * Wz){ + @restrict dfloat * W){ for(dlong e=0;esubCycleExtKernel(mesh->Nelements, + ins->ExplicitOrder, + ins->fieldOffset, + ins->o_extbdfC, + ins->o_U, + ins->o_Ue); + if(ins->Nscalar) scalarSolve(ins, time, ins->dt, cds->o_S); velocitySolve(ins, time, ins->dt, ins->o_U); @@ -141,19 +151,10 @@ void makeq(ins_t *ins, dfloat time, occa::memory o_NS, occa::memory o_FS){ cds_t *cds = ins->cds; mesh_t *mesh = ins->mesh; - if(cds->Nsubsteps) { - cdsStrongSubCycle(cds, time, cds->Nstages, ins->o_U, cds->o_S, o_NS); - } else { - // First extrapolate velocity to t^(n+1) - cds->subCycleExtKernel(mesh->Nelements, - cds->ExplicitOrder, - cds->vOffset, - cds->o_extbdfC, - ins->o_U, - cds->o_Ue); - + if(cds->Nsubsteps) + scalarStrongSubCycle(cds, time, cds->Nstages, cds->o_U, cds->o_S, o_NS); + else cdsAdvection(cds, time, cds->o_Ue, cds->o_S, o_NS); - } ins->setScalarKernel(cds->Ntotal*cds->NSfields, 0.0, o_FS); @@ -207,7 +208,7 @@ void makef(ins_t *ins, dfloat time, occa::memory o_NU, occa::memory o_FU) mesh_t *mesh = ins->mesh; if(ins->Nsubsteps) { - subCycle(ins, time, ins->Nstages, ins->o_U, o_NU); + velocityStrongSubCycle(ins, time, ins->Nstages, ins->o_U, o_NU); } else { if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")) ins->advectionStrongCubatureVolumeKernel( @@ -219,7 +220,6 @@ void makef(ins_t *ins, dfloat time, occa::memory o_NU, occa::memory o_FU) mesh->o_cubProjectT, ins->fieldOffset, ins->o_U, - ins->o_cU, o_NU); else ins->advectionStrongVolumeKernel( @@ -247,35 +247,260 @@ void velocitySolve(ins_t *ins, dfloat time, dfloat dt, occa::memory o_U) { makef(ins, time, ins->o_NU, ins->o_FU); - curlCurl(ins, time, o_U, ins->o_NC); + tombo::curlCurl(ins, time, ins->o_Ue, ins->o_NC); + + occa::memory &o_wrk = ins->o_UH; - // o_rkU = sum_j (alpha_j U_j)- sum_j ( beta_j (FU_J + NU_j + NC_j) ) - pressureRhs (ins, time+dt); - pressureSolve(ins, time+dt, ins->o_rkP); - ins->o_P.copyFrom(ins->o_rkP, ins->Ntotal*sizeof(dfloat)); + tombo::pressureRhs (ins, time+dt, ins->o_rhsP); + tombo::pressureSolve(ins, time+dt, ins->o_rhsP, o_wrk); + ins->o_P.copyFrom(o_wrk, ins->Ntotal*sizeof(dfloat)); - // rhsU^s = MM*1/nu*[ -(grad P) + sum_i ( (a_i) U^(n-i)/dt - b_i (NU+FU)^(n-i) )] - velocityRhs (ins, time+dt); - velocitySolve(ins, time+dt, ins->o_rkU); + tombo::velocityRhs (ins, time+dt, ins->o_rhsU); + tombo::velocitySolve(ins, time+dt, ins->o_rhsU, o_wrk); for (int s=ins->Nstages;s>1;s--) o_U.copyFrom(o_U, ins->Ntotal*ins->NVfields*sizeof(dfloat), (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat)); - o_U.copyFrom(ins->o_rkU, ins->NVfields*ins->Ntotal*sizeof(dfloat)); + o_U.copyFrom(o_wrk, ins->NVfields*ins->Ntotal*sizeof(dfloat)); for (int s=ins->Nstages;s>1;s--) { ins->o_NU.copyFrom(ins->o_NU, ins->Ntotal*ins->NVfields*sizeof(dfloat), (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat)); - ins->o_NC.copyFrom(ins->o_NC, ins->Ntotal*ins->NVfields*sizeof(dfloat), - (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), - (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat)); - ins->o_FU.copyFrom(ins->o_FU, ins->Ntotal*ins->NVfields*sizeof(dfloat), (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat)); } } + +void velocityStrongSubCycle(ins_t *ins, dfloat time, int Nstages, occa::memory o_U, occa::memory o_Ud) +{ + mesh_t *mesh = ins->mesh; + + const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs); + + const dfloat tn0 = time - 0*ins->dt; + const dfloat tn1 = time - 1*ins->dt; + const dfloat tn2 = time - 2*ins->dt; + + dfloat zero = 0.0, one = 1.0; + int izero = 0; + + dfloat b, bScale=0; + + // Solve for Each SubProblem + for (int torder=ins->ExplicitOrder-1; torder>=0; torder--){ + + b=ins->extbdfB[torder]; + bScale += b; + + // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) + dlong toffset = torder*ins->NVfields*ins->Ntotal; + + if (torder==ins->ExplicitOrder-1) { //first substep + ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U, zero, izero, o_Ud); + } else { //add the next field + ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U, one, izero, o_Ud); + } + + // SubProblem starts from here from t^(n-torder) + const dfloat tsub = time - torder*ins->dt; + // Advance SubProblem to t^(n-torder+1) + for(int ststep = 0; ststepNsubsteps;++ststep){ + const dfloat tstage = tsub + ststep*ins->sdt; + + for(int rk=0;rkSNrk;++rk){// LSERK4 stages + // Extrapolate velocity to subProblem stage time + dfloat t = tstage + ins->sdt*ins->Srkc[rk]; + + switch(ins->ExplicitOrder){ + case 1: + ins->extC[0] = 1.f; ins->extC[1] = 0.f; ins->extC[2] = 0.f; + break; + case 2: + ins->extC[0] = (t-tn1)/(tn0-tn1); + ins->extC[1] = (t-tn0)/(tn1-tn0); + ins->extC[2] = 0.f; + break; + case 3: + ins->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); + ins->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2)); + ins->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1)); + break; + } + ins->o_extC.copyFrom(ins->extC); + + //compute advective velocity fields at time t + ins->subCycleExtKernel(NtotalElements, + Nstages, + ins->fieldOffset, + ins->o_extC, + o_U, + ins->o_Ue); + + // Compute Volume Contribution + if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")){ + ins->subCycleStrongCubatureVolumeKernel( + mesh->Nelements, + mesh->o_vgeo, + mesh->o_cubvgeo, + mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + ins->o_invLumpedMassMatrix, + ins->fieldOffset, + ins->o_Ue, + o_Ud, + ins->o_rhsUd); + }else{ + ins->subCycleStrongVolumeKernel(mesh->Nelements, + mesh->o_vgeo, + mesh->o_Dmatrices, + ins->fieldOffset, + ins->o_Ue, + o_Ud, + ins->o_rhsUd); + } + + ogsGatherScatterMany(ins->o_rhsUd, ins->NVfields, ins->fieldOffset, + ogsDfloat, ogsAdd, mesh->ogs); + + + int nfield = ins->dim==2 ? 2:3; + ins->invMassMatrixKernel(mesh->Nelements, + ins->fieldOffset, + nfield, + mesh->o_vgeo, + ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet + ins->o_rhsUd); + + ins->subCycleRKUpdateKernel(mesh->Nelements, + ins->sdt, + ins->Srka[rk], + ins->Srkb[rk], + ins->fieldOffset, + ins->o_rhsUd, + ins->o_resU, + o_Ud); + } + } + } +} + +void scalarStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd){ + + mesh_t *mesh = cds->mesh; + const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs); + + const dfloat tn0 = time - 0*cds->dt; + const dfloat tn1 = time - 1*cds->dt; + const dfloat tn2 = time - 2*cds->dt; + + + dfloat zero = 0.0, one = 1.0; + int izero = 0; + dfloat b, bScale=0; + + // Solve for Each SubProblem + for (int torder=(cds->ExplicitOrder-1); torder>=0; torder--){ + + b=cds->extbdfB[torder]; + bScale += b; + + // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) + dlong toffset = torder*cds->NSfields*cds->Ntotal; + + if (torder==cds->ExplicitOrder-1) { //first substep + cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, zero, izero, o_Sd); + } else { //add the next field + cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, one, izero, o_Sd); + } + + // SubProblem starts from here from t^(n-torder) + const dfloat tsub = time - torder*cds->dt; + // Advance SubProblem to t^(n-torder+1) + for(int ststep = 0; ststepNsubsteps;++ststep){ + + const dfloat tstage = tsub + ststep*cds->sdt; + + for(int rk=0;rkSNrk;++rk){// LSERK4 stages + // Extrapolate velocity to subProblem stage time + dfloat t = tstage + cds->sdt*cds->Srkc[rk]; + + switch(cds->ExplicitOrder){ + case 1: + cds->extC[0] = 1.f; cds->extC[1] = 0.f; cds->extC[2] = 0.f; + break; + case 2: + cds->extC[0] = (t-tn1)/(tn0-tn1); + cds->extC[1] = (t-tn0)/(tn1-tn0); + cds->extC[2] = 0.f; + break; + case 3: + cds->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); + cds->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2)); + cds->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1)); + break; + } + cds->o_extC.copyFrom(cds->extC); + + //compute advective velocity fields at time t + cds->subCycleExtKernel(NtotalElements, + Nstages, + cds->vOffset, + cds->o_extC, + o_U, + cds->o_Ue); + + + // Compute Volume Contribution + if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")){ + cds->subCycleStrongCubatureVolumeKernel(mesh->Nelements, + mesh->o_vgeo, + mesh->o_cubvgeo, + mesh->o_cubDiffInterpT, // mesh->o_cubDWmatrices, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + cds->vOffset, + cds->sOffset, + cds->o_Ue, + o_Sd, + cds->o_rhsSd); + } else{ + cds->subCycleStrongVolumeKernel(mesh->Nelements, + mesh->o_vgeo, + mesh->o_Dmatrices, + cds->vOffset, + cds->sOffset, + cds->o_Ue, + o_Sd, + cds->o_rhsSd); + + } + + ogsGatherScatter(cds->o_rhsSd, ogsDfloat, ogsAdd, mesh->ogs); + + // int nfield = ins->dim==2 ? 2:3; + cds->invMassMatrixKernel(mesh->Nelements, + cds->sOffset, + cds->NSfields, + mesh->o_vgeo, + cds->o_InvM, // mesh->o_MM, // should be invMM for tri/tet + cds->o_rhsSd); + + // Update Kernel + cds->subCycleRKUpdateKernel(mesh->Nelements, + cds->sdt, + cds->Srka[rk], + cds->Srkb[rk], + cds->sOffset, + cds->o_rhsSd, + cds->o_resS, + o_Sd); + } + } + } +} diff --git a/src/tombo.cpp b/src/tombo.cpp index d733bd185..d018aff7f 100644 --- a/src/tombo.cpp +++ b/src/tombo.cpp @@ -1,96 +1,14 @@ #include "nekrs.hpp" -// Compute divergence of the velocity field using physical boundary data at t = time. -void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU){ - - mesh_t *mesh = ins->mesh; - - // if (ins->vOptions.compareArgs("DISCRETIZATION","IPDG")) { - if(mesh->totalHaloPairs>0){ - - // make sure compute device is ready to perform halo extract - mesh->device.finish(); - - // switch to data stream - mesh->device.setStream(mesh->dataStream); - - ins->haloGetKernel(mesh->totalHaloPairs, - ins->NVfields, - ins->fieldOffset, - mesh->o_haloElementList, - mesh->o_haloGetNodeIds, - o_U, - ins->o_vHaloBuffer); - - dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs; - - // copy extracted halo to HOST - ins->o_vHaloBuffer.copyTo(ins->vSendBuffer, Ndata*sizeof(dfloat), 0, "async: true");// zero offset - - mesh->device.setStream(mesh->defaultStream); - } - - // computes div u^(n+1) volume term - ins->divergenceVolumeKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_Dmatrices, - ins->fieldOffset, - o_U, - o_DU); - - if(mesh->totalHaloPairs>0){ - - // make sure compute device is ready to perform halo extract - mesh->device.setStream(mesh->dataStream); - mesh->device.finish(); - - // start halo exchange - meshHaloExchangeStart(mesh, - mesh->Nfp*(ins->NVfields)*sizeof(dfloat), - ins->vSendBuffer, - ins->vRecvBuffer); - - meshHaloExchangeFinish(mesh); - - dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs; - - ins->o_vHaloBuffer.copyFrom(ins->vRecvBuffer, Ndata*sizeof(dfloat), 0); // zero offset - - ins->haloPutKernel(mesh->totalHaloPairs, - ins->NVfields, - ins->fieldOffset, - mesh->o_haloElementList, - mesh->o_haloPutNodeIds, - ins->o_vHaloBuffer, - o_U); +void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP); +void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU); - mesh->device.setStream(mesh->defaultStream); - } - - //computes div u^(n+1) surface term - const dfloat lambda = -ins->g0*ins->idt; - ins->divergenceSurfaceKernel( - mesh->Nelements, - mesh->o_vgeo, - mesh->o_sgeo, - mesh->o_LIFTT, - mesh->o_vmapM, - mesh->o_vmapP, - ins->o_EToB, - time, - lambda, - mesh->o_x, - mesh->o_y, - mesh->o_z, - ins->fieldOffset, - ins->o_Wrk, - o_U, - o_DU); -} +namespace tombo { -void pressureRhs(ins_t *ins, dfloat time) +void pressureRhs(ins_t *ins, dfloat time, occa::memory o_rhsP) { mesh_t *mesh = ins->mesh; + occa::memory &o_wrk = ins->o_UH; ins->pressureRhsKernel(mesh->Nelements, mesh->o_vgeo, @@ -105,11 +23,11 @@ void pressureRhs(ins_t *ins, dfloat time) ins->o_NU, ins->o_NC, ins->o_FU, - ins->o_rkU); + o_wrk); // weak divergence term (not that no boundary contribution) // -> (F, grad(phi))_sigma - g_0/dt*(n*U^n+1)_del_sigma - insDivergence(ins, time, ins->o_rkU, ins->o_rhsP); + insDivergence(ins, time, o_wrk, o_rhsP); // Add -(grad P, grad phi) to rhsP const dfloat lambda = 0.0; @@ -120,20 +38,20 @@ void pressureRhs(ins_t *ins, dfloat time) mesh->o_MM, lambda, ins->o_P, - ins->o_rhsP); + o_rhsP); } -void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rkP) +void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rhsP, occa::memory o_rkP) { mesh_t *mesh = ins->mesh; elliptic_t *solver = ins->pSolver; - ogsGatherScatter(ins->o_rhsP, ogsDfloat, ogsAdd, mesh->ogs); - if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_rhsP); + ogsGatherScatter(o_rhsP, ogsDfloat, ogsAdd, mesh->ogs); + if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, o_rhsP); if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_PI); - ins->NiterP = ellipticSolve(solver, 0.0, ins->presTOL, ins->o_rhsP, ins->o_PI); + ins->NiterP = ellipticSolve(solver, 0.0, ins->presTOL, o_rhsP, ins->o_PI); ins->pressureAddBCKernel(mesh->Nelements, time, @@ -158,82 +76,13 @@ void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rkP) o_rkP); } -void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP) -{ - mesh_t *mesh = ins->mesh; - - if(mesh->totalHaloPairs>0){ - - // make sure compute device is ready to perform halo extract - mesh->device.finish(); - - // switch to data stream - mesh->device.setStream(mesh->dataStream); - - int one = 1; - dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs; - - ins->haloGetKernel(mesh->totalHaloPairs, - one, - ins->fieldOffset, - mesh->o_haloElementList, - mesh->o_haloGetNodeIds, - o_P, - ins->o_pHaloBuffer); - - // copy extracted halo to HOST - ins->o_pHaloBuffer.copyTo(ins->pSendBuffer, - Ndata*sizeof(dfloat), 0, "async: true"); - - mesh->device.setStream(mesh->defaultStream); - } - - // Compute Volume Contribution - ins->gradientVolumeKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_Dmatrices, - ins->fieldOffset, - o_P, - o_GP); - - // COMPLETE HALO EXCHANGE - if(mesh->totalHaloPairs>0){ - // make sure compute device is ready to perform halo extract - mesh->device.setStream(mesh->dataStream); - mesh->device.finish(); - - // start halo exchange - meshHaloExchangeStart(mesh, - mesh->Nfp*sizeof(dfloat), - ins->pSendBuffer, - ins->pRecvBuffer); - - meshHaloExchangeFinish(mesh); - - int one = 1; - dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs; - - ins->o_pHaloBuffer.copyFrom(ins->pRecvBuffer, Ndata*sizeof(dfloat), 0); // zero offset - - ins->haloPutKernel(mesh->totalHaloPairs, - one, - ins->fieldOffset, - mesh->o_haloElementList, - mesh->o_haloPutNodeIds, - ins->o_pHaloBuffer, - o_P); - - mesh->device.finish(); - mesh->device.setStream(mesh->defaultStream); - mesh->device.finish(); - } -} - -void velocityRhs(ins_t *ins, dfloat time) +void velocityRhs(ins_t *ins, dfloat time, occa::memory o_rhsU) { mesh_t *mesh = ins->mesh; - insGradient (ins, time, ins->o_P, ins->o_rkGP); + occa::memory &o_wrk = ins->o_UH; + + insGradient (ins, time, ins->o_P, o_wrk); ins->velocityRhsKernel(mesh->Nelements, mesh->o_vgeo, @@ -247,19 +96,27 @@ void velocityRhs(ins_t *ins, dfloat time) ins->o_U, ins->o_NU, ins->o_FU, - ins->o_rkGP, - ins->o_rhsU, - ins->o_rhsV, - ins->o_rhsW); + o_wrk, + o_rhsU); } -void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat) +void velocitySolve(ins_t *ins, dfloat time, occa::memory o_rhsU, occa::memory o_UH) { mesh_t *mesh = ins->mesh; + dlong Ntotal = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np; + elliptic_t *usolver = ins->uSolver; elliptic_t *vsolver = ins->vSolver; elliptic_t *wsolver = ins->wSolver; + occa::memory o_rhsu = o_rhsU.slice(0*ins->fieldOffset*sizeof(dfloat)); + occa::memory o_rhsv = o_rhsU.slice(1*ins->fieldOffset*sizeof(dfloat)); + occa::memory o_rhsw = o_rhsU.slice(2*ins->fieldOffset*sizeof(dfloat)); + + occa::memory o_uh = o_UH.slice(0*ins->fieldOffset*sizeof(dfloat)); + occa::memory o_vh = o_UH.slice(1*ins->fieldOffset*sizeof(dfloat)); + occa::memory o_wh = o_UH.slice(2*ins->fieldOffset*sizeof(dfloat)); + ins->velocityRhsBCKernel(mesh->Nelements, ins->fieldOffset, mesh->o_ggeo, @@ -278,41 +135,29 @@ void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat) ins->o_VmapB, ins->o_Wrk, ins->o_U, - ins->o_rhsU, - ins->o_rhsV, - ins->o_rhsW); + o_rhsU); - // TODO: change me to gs_many - ogsGatherScatter(ins->o_rhsU, ogsDfloat, ogsAdd, mesh->ogs); - ogsGatherScatter(ins->o_rhsV, ogsDfloat, ogsAdd, mesh->ogs); - if (ins->dim==3) - ogsGatherScatter(ins->o_rhsW, ogsDfloat, ogsAdd, mesh->ogs); - - dlong Ntotal = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np; + ogsGatherScatterMany(o_rhsU, ins->NVfields, ins->fieldOffset, + ogsDfloat, ogsAdd, mesh->ogs); // Use old velocity for velocity solver initial condition - // TODO: fure into one kernel - ins->o_UH.copyFrom(ins->o_U,Ntotal*sizeof(dfloat),0,0*ins->fieldOffset*sizeof(dfloat)); - ins->o_VH.copyFrom(ins->o_U,Ntotal*sizeof(dfloat),0,1*ins->fieldOffset*sizeof(dfloat)); - if (ins->dim==3) - ins->o_WH.copyFrom(ins->o_U,Ntotal*sizeof(dfloat),0,2*ins->fieldOffset*sizeof(dfloat)); - - // TODO: fuse into single kernel - if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, ins->o_UH); - if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, ins->o_VH); + o_UH.copyFrom(ins->o_U,ins->NVfields*Ntotal*sizeof(dfloat)); + + // TODO: fuse with rhs into single kernel + if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, o_uh); + if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, o_vh); if (ins->dim==3 && wsolver->Nmasked) - mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, ins->o_WH); + mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, o_wh); - // TODO: fuse into single kernel - if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, ins->o_rhsU); - if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, ins->o_rhsV); + if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, o_rhsu); + if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, o_rhsv); if (ins->dim==3 && wsolver->Nmasked) - mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, ins->o_rhsW); + mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, o_rhsw); - ins->NiterU = ellipticSolve(usolver, ins->lambda, ins->velTOL, ins->o_rhsU, ins->o_UH); - ins->NiterV = ellipticSolve(vsolver, ins->lambda, ins->velTOL, ins->o_rhsV, ins->o_VH); + ins->NiterU = ellipticSolve(usolver, ins->lambda, ins->velTOL, o_rhsu, o_uh); + ins->NiterV = ellipticSolve(vsolver, ins->lambda, ins->velTOL, o_rhsv, o_vh); if (ins->dim==3) - ins->NiterW = ellipticSolve(wsolver, ins->lambda, ins->velTOL, ins->o_rhsW, ins->o_WH); + ins->NiterW = ellipticSolve(wsolver, ins->lambda, ins->velTOL, o_rhsw, o_wh); ins->velocityAddBCKernel(mesh->Nelements, ins->fieldOffset, @@ -325,239 +170,215 @@ void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat) ins->o_VmapB, ins->o_Wrk, ins->o_U, - ins->o_UH, - ins->o_VH, - ins->o_WH); - - //copy into intermediate stage storage - // TODO: fuse into one copyTo - ins->o_UH.copyTo(o_Uhat,Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0); - ins->o_VH.copyTo(o_Uhat,Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0); - if (ins->dim==3) - ins->o_WH.copyTo(o_Uhat,Ntotal*sizeof(dfloat),2*ins->fieldOffset*sizeof(dfloat),0); -} - -void subCycle(ins_t *ins, dfloat time, int Nstages, occa::memory o_U, occa::memory o_Ud) -{ - mesh_t *mesh = ins->mesh; - - const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs); - - const dfloat tn0 = time - 0*ins->dt; - const dfloat tn1 = time - 1*ins->dt; - const dfloat tn2 = time - 2*ins->dt; - - dfloat zero = 0.0, one = 1.0; - int izero = 0; - - dfloat b, bScale=0; - - // Solve for Each SubProblem - for (int torder=ins->ExplicitOrder-1; torder>=0; torder--){ - - b=ins->extbdfB[torder]; - bScale += b; - - // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) - dlong toffset = torder*ins->NVfields*ins->Ntotal; - - if (torder==ins->ExplicitOrder-1) { //first substep - ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U, zero, izero, o_Ud); - } else { //add the next field - ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U, one, izero, o_Ud); - } - - // SubProblem starts from here from t^(n-torder) - const dfloat tsub = time - torder*ins->dt; - // Advance SubProblem to t^(n-torder+1) - for(int ststep = 0; ststepNsubsteps;++ststep){ - const dfloat tstage = tsub + ststep*ins->sdt; - - for(int rk=0;rkSNrk;++rk){// LSERK4 stages - // Extrapolate velocity to subProblem stage time - dfloat t = tstage + ins->sdt*ins->Srkc[rk]; - - switch(ins->ExplicitOrder){ - case 1: - ins->extC[0] = 1.f; ins->extC[1] = 0.f; ins->extC[2] = 0.f; - break; - case 2: - ins->extC[0] = (t-tn1)/(tn0-tn1); - ins->extC[1] = (t-tn0)/(tn1-tn0); - ins->extC[2] = 0.f; - break; - case 3: - ins->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); - ins->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2)); - ins->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1)); - break; - } - ins->o_extC.copyFrom(ins->extC); - - //compute advective velocity fields at time t - ins->subCycleExtKernel(NtotalElements, - Nstages, - ins->fieldOffset, - ins->o_extC, - o_U, - ins->o_Ue); - - // Compute Volume Contribution - if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")){ - ins->subCycleStrongCubatureVolumeKernel( - mesh->Nelements, - mesh->o_vgeo, - mesh->o_cubvgeo, - mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - ins->o_invLumpedMassMatrix, - ins->fieldOffset, - ins->o_Ue, - o_Ud, - ins->o_cU, - ins->o_cUd, - ins->o_rhsUd); - }else{ - ins->subCycleStrongVolumeKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_Dmatrices, - ins->fieldOffset, - ins->o_Ue, - o_Ud, - ins->o_rhsUd); - } - - // TODO: change me to gs_many - for(int k=0;kdim;++k){ - ogsGatherScatter(ins->o_rhsUd+k*ins->fieldOffset*sizeof(dfloat), - ogsDfloat, ogsAdd, mesh->ogs); - } - - int nfield = ins->dim==2 ? 2:3; - ins->invMassMatrixKernel(mesh->Nelements, - ins->fieldOffset, - nfield, - mesh->o_vgeo, - ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet - ins->o_rhsUd); - - ins->subCycleRKUpdateKernel(mesh->Nelements, - ins->sdt, - ins->Srka[rk], - ins->Srkb[rk], - ins->fieldOffset, - ins->o_rhsUd, - ins->o_resU, - o_Ud); - } - } - } + o_UH); } void curlCurl(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_NC) { mesh_t *mesh = ins->mesh; - // TODO: extrapolte velocity + occa::memory &o_wrk = ins->o_UH; - // No that UH, VH and WH is used as the temporary arrays to store curl(u) - // Note that multiplied with Mass Matrix i.e. JW to prepare smoothing !!!!! ins->curlKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_Dmatrices, mesh->o_MM, ins->fieldOffset, o_U, - ins->o_UH, // Wx - ins->o_VH, // Wy - ins->o_WH);// Wz - - // TODO: change me to gs_many - if(ins->dim==3){ - ogsGatherScatter(ins->o_UH, ogsDfloat, ogsAdd, mesh->ogs); - ogsGatherScatter(ins->o_VH, ogsDfloat, ogsAdd, mesh->ogs); - } - ogsGatherScatter(ins->o_WH, ogsDfloat, ogsAdd, mesh->ogs); - - int nfield = 0; - if (ins->dim==3){ - nfield = 3; - // TODO: fuse into single copyTo - ins->o_UH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0); - ins->o_VH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0); - ins->o_WH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),2*ins->fieldOffset*sizeof(dfloat),0); - }else{ - nfield =1; - // if quad or tri copy Wz to first place - ins->o_WH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0); - } - - - // Multiply With Inverse Mass Matrix + o_wrk); + + ogsGatherScatterMany(o_wrk, ins->NVfields, ins->fieldOffset, + ogsDfloat, ogsAdd, mesh->ogs); + ins->invMassMatrixKernel( mesh->Nelements, ins->fieldOffset, - nfield, + ins->NVfields, mesh->o_vgeo, ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet - ins->o_rkU); - + o_wrk); - if(ins->dim==2){ - // Second curl on smoothed curl(u) - ins->curlBKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_Dmatrices, - mesh->o_MM, - ins->fieldOffset, - ins->o_rkU, - ins->o_UH, // Wx - ins->o_VH, // Wy - ins->o_WH);// Wz - }else{ - ins->curlKernel(mesh->Nelements, + ins->curlKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_Dmatrices, mesh->o_MM, ins->fieldOffset, - ins->o_rkU, - ins->o_UH, // Wx - ins->o_VH, // Wy - ins->o_WH);// Wz - } - - // TODO: change me to gs_many - ogsGatherScatter(ins->o_UH, ogsDfloat, ogsAdd, mesh->ogs); - ogsGatherScatter(ins->o_VH, ogsDfloat, ogsAdd, mesh->ogs); - if(ins->dim==3) - ogsGatherScatter(ins->o_WH, ogsDfloat, ogsAdd, mesh->ogs); - - nfield = 0; - if (ins->dim==3){ - nfield = 3; - // TODO: fuse into single copyTo - ins->o_UH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0); - ins->o_VH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0); - ins->o_WH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),2*ins->fieldOffset*sizeof(dfloat),0); - }else{ - nfield =2; - // TODO: fuse into single copyTo - ins->o_UH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0); - ins->o_VH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0); - } + o_wrk, + o_NC); + + ogsGatherScatterMany(o_NC, ins->NVfields, ins->fieldOffset, + ogsDfloat, ogsAdd, mesh->ogs); ins->invMassMatrixKernel( mesh->Nelements, ins->fieldOffset, - nfield, + ins->NVfields, mesh->o_vgeo, ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet - ins->o_rkU); - - ins->o_rkU.copyTo(o_NC,ins->NVfields*ins->Ntotal*sizeof(dfloat), - 0*ins->fieldOffset*sizeof(dfloat),0); + o_NC); } +} // namespace + +void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP) +{ + mesh_t *mesh = ins->mesh; + + if(mesh->totalHaloPairs>0){ + + // make sure compute device is ready to perform halo extract + mesh->device.finish(); + + // switch to data stream + mesh->device.setStream(mesh->dataStream); + + int one = 1; + dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs; + + ins->haloGetKernel(mesh->totalHaloPairs, + one, + ins->fieldOffset, + mesh->o_haloElementList, + mesh->o_haloGetNodeIds, + o_P, + ins->o_pHaloBuffer); + + // copy extracted halo to HOST + ins->o_pHaloBuffer.copyTo(ins->pSendBuffer, + Ndata*sizeof(dfloat), 0, "async: true"); + + mesh->device.setStream(mesh->defaultStream); + } + + // Compute Volume Contribution + ins->gradientVolumeKernel(mesh->Nelements, + mesh->o_vgeo, + mesh->o_Dmatrices, + ins->fieldOffset, + o_P, + o_GP); + + // COMPLETE HALO EXCHANGE + if(mesh->totalHaloPairs>0){ + + // make sure compute device is ready to perform halo extract + mesh->device.setStream(mesh->dataStream); + mesh->device.finish(); + + // start halo exchange + meshHaloExchangeStart(mesh, + mesh->Nfp*sizeof(dfloat), + ins->pSendBuffer, + ins->pRecvBuffer); + + meshHaloExchangeFinish(mesh); + + int one = 1; + dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs; + + ins->o_pHaloBuffer.copyFrom(ins->pRecvBuffer, Ndata*sizeof(dfloat), 0); // zero offset + + ins->haloPutKernel(mesh->totalHaloPairs, + one, + ins->fieldOffset, + mesh->o_haloElementList, + mesh->o_haloPutNodeIds, + ins->o_pHaloBuffer, + o_P); + + mesh->device.finish(); + mesh->device.setStream(mesh->defaultStream); + mesh->device.finish(); + } +} + +// Compute divergence of the velocity field using physical boundary data at t = time. +void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU){ + + mesh_t *mesh = ins->mesh; + + // if (ins->vOptions.compareArgs("DISCRETIZATION","IPDG")) { + if(mesh->totalHaloPairs>0){ + + // make sure compute device is ready to perform halo extract + mesh->device.finish(); + + // switch to data stream + mesh->device.setStream(mesh->dataStream); + + ins->haloGetKernel(mesh->totalHaloPairs, + ins->NVfields, + ins->fieldOffset, + mesh->o_haloElementList, + mesh->o_haloGetNodeIds, + o_U, + ins->o_vHaloBuffer); + + dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs; + + // copy extracted halo to HOST + ins->o_vHaloBuffer.copyTo(ins->vSendBuffer, Ndata*sizeof(dfloat), 0, "async: true");// zero offset + + mesh->device.setStream(mesh->defaultStream); + } + + // computes div u^(n+1) volume term + ins->divergenceVolumeKernel(mesh->Nelements, + mesh->o_vgeo, + mesh->o_Dmatrices, + ins->fieldOffset, + o_U, + o_DU); + + if(mesh->totalHaloPairs>0){ + + // make sure compute device is ready to perform halo extract + mesh->device.setStream(mesh->dataStream); + mesh->device.finish(); + + // start halo exchange + meshHaloExchangeStart(mesh, + mesh->Nfp*(ins->NVfields)*sizeof(dfloat), + ins->vSendBuffer, + ins->vRecvBuffer); + + meshHaloExchangeFinish(mesh); + + dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs; + + ins->o_vHaloBuffer.copyFrom(ins->vRecvBuffer, Ndata*sizeof(dfloat), 0); // zero offset + + ins->haloPutKernel(mesh->totalHaloPairs, + ins->NVfields, + ins->fieldOffset, + mesh->o_haloElementList, + mesh->o_haloPutNodeIds, + ins->o_vHaloBuffer, + o_U); + + mesh->device.setStream(mesh->defaultStream); + } + + //computes div u^(n+1) surface term + const dfloat lambda = -ins->g0*ins->idt; + ins->divergenceSurfaceKernel( + mesh->Nelements, + mesh->o_vgeo, + mesh->o_sgeo, + mesh->o_LIFTT, + mesh->o_vmapM, + mesh->o_vmapP, + ins->o_EToB, + time, + lambda, + mesh->o_x, + mesh->o_y, + mesh->o_z, + ins->fieldOffset, + ins->o_Wrk, + o_U, + o_DU); +} + diff --git a/src/tombo.hpp b/src/tombo.hpp index 2ea3aa857..3c468745c 100644 --- a/src/tombo.hpp +++ b/src/tombo.hpp @@ -2,14 +2,15 @@ #define nekrs_tombo_hpp_ #include "nekrs.hpp" -void pressureRhs(ins_t *ins, dfloat time); -void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rkP); -void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP); -void velocityRhs(ins_t *ins, dfloat time); -void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat); -void subCycle(ins_t *ins, dfloat time, int Nstages, occa::memory o_U, occa::memory o_Ud); -void advection(ins_t *ins, dfloat time); + +namespace tombo { + +void pressureRhs(ins_t *ins, dfloat time, occa::memory o_rhsP); +void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rhsP, occa::memory o_rkP); +void velocityRhs(ins_t *ins, dfloat time, occa::memory o_rhsU); +void velocitySolve(ins_t *ins, dfloat time, occa::memory o_rhsU, occa::memory o_Uhat); void curlCurl(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_NC); -void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU); + +} #endif