Skip to content

Commit

Permalink
various changes to save memory (#61)
Browse files Browse the repository at this point in the history
  • Loading branch information
stgeke authored Oct 14, 2019
1 parent 3e6997a commit a8816f9
Show file tree
Hide file tree
Showing 14 changed files with 577 additions and 687 deletions.
114 changes: 0 additions & 114 deletions src/cds.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,118 +121,4 @@ void cdsAdvection(cds_t *cds, dfloat time, occa::memory o_U, occa::memory o_S, o

}

void cdsStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd){

mesh_t *mesh = cds->mesh;
const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs);

const dfloat tn0 = time - 0*cds->dt;
const dfloat tn1 = time - 1*cds->dt;
const dfloat tn2 = time - 2*cds->dt;


dfloat zero = 0.0, one = 1.0;
int izero = 0;
dfloat b, bScale=0;

// Solve for Each SubProblem
for (int torder=(cds->ExplicitOrder-1); torder>=0; torder--){

b=cds->extbdfB[torder];
bScale += b;

// Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
dlong toffset = torder*cds->NSfields*cds->Ntotal;

if (torder==cds->ExplicitOrder-1) { //first substep
cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, zero, izero, o_Sd);
} else { //add the next field
cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, one, izero, o_Sd);
}

// SubProblem starts from here from t^(n-torder)
const dfloat tsub = time - torder*cds->dt;
// Advance SubProblem to t^(n-torder+1)
for(int ststep = 0; ststep<cds->Nsubsteps;++ststep){

const dfloat tstage = tsub + ststep*cds->sdt;

for(int rk=0;rk<cds->SNrk;++rk){// LSERK4 stages
// Extrapolate velocity to subProblem stage time
dfloat t = tstage + cds->sdt*cds->Srkc[rk];

switch(cds->ExplicitOrder){
case 1:
cds->extC[0] = 1.f; cds->extC[1] = 0.f; cds->extC[2] = 0.f;
break;
case 2:
cds->extC[0] = (t-tn1)/(tn0-tn1);
cds->extC[1] = (t-tn0)/(tn1-tn0);
cds->extC[2] = 0.f;
break;
case 3:
cds->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2));
cds->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2));
cds->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1));
break;
}
cds->o_extC.copyFrom(cds->extC);

//compute advective velocity fields at time t
cds->subCycleExtKernel(NtotalElements,
Nstages,
cds->vOffset,
cds->o_extC,
o_U,
cds->o_Ue);


// Compute Volume Contribution
if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")){
cds->subCycleStrongCubatureVolumeKernel(mesh->Nelements,
mesh->o_vgeo,
mesh->o_cubvgeo,
mesh->o_cubDiffInterpT, // mesh->o_cubDWmatrices,
mesh->o_cubInterpT,
mesh->o_cubProjectT,
cds->vOffset,
cds->sOffset,
cds->o_Ue,
o_Sd,
cds->o_rhsSd);
} else{
cds->subCycleStrongVolumeKernel(mesh->Nelements,
mesh->o_vgeo,
mesh->o_Dmatrices,
cds->vOffset,
cds->sOffset,
cds->o_Ue,
o_Sd,
cds->o_rhsSd);

}

ogsGatherScatter(cds->o_rhsSd, ogsDfloat, ogsAdd, mesh->ogs);

// int nfield = ins->dim==2 ? 2:3;
cds->invMassMatrixKernel(mesh->Nelements,
cds->sOffset,
cds->NSfields,
mesh->o_vgeo,
cds->o_InvM, // mesh->o_MM, // should be invMM for tri/tet
cds->o_rhsSd);

// Update Kernel
cds->subCycleRKUpdateKernel(mesh->Nelements,
cds->sdt,
cds->Srka[rk],
cds->Srkb[rk],
cds->sOffset,
cds->o_rhsSd,
cds->o_resS,
o_Sd);
}
}
}

}
1 change: 0 additions & 1 deletion src/cds.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ typedef struct {
}cds_t;

void cdsAdvection(cds_t *cds, dfloat time, occa::memory o_U, occa::memory o_S, occa::memory o_NS);
void cdsStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd);

void cdsHelmholtzRhs(cds_t *cds, dfloat time, int stage, occa::memory o_rhsS);
void cdsHelmholtzSolve(cds_t *cds, dfloat time, int stage, occa::memory o_rhsS,occa::memory o_rkS);
Expand Down
19 changes: 11 additions & 8 deletions src/cfl.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#include "nekrs.hpp"

void getDh(ins_t *ins){
static int firstTime = 1;
static dfloat *tmp;
static occa::memory o_tmp;

void setup(ins_t *ins){

mesh_t *mesh = ins->mesh;

Expand All @@ -23,16 +27,18 @@ void getDh(ins_t *ins){
free(dH);
}

ins->computedDh = 1;
tmp = (dfloat *) calloc(ins->Nblock, sizeof(dfloat));
o_tmp = mesh->device.malloc(ins->Nblock*sizeof(dfloat), tmp);

firstTime = 0;

}

dfloat computeCFL(ins_t *ins, dfloat time, int tstep){

mesh_t *mesh = ins->mesh;
// create dH i.e. nodal spacing in reference coordinates
if(!ins->computedDh)
getDh(ins);
if(firstTime) setup(ins);

// Compute cfl factors i.e. dt* U / h
ins->cflKernel(mesh->Nelements,
ins->dt,
Expand All @@ -44,8 +50,6 @@ dfloat computeCFL(ins_t *ins, dfloat time, int tstep){

// find the local maximum of CFL number
const dlong Ntotal = mesh->Np*mesh->Nelements;
dfloat *tmp = (dfloat *) calloc(ins->Nblock, sizeof(dfloat));
occa::memory o_tmp = mesh->device.malloc(ins->Nblock*sizeof(dfloat), tmp);
ins->maxKernel(Ntotal, ins->o_rhsU, o_tmp);
o_tmp.copyTo(tmp);

Expand All @@ -58,7 +62,6 @@ dfloat computeCFL(ins_t *ins, dfloat time, int tstep){
dfloat gcfl = 0.f;
MPI_Allreduce(&cfl, &gcfl, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);

free(tmp);
return gcfl;
}

23 changes: 9 additions & 14 deletions src/ins.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,10 @@ typedef struct {
dfloat idt, inu; // hold some inverses

dfloat *U, *P;
dfloat *NU, *LU, *GP;
dfloat *GU;
dfloat *rhsU, *rhsV, *rhsW, *rhsP;
dfloat *rkU, *rkP, *PI;
dfloat *rkNU, *rkLU, *rkGP;
dfloat *NU, *GP;
dfloat *rhsU, *rhsP;
dfloat *PI;
dfloat *rkNU;

dfloat *FU; // Additional source terms for explicit contribution

Expand Down Expand Up @@ -101,9 +100,6 @@ typedef struct {
dfloat *Ud, *Ue, *resU, *rhsUd, sdt;
occa::memory o_Ud, o_Ue, o_resU, o_rhsUd;

dfloat *cU, *cUd;
occa::memory o_cU, o_cUd;

dfloat *Wrk;
occa::memory o_Wrk;

Expand Down Expand Up @@ -134,16 +130,15 @@ typedef struct {
occa::kernel constrainKernel;

occa::memory o_U, o_P;
occa::memory o_rhsU, o_rhsV, o_rhsW, o_rhsP;
occa::memory o_rhsU, o_rhsP;

occa::memory o_NU, o_LU, o_GP, o_NC;
occa::memory o_GU;
occa::memory o_NU, o_GP, o_NC;

occa::memory o_FU;

occa::memory o_UH, o_VH, o_WH;
occa::memory o_rkU, o_rkP, o_PI;
occa::memory o_rkNU, o_rkLU, o_rkGP;
occa::memory o_UH;
occa::memory o_PI;
occa::memory o_rkNU;

occa::memory o_Vort, o_Div;

Expand Down
56 changes: 18 additions & 38 deletions src/insSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,42 +66,29 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)

// compute samples of q at interpolation nodes
ins->U = (dfloat*) calloc(ins->NVfields*ins->Nstages*Ntotal,sizeof(dfloat));
ins->P = (dfloat*) calloc( ins->Nstages*Ntotal,sizeof(dfloat));
ins->P = (dfloat*) calloc(Ntotal,sizeof(dfloat));

//rhs storage
ins->rhsU = (dfloat*) calloc(Ntotal,sizeof(dfloat));
ins->rhsV = (dfloat*) calloc(Ntotal,sizeof(dfloat));
ins->rhsW = (dfloat*) calloc(Ntotal,sizeof(dfloat));
ins->rhsU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->rhsP = (dfloat*) calloc(Ntotal,sizeof(dfloat));

//additional field storage
ins->NU = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));
ins->LU = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));
ins->GP = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));

ins->GU = (dfloat*) calloc(ins->NVfields*Ntotal*4,sizeof(dfloat));

ins->rkU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->rkP = (dfloat*) calloc( Ntotal,sizeof(dfloat));
ins->PI = (dfloat*) calloc( Ntotal,sizeof(dfloat));

ins->rkNU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->rkLU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->rkGP = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));

ins->FU = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));

//extra storage for interpolated fields
ins->cU = (dfloat *) calloc(ins->NVfields*mesh->Nelements*mesh->cubNp,sizeof(dfloat));
ins->Ue = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));

options.getArgs("SUBCYCLING STEPS",ins->Nsubsteps);

if(ins->Nsubsteps){
ins->Ud = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->Ue = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->resU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->rhsUd = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
ins->cUd = (dfloat*) calloc(ins->NVfields*mesh->Nelements*mesh->cubNp,sizeof(dfloat));

// Prepare RK stages for Subcycling Part

Expand Down Expand Up @@ -188,6 +175,12 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
else
meshOccaSetup2D(mesh, options, kernelInfo);

// free what is not required
mesh->o_cubsgeo.free();
mesh->o_cubggeo.free();
mesh->o_cubsgeo = (void *) NULL;
mesh->o_cubggeo = (void *) NULL;

occa::properties kernelInfoV = kernelInfo;
occa::properties kernelInfoP = kernelInfo;
occa::properties kernelInfoS = kernelInfo;
Expand Down Expand Up @@ -239,8 +232,9 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
options.getArgs("DATA FILE", boundaryHeaderFileName);
kernelInfo["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL);

ins->o_U = mesh->device.malloc(ins->NVfields*ins->Nstages*Ntotal*sizeof(dfloat), ins->U);
ins->o_P = mesh->device.malloc( ins->Nstages*Ntotal*sizeof(dfloat), ins->P);
ins->o_U = mesh->device.malloc(ins->NVfields*ins->Nstages*Ntotal*sizeof(dfloat), ins->U);
ins->o_Ue = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ue);
ins->o_P = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->P);

/*
if (mesh->rank==0 && options.compareArgs("VERBOSE","TRUE"))
Expand All @@ -258,11 +252,9 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)

if(ins->Nsubsteps){
// Note that resU and resV can be replaced with already introduced buffer
ins->o_Ue = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ue);
ins->o_Ud = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ud);
ins->o_resU = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->resU);
ins->o_rhsUd = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rhsUd);
ins->o_cUd = mesh->device.malloc(ins->NVfields*mesh->Nelements*mesh->cubNp*sizeof(dfloat), ins->cUd);
}

dfloat rkC[4] = {1.0, 0.0, -1.0, -2.0};
Expand All @@ -282,28 +274,18 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
ins->o_Wrk = mesh->device.malloc(1*sizeof(dfloat), ins->Wrk);

// MEMORY ALLOCATION
ins->o_rhsU = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsU);
ins->o_rhsV = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsV);
ins->o_rhsW = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsW);
ins->o_rhsU = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rhsU);
ins->o_rhsP = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsP);
//storage for helmholtz solves
ins->o_UH = mesh->device.malloc(Ntotal*sizeof(dfloat));
ins->o_VH = mesh->device.malloc(Ntotal*sizeof(dfloat));
ins->o_WH = mesh->device.malloc(Ntotal*sizeof(dfloat));
ins->o_UH = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat));

ins->o_FU = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->FU);
ins->o_NU = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->NU);
ins->o_GP = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->GP);
ins->o_rkGP = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rkGP);

ins->o_NC = ins->o_GP; // Use GP storage to store curl(curl(u)) history
ins->o_NC = ins->o_UH;

ins->o_rkU = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rkU);
ins->o_rkP = mesh->device.malloc( Ntotal*sizeof(dfloat), ins->rkP);
ins->o_PI = mesh->device.malloc( Ntotal*sizeof(dfloat), ins->PI);

ins->o_cU = mesh->device.malloc(ins->NVfields*mesh->Nelements*mesh->cubNp*sizeof(dfloat), ins->cU);

if(ins->options.compareArgs("FILTER STABILIZATION", "RELAXATION"))
filterSetup(ins);

Expand Down Expand Up @@ -744,8 +726,6 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH)
// Use Nsubsteps if INS does to prevent stability issues
cds->Nsubsteps = ins->Nsubsteps;

cds->Ue = (dfloat*) calloc(cds->NVfields*Ntotal,sizeof(dfloat));

if(cds->Nsubsteps){
// This memory can be reduced, check later......!!!!!!!
cds->Sd = (dfloat*) calloc(cds->NSfields*Ntotal,sizeof(dfloat));
Expand All @@ -768,7 +748,9 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH)
kernelInfo["defines/" "p_NTSfields"]= (cds->NVfields+cds->NSfields + 1);
kernelInfo["defines/" "p_diff"] = cds->diff;

cds->o_U = ins->o_U;
cds->o_U = ins->o_U;
cds->o_Ue = ins->o_Ue;

cds->o_S = mesh->device.malloc(cds->NSfields*(cds->Nstages+0)*Ntotal*sizeof(dfloat), cds->S);

string suffix, fileName, kernelName;
Expand Down Expand Up @@ -973,8 +955,6 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH)
kernelName = "cdsInvMassMatrix" + suffix;
cds->invMassMatrixKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);

cds->o_Ue = mesh->device.malloc(cds->NVfields*Ntotal*sizeof(dfloat), cds->Ue);

if(cds->Nsubsteps){
// Note that resU and resV can be replaced with already introduced buffer
cds->o_Sd = mesh->device.malloc(cds->NSfields*Ntotal*sizeof(dfloat), cds->Sd);
Expand Down
2 changes: 0 additions & 2 deletions src/okl/insAdvectionHex3D.okl
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@
}
}

// just for temporary fix, needed to be revised.... AK.....
@kernel void insStrongAdvectionCubatureVolumeHex3D(const dlong Nelements,
@restrict const dfloat * vgeo,
@restrict const dfloat * cubvgeo,
Expand All @@ -132,7 +131,6 @@
@restrict const dfloat * cubProjectT,
const dlong offset,
@restrict const dfloat * U,
@restrict dfloat * cU, //storage for interpolated fields
@restrict dfloat * NU){

// (phi, U.grad Ud)
Expand Down
Loading

0 comments on commit a8816f9

Please sign in to comment.