diff --git a/src/cds.cpp b/src/cds.cpp
index 47f87bf9e..a2e64c21a 100644
--- a/src/cds.cpp
+++ b/src/cds.cpp
@@ -121,118 +121,4 @@ void cdsAdvection(cds_t *cds, dfloat time, occa::memory o_U, occa::memory o_S, o
 
 }
 
-void cdsStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd){
 
-  mesh_t *mesh = cds->mesh;
-  const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs);  
-
-  const dfloat tn0 = time - 0*cds->dt;
-  const dfloat tn1 = time - 1*cds->dt;
-  const dfloat tn2 = time - 2*cds->dt;
-
-
-  dfloat zero = 0.0, one = 1.0;
-  int izero = 0;
-  dfloat b, bScale=0;
-
-   // Solve for Each SubProblem
-  for (int torder=(cds->ExplicitOrder-1); torder>=0; torder--){
-    
-    b=cds->extbdfB[torder];
-    bScale += b; 
-
-    // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
-    dlong toffset = torder*cds->NSfields*cds->Ntotal;
-
-    if (torder==cds->ExplicitOrder-1) { //first substep
-      cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, zero, izero, o_Sd);
-    } else { //add the next field
-      cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S,  one, izero, o_Sd);
-    }     
-
-    // SubProblem  starts from here from t^(n-torder)
-    const dfloat tsub = time - torder*cds->dt;
-    // Advance SubProblem to t^(n-torder+1) 
-    for(int ststep = 0; ststep<cds->Nsubsteps;++ststep){
-     
-      const dfloat tstage = tsub + ststep*cds->sdt;     
-     
-      for(int rk=0;rk<cds->SNrk;++rk){// LSERK4 stages
-        // Extrapolate velocity to subProblem stage time
-        dfloat t = tstage +  cds->sdt*cds->Srkc[rk]; 
-
-        switch(cds->ExplicitOrder){
-          case 1:
-            cds->extC[0] = 1.f; cds->extC[1] = 0.f; cds->extC[2] = 0.f;
-            break;
-          case 2:
-            cds->extC[0] = (t-tn1)/(tn0-tn1);
-            cds->extC[1] = (t-tn0)/(tn1-tn0);
-            cds->extC[2] = 0.f; 
-            break;
-          case 3:
-            cds->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); 
-            cds->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2));
-            cds->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1));
-            break;
-        }
-        cds->o_extC.copyFrom(cds->extC);
-
-        //compute advective velocity fields at time t
-        cds->subCycleExtKernel(NtotalElements,
-                               Nstages,
-                               cds->vOffset,
-                               cds->o_extC,
-                               o_U,
-                               cds->o_Ue);
-
-     
-        // Compute Volume Contribution
-        if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")){
-          cds->subCycleStrongCubatureVolumeKernel(mesh->Nelements,
-                                            mesh->o_vgeo,
-                                            mesh->o_cubvgeo,
-                                            mesh->o_cubDiffInterpT, // mesh->o_cubDWmatrices,
-                                            mesh->o_cubInterpT,
-                                            mesh->o_cubProjectT,
-                                            cds->vOffset,
-                                            cds->sOffset,             
-                                            cds->o_Ue,
-                                                 o_Sd,
-                                            cds->o_rhsSd);
-        } else{
-          cds->subCycleStrongVolumeKernel(mesh->Nelements,
-                                    mesh->o_vgeo,
-                                    mesh->o_Dmatrices,
-                                    cds->vOffset,
-                                    cds->sOffset,           
-                                    cds->o_Ue,
-                                    o_Sd,
-                                    cds->o_rhsSd);
-
-        }
-
-        ogsGatherScatter(cds->o_rhsSd, ogsDfloat, ogsAdd, mesh->ogs);
-
-        // int nfield = ins->dim==2 ? 2:3; 
-        cds->invMassMatrixKernel(mesh->Nelements,
-                                 cds->sOffset,
-                                 cds->NSfields,
-                                 mesh->o_vgeo,
-                                 cds->o_InvM, // mesh->o_MM, // should be invMM for tri/tet
-                                 cds->o_rhsSd);
-
-        // Update Kernel
-        cds->subCycleRKUpdateKernel(mesh->Nelements,
-                                    cds->sdt,
-                                    cds->Srka[rk],
-                                    cds->Srkb[rk],
-                                    cds->sOffset,
-                                    cds->o_rhsSd,
-                                    cds->o_resS, 
-                                         o_Sd);
-      }
-    }
-  }
-
-}
diff --git a/src/cds.h b/src/cds.h
index 9c7660afd..3b37ddbe0 100644
--- a/src/cds.h
+++ b/src/cds.h
@@ -186,7 +186,6 @@ typedef struct {
 }cds_t;
 
 void cdsAdvection(cds_t *cds, dfloat time, occa::memory o_U, occa::memory o_S, occa::memory o_NS);
-void cdsStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd);
 
 void cdsHelmholtzRhs(cds_t *cds, dfloat time, int stage, occa::memory o_rhsS);
 void cdsHelmholtzSolve(cds_t *cds, dfloat time, int stage, occa::memory o_rhsS,occa::memory o_rkS);
diff --git a/src/cfl.cpp b/src/cfl.cpp
index 180053c36..99756743d 100644
--- a/src/cfl.cpp
+++ b/src/cfl.cpp
@@ -1,6 +1,10 @@
 #include "nekrs.hpp"
 
-void getDh(ins_t *ins){
+static int firstTime = 1;
+static dfloat *tmp;
+static occa::memory o_tmp;
+
+void setup(ins_t *ins){
 
   mesh_t *mesh = ins->mesh; 
   
@@ -23,16 +27,18 @@ void getDh(ins_t *ins){
     free(dH); 
   }
 
-  ins->computedDh = 1; 
+  tmp = (dfloat *) calloc(ins->Nblock, sizeof(dfloat));
+  o_tmp = mesh->device.malloc(ins->Nblock*sizeof(dfloat), tmp);
+
+  firstTime = 0; 
 
 }
 
 dfloat computeCFL(ins_t *ins, dfloat time, int tstep){
 
   mesh_t *mesh = ins->mesh; 
-  // create dH i.e. nodal spacing in reference coordinates
-  if(!ins->computedDh)
-    getDh(ins);
+  if(firstTime) setup(ins);
+
   // Compute cfl factors i.e. dt* U / h 
   ins->cflKernel(mesh->Nelements,
                  ins->dt, 
@@ -44,8 +50,6 @@ dfloat computeCFL(ins_t *ins, dfloat time, int tstep){
   
   // find the local maximum of CFL number
   const dlong Ntotal = mesh->Np*mesh->Nelements; 
-  dfloat *tmp        = (dfloat *) calloc(ins->Nblock, sizeof(dfloat));
-  occa::memory o_tmp = mesh->device.malloc(ins->Nblock*sizeof(dfloat), tmp);
   ins->maxKernel(Ntotal, ins->o_rhsU, o_tmp);
   o_tmp.copyTo(tmp);
   
@@ -58,7 +62,6 @@ dfloat computeCFL(ins_t *ins, dfloat time, int tstep){
   dfloat gcfl = 0.f;
   MPI_Allreduce(&cfl, &gcfl, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
   
-  free(tmp);
   return gcfl; 
 }
 
diff --git a/src/ins.h b/src/ins.h
index 6a503a276..4115fab9c 100644
--- a/src/ins.h
+++ b/src/ins.h
@@ -55,11 +55,10 @@ typedef struct {
   dfloat idt, inu; // hold some inverses
   
   dfloat *U, *P;
-  dfloat *NU, *LU, *GP;
-  dfloat *GU;   
-  dfloat *rhsU, *rhsV, *rhsW, *rhsP;   
-  dfloat *rkU, *rkP, *PI;
-  dfloat *rkNU, *rkLU, *rkGP;
+  dfloat *NU, *GP;
+  dfloat *rhsU, *rhsP;   
+  dfloat *PI;
+  dfloat *rkNU;
 
   dfloat *FU; // Additional source terms for explicit contribution
   
@@ -101,9 +100,6 @@ typedef struct {
   dfloat *Ud, *Ue, *resU, *rhsUd, sdt;
   occa::memory o_Ud, o_Ue, o_resU, o_rhsUd;
 
-  dfloat *cU, *cUd;
-  occa::memory o_cU, o_cUd;
-
   dfloat *Wrk;
   occa::memory o_Wrk; 
 
@@ -134,16 +130,15 @@ typedef struct {
   occa::kernel constrainKernel;
   
   occa::memory o_U, o_P;
-  occa::memory o_rhsU, o_rhsV, o_rhsW, o_rhsP; 
+  occa::memory o_rhsU, o_rhsP; 
 
-  occa::memory o_NU, o_LU, o_GP, o_NC;
-  occa::memory o_GU;
+  occa::memory o_NU, o_GP, o_NC;
 
   occa::memory o_FU; 
 
-  occa::memory o_UH, o_VH, o_WH;
-  occa::memory o_rkU, o_rkP, o_PI;
-  occa::memory o_rkNU, o_rkLU, o_rkGP;
+  occa::memory o_UH;
+  occa::memory o_PI;
+  occa::memory o_rkNU;
 
   occa::memory o_Vort, o_Div;
 
diff --git a/src/insSetup.cpp b/src/insSetup.cpp
index c25e9a1ff..034ccf0a3 100644
--- a/src/insSetup.cpp
+++ b/src/insSetup.cpp
@@ -66,42 +66,29 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
 
   // compute samples of q at interpolation nodes
   ins->U     = (dfloat*) calloc(ins->NVfields*ins->Nstages*Ntotal,sizeof(dfloat));
-  ins->P     = (dfloat*) calloc(              ins->Nstages*Ntotal,sizeof(dfloat));
+  ins->P     = (dfloat*) calloc(Ntotal,sizeof(dfloat));
 
   //rhs storage
-  ins->rhsU  = (dfloat*) calloc(Ntotal,sizeof(dfloat));
-  ins->rhsV  = (dfloat*) calloc(Ntotal,sizeof(dfloat));
-  ins->rhsW  = (dfloat*) calloc(Ntotal,sizeof(dfloat));
+  ins->rhsU  = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
   ins->rhsP  = (dfloat*) calloc(Ntotal,sizeof(dfloat));
 
   //additional field storage
   ins->NU   = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));
-  ins->LU   = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));
-  ins->GP   = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));
 
-  ins->GU   = (dfloat*) calloc(ins->NVfields*Ntotal*4,sizeof(dfloat));
-  
-  ins->rkU  = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
-  ins->rkP  = (dfloat*) calloc(              Ntotal,sizeof(dfloat));
   ins->PI   = (dfloat*) calloc(              Ntotal,sizeof(dfloat));
   
   ins->rkNU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
-  ins->rkLU = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
-  ins->rkGP = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
 
   ins->FU   = (dfloat*) calloc(ins->NVfields*(ins->Nstages+1)*Ntotal,sizeof(dfloat));
 
-  //extra storage for interpolated fields
-  ins->cU = (dfloat *) calloc(ins->NVfields*mesh->Nelements*mesh->cubNp,sizeof(dfloat));
+  ins->Ue   = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
 
   options.getArgs("SUBCYCLING STEPS",ins->Nsubsteps);
 
   if(ins->Nsubsteps){
     ins->Ud    = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
-    ins->Ue    = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
     ins->resU  = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
     ins->rhsUd = (dfloat*) calloc(ins->NVfields*Ntotal,sizeof(dfloat));
-    ins->cUd   = (dfloat*) calloc(ins->NVfields*mesh->Nelements*mesh->cubNp,sizeof(dfloat));
 
     // Prepare RK stages for Subcycling Part
    
@@ -188,6 +175,12 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
   else
     meshOccaSetup2D(mesh, options, kernelInfo);
 
+  // free what is not required
+  mesh->o_cubsgeo.free();
+  mesh->o_cubggeo.free();
+  mesh->o_cubsgeo = (void *) NULL;
+  mesh->o_cubggeo = (void *) NULL;
+
   occa::properties kernelInfoV  = kernelInfo;
   occa::properties kernelInfoP  = kernelInfo;
   occa::properties kernelInfoS  = kernelInfo;
@@ -239,8 +232,9 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
   options.getArgs("DATA FILE", boundaryHeaderFileName);
   kernelInfo["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL);
 
-  ins->o_U = mesh->device.malloc(ins->NVfields*ins->Nstages*Ntotal*sizeof(dfloat), ins->U);
-  ins->o_P = mesh->device.malloc(              ins->Nstages*Ntotal*sizeof(dfloat), ins->P);
+  ins->o_U  = mesh->device.malloc(ins->NVfields*ins->Nstages*Ntotal*sizeof(dfloat), ins->U);
+  ins->o_Ue = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ue);
+  ins->o_P  = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->P);
 
 /*
   if (mesh->rank==0 && options.compareArgs("VERBOSE","TRUE"))
@@ -258,11 +252,9 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
 
   if(ins->Nsubsteps){
     // Note that resU and resV can be replaced with already introduced buffer
-    ins->o_Ue    = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ue);
     ins->o_Ud    = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->Ud);
     ins->o_resU  = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->resU);
     ins->o_rhsUd = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rhsUd);
-    ins->o_cUd = mesh->device.malloc(ins->NVfields*mesh->Nelements*mesh->cubNp*sizeof(dfloat), ins->cUd);
   }
 
   dfloat rkC[4] = {1.0, 0.0, -1.0, -2.0};
@@ -282,28 +274,18 @@ ins_t *insSetup(mesh_t *mesh, setupAide &options)
   ins->o_Wrk   = mesh->device.malloc(1*sizeof(dfloat), ins->Wrk);
  
   // MEMORY ALLOCATION
-  ins->o_rhsU  = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsU);
-  ins->o_rhsV  = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsV);
-  ins->o_rhsW  = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsW);
+  ins->o_rhsU  = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rhsU);
   ins->o_rhsP  = mesh->device.malloc(Ntotal*sizeof(dfloat), ins->rhsP);
   //storage for helmholtz solves
-  ins->o_UH = mesh->device.malloc(Ntotal*sizeof(dfloat));
-  ins->o_VH = mesh->device.malloc(Ntotal*sizeof(dfloat));
-  ins->o_WH = mesh->device.malloc(Ntotal*sizeof(dfloat));
+  ins->o_UH    = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat));
 
   ins->o_FU    = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->FU);
   ins->o_NU    = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->NU);
-  ins->o_GP    = mesh->device.malloc(ins->NVfields*(ins->Nstages+1)*Ntotal*sizeof(dfloat), ins->GP);
-  ins->o_rkGP  = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rkGP);
 
-  ins->o_NC = ins->o_GP; // Use GP storage to store curl(curl(u)) history
+  ins->o_NC    = ins->o_UH;
 
-  ins->o_rkU   = mesh->device.malloc(ins->NVfields*Ntotal*sizeof(dfloat), ins->rkU);
-  ins->o_rkP   = mesh->device.malloc(              Ntotal*sizeof(dfloat), ins->rkP);
   ins->o_PI    = mesh->device.malloc(              Ntotal*sizeof(dfloat), ins->PI);
 
-  ins->o_cU = mesh->device.malloc(ins->NVfields*mesh->Nelements*mesh->cubNp*sizeof(dfloat), ins->cU);
-
   if(ins->options.compareArgs("FILTER STABILIZATION", "RELAXATION"))
     filterSetup(ins); 
 
@@ -744,8 +726,6 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH)
   // Use Nsubsteps if INS does to prevent stability issues
   cds->Nsubsteps = ins->Nsubsteps; 
 
-  cds->Ue      = (dfloat*) calloc(cds->NVfields*Ntotal,sizeof(dfloat));
-  
   if(cds->Nsubsteps){
     // This memory can be reduced, check later......!!!!!!!
     cds->Sd      = (dfloat*) calloc(cds->NSfields*Ntotal,sizeof(dfloat));
@@ -768,7 +748,9 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH)
   kernelInfo["defines/" "p_NTSfields"]= (cds->NVfields+cds->NSfields + 1);
   kernelInfo["defines/" "p_diff"]      = cds->diff;
  
-  cds->o_U = ins->o_U;
+  cds->o_U  = ins->o_U;
+  cds->o_Ue = ins->o_Ue;
+
   cds->o_S = mesh->device.malloc(cds->NSfields*(cds->Nstages+0)*Ntotal*sizeof(dfloat), cds->S);
 
   string suffix, fileName, kernelName;
@@ -973,8 +955,6 @@ cds_t *cdsSetup(ins_t *ins, setupAide &options, occa::properties &kernelInfoH)
       kernelName = "cdsInvMassMatrix" + suffix;
       cds->invMassMatrixKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);  
 
-      cds->o_Ue     = mesh->device.malloc(cds->NVfields*Ntotal*sizeof(dfloat), cds->Ue);
-      
       if(cds->Nsubsteps){
         // Note that resU and resV can be replaced with already introduced buffer
         cds->o_Sd     = mesh->device.malloc(cds->NSfields*Ntotal*sizeof(dfloat), cds->Sd);
diff --git a/src/okl/insAdvectionHex3D.okl b/src/okl/insAdvectionHex3D.okl
index fe90e3b1b..78b0b73a4 100644
--- a/src/okl/insAdvectionHex3D.okl
+++ b/src/okl/insAdvectionHex3D.okl
@@ -123,7 +123,6 @@
   }
 }
 
-// just for temporary fix, needed to be revised.... AK.....
 @kernel void insStrongAdvectionCubatureVolumeHex3D(const dlong Nelements,
 						   @restrict const  dfloat *  vgeo,
 						   @restrict const  dfloat *  cubvgeo,
@@ -132,7 +131,6 @@
 						   @restrict const  dfloat *  cubProjectT,
 						   const dlong offset,
 						   @restrict const  dfloat *  U,
-						   @restrict dfloat *  cU, //storage for interpolated fields
 						   @restrict dfloat *  NU){
   
   // (phi, U.grad Ud)
diff --git a/src/okl/insCurlHex3D.okl b/src/okl/insCurlHex3D.okl
index 557565ee9..6e35ec0c0 100644
--- a/src/okl/insCurlHex3D.okl
+++ b/src/okl/insCurlHex3D.okl
@@ -31,9 +31,7 @@ SOFTWARE.
                            @restrict const  dfloat *  const M,
                            const dlong offset,
                            @restrict const  dfloat *  U,
-                           @restrict dfloat *  Wx,
-                           @restrict dfloat *  Wy,
-                           @restrict dfloat *  Wz){  
+                           @restrict dfloat *  W){
 
 for(dlong e=0;e<Nelements;e++;@outer(0)){  // for all elements
 
@@ -120,12 +118,12 @@ for(dlong e=0;e<Nelements;e++;@outer(0)){  // for all elements
           
           const dlong id = e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;
 
-          Wx[id+0*offset] = vortx;
-          Wy[id+0*offset] = vorty;
-          Wz[id+0*offset] = vortz;
+          W[id+0*offset] = vortx;
+          W[id+1*offset] = vorty;
+          W[id+2*offset] = vortz;
 
         }
       }
     }
   }  
-}
\ No newline at end of file
+}
diff --git a/src/okl/insPressureRhsHex3D.okl b/src/okl/insPressureRhsHex3D.okl
index 9309385e4..94ddcae2e 100644
--- a/src/okl/insPressureRhsHex3D.okl
+++ b/src/okl/insPressureRhsHex3D.okl
@@ -38,7 +38,7 @@
                                 @restrict const  dfloat *  NU,
                                 @restrict const  dfloat *  NC,
                                 @restrict const  dfloat *  FU,
-                                      @restrict dfloat *  rhsU){
+                                @restrict dfloat *  rhsU){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){
 
@@ -58,54 +58,45 @@
               dfloat rhsWn;
 
               if (p_SUBCYCLING) {
-                 // Characteristic Velocity
-                 rhsUn = idt*NU[id+0*fieldOffset]; 
-                 rhsVn = idt*NU[id+1*fieldOffset]; 
-                 rhsWn = idt*NU[id+2*fieldOffset]; 
+                rhsUn = idt*NU[id+0*fieldOffset] - nu*NC[id+0*fieldOffset];
+                rhsVn = idt*NU[id+1*fieldOffset] - nu*NC[id+1*fieldOffset]; 
+                rhsWn = idt*NU[id+2*fieldOffset] - nu*NC[id+2*fieldOffset]; 
 
                 for (int s=0;s<p_Nstages;s++) {
-                  // Curl Curl
-                  const dfloat NCx  = NC[id+0*fieldOffset+s*p_NVfields*fieldOffset];
-                  const dfloat NCy  = NC[id+1*fieldOffset+s*p_NVfields*fieldOffset];               
-                  const dfloat NCz  = NC[id+2*fieldOffset+s*p_NVfields*fieldOffset];               
                   // FU
                   const dfloat FUx  = FU[id+0*fieldOffset+s*p_NVfields*fieldOffset];
                   const dfloat FUy  = FU[id+1*fieldOffset+s*p_NVfields*fieldOffset];
                   const dfloat FUz  = FU[id+2*fieldOffset+s*p_NVfields*fieldOffset];
 
-                  rhsUn += extbdfA[s]*(FUx - nu*NCx);
-                  rhsVn += extbdfA[s]*(FUy - nu*NCy);
-                  rhsWn += extbdfA[s]*(FUz - nu*NCz);
-		           }
+                  rhsUn += extbdfA[s]*FUx;
+                  rhsVn += extbdfA[s]*FUy;
+                  rhsWn += extbdfA[s]*FUz;
+		}
       	      } else {
-            		rhsUn = 0.f;
-            		rhsVn = 0.f;
-            		rhsWn = 0.f;
-
-            		for (int s=0;s<p_Nstages;s++) {
-            		  //U 
-            		  const dfloat Um   = U[id+0*fieldOffset+s*p_NVfields*fieldOffset];
-            		  const dfloat Vm   = U[id+1*fieldOffset+s*p_NVfields*fieldOffset];
-            		  const dfloat Wm   = U[id+2*fieldOffset+s*p_NVfields*fieldOffset];
-            		  // NU
-            		  const dfloat NUm  = NU[id+0*fieldOffset+s*p_NVfields*fieldOffset];
-            		  const dfloat NVm  = NU[id+1*fieldOffset+s*p_NVfields*fieldOffset];
-            		  const dfloat NWm  = NU[id+2*fieldOffset+s*p_NVfields*fieldOffset];
-            		  // GP
-            		  const dfloat NCx  = NC[id+0*fieldOffset+s*p_NVfields*fieldOffset];
-            		  const dfloat NCy  = NC[id+1*fieldOffset+s*p_NVfields*fieldOffset];               
-            		  const dfloat NCz  = NC[id+2*fieldOffset+s*p_NVfields*fieldOffset];               
-            		  // FU
-            		  const dfloat FUx  = FU[id+0*fieldOffset+s*p_NVfields*fieldOffset];
-            		  const dfloat FUy  = FU[id+1*fieldOffset+s*p_NVfields*fieldOffset];
-            		  const dfloat FUz  = FU[id+2*fieldOffset+s*p_NVfields*fieldOffset];
-
-            		  rhsUn += idt*extbdfB[s]*Um - extbdfA[s]*(NUm - FUx + nu*NCx);
-            		  rhsVn += idt*extbdfB[s]*Vm - extbdfA[s]*(NVm - FUy + nu*NCy);
-            		  rhsWn += idt*extbdfB[s]*Wm - extbdfA[s]*(NWm - FUz + nu*NCz);
+                        rhsUn = - nu*NC[id+0*fieldOffset];
+                        rhsVn = - nu*NC[id+1*fieldOffset]; 
+                        rhsWn = - nu*NC[id+2*fieldOffset]; 
+
+           		for (int s=0;s<p_Nstages;s++) {
+           		  //U 
+           		  const dfloat Um   = U[id+0*fieldOffset+s*p_NVfields*fieldOffset];
+           		  const dfloat Vm   = U[id+1*fieldOffset+s*p_NVfields*fieldOffset];
+           		  const dfloat Wm   = U[id+2*fieldOffset+s*p_NVfields*fieldOffset];
+           		  // NU
+           		  const dfloat NUm  = NU[id+0*fieldOffset+s*p_NVfields*fieldOffset];
+           		  const dfloat NVm  = NU[id+1*fieldOffset+s*p_NVfields*fieldOffset];
+           		  const dfloat NWm  = NU[id+2*fieldOffset+s*p_NVfields*fieldOffset];
+           		  // FU
+           		  const dfloat FUx  = FU[id+0*fieldOffset+s*p_NVfields*fieldOffset];
+           		  const dfloat FUy  = FU[id+1*fieldOffset+s*p_NVfields*fieldOffset];
+           		  const dfloat FUz  = FU[id+2*fieldOffset+s*p_NVfields*fieldOffset];
+
+           		  rhsUn += idt*extbdfB[s]*Um - extbdfA[s]*(NUm - FUx);
+           		  rhsVn += idt*extbdfB[s]*Vm - extbdfA[s]*(NVm - FUy);
+           		  rhsWn += idt*extbdfB[s]*Wm - extbdfA[s]*(NWm - FUz);
       		        } 
-	             }
-               //
+	      }
+              //
       	      rhsU[id + 0*fieldOffset] = rhsUn;
       	      rhsU[id + 1*fieldOffset] = rhsVn;
       	      rhsU[id + 2*fieldOffset] = rhsWn;
diff --git a/src/okl/insSubCycleHex3D.okl b/src/okl/insSubCycleHex3D.okl
index 58507e71e..b800a2d63 100644
--- a/src/okl/insSubCycleHex3D.okl
+++ b/src/okl/insSubCycleHex3D.okl
@@ -24,7 +24,6 @@
 
 */
 
-// Trying to find difference between enstrophy generation in libP and Nek5K
 @kernel void insSubCycleStrongCubatureVolumeHex3D(const dlong Nelements,
 					       @restrict const  dfloat *  vgeo,
 					       @restrict const  dfloat *  cubvgeo,
@@ -35,8 +34,6 @@
 					       const dlong offset,
 					       @restrict const  dfloat *  U,
 					       @restrict const  dfloat *  Ud,
-					       @restrict dfloat *  cU, //storage for interpolated fields
-					       @restrict dfloat *  cUd, //storage for interpolated fields
 					       @restrict dfloat *  NU){
   
   // (phi, U.grad Ud)
@@ -415,4 +412,4 @@
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/src/okl/insVelocityBCHex3D.okl b/src/okl/insVelocityBCHex3D.okl
index 4d9a833b4..5e132f8f0 100644
--- a/src/okl/insVelocityBCHex3D.okl
+++ b/src/okl/insVelocityBCHex3D.okl
@@ -63,7 +63,7 @@
 
 //RHS contributions for continuous solver
 @kernel void insVelocityBCHex3D(const dlong Nelements,
-        const dlong offset, 
+                                const dlong offset, 
 				@restrict const  dfloat *  ggeo,
 				@restrict const  dfloat *  sgeo,
 				@restrict const  dfloat *  D,
@@ -78,11 +78,9 @@
 				@restrict const  dfloat *  y,
 				@restrict const  dfloat *  z,
 				@restrict const  int    *  mapB,
-        @restrict const  dfloat *  W,
-        @restrict const  dfloat *  U,
-				@restrict dfloat *  rhsU,
-				@restrict dfloat *  rhsV,
-				@restrict dfloat *  rhsW){
+                                @restrict const  dfloat *  W,
+                                @restrict const  dfloat *  U,
+				@restrict dfloat *  rhsU){
 
 
   for(dlong e=0; e<Nelements; ++e; @outer(0)){
@@ -430,9 +428,9 @@
 #pragma unroll p_Nq
 	for(int k = 0; k < p_Nq; k++){
 	  const dlong id = e*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
-	  rhsU[id] -= r_rhsU[k];
-	  rhsV[id] -= r_rhsV[k];
-	  rhsW[id] -= r_rhsW[k];
+	  rhsU[id + 0*offset] -= r_rhsU[k];
+	  rhsU[id + 1*offset] -= r_rhsV[k];
+	  rhsU[id + 2*offset] -= r_rhsW[k];
 	}
       }
     }
@@ -453,9 +451,7 @@
                                    @restrict const  int    *  mapB,
                                    @restrict const  dfloat *  W,
                                    @restrict const  dfloat *  U,
-				   @restrict dfloat *  UH,
-				   @restrict dfloat *  VH,
-				   @restrict dfloat *  WH){
+				   @restrict dfloat *  UH){
 
   for(dlong e=0;e<Nelements;e++;@outer(0)){
     for(int f=0; f<p_Nfaces; f++){
@@ -482,7 +478,9 @@
           bc.vM = U[bc.idM + 1*offset]; 
           bc.wM = U[bc.idM + 2*offset]; 
    
-          bc.uP = UH[bc.idM]; bc.vP = VH[bc.idM]; bc.wP = WH[bc.idM]; 
+          bc.uP = UH[bc.idM + 0*offset]; 
+          bc.vP = UH[bc.idM + 1*offset];
+          bc.wP = UH[bc.idM + 2*offset]; 
           bc.pP = 0.f;  bc.wrk = (dfloat *) W;
           bc.fieldOffset = offset; 
 	  
@@ -497,9 +495,9 @@
 	  }else if(bc.id==6){
 	    bc.wP = 0.f; // vP = vM ; uP = uM;           
 	  }
-	  UH[bc.idM] = bc.uP;
-	  VH[bc.idM] = bc.vP;
-	  WH[bc.idM] = bc.wP;
+	  UH[bc.idM + 0*offset] = bc.uP;
+	  UH[bc.idM + 1*offset] = bc.vP;
+	  UH[bc.idM + 2*offset] = bc.wP;
 	}    
       }
     }
diff --git a/src/okl/insVelocityRhsHex3D.okl b/src/okl/insVelocityRhsHex3D.okl
index 4b6127cc4..10a626ba4 100644
--- a/src/okl/insVelocityRhsHex3D.okl
+++ b/src/okl/insVelocityRhsHex3D.okl
@@ -38,9 +38,7 @@ SOFTWARE.
                                       @restrict const  dfloat *  NU,
                                       @restrict const  dfloat *  FU,
                                       @restrict const  dfloat *  GP,
-                                            @restrict dfloat *  rhsU,
-                                            @restrict dfloat *  rhsV,
-                                            @restrict dfloat *  rhsW){
+                                      @restrict dfloat *  rhsU){
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
@@ -104,9 +102,9 @@ SOFTWARE.
                 }
               }
 
-              rhsU[id] = rhsUn;
-              rhsV[id] = rhsVn;
-              rhsW[id] = rhsWn;
+              rhsU[id + 0*fieldOffset] = rhsUn;
+              rhsU[id + 1*fieldOffset] = rhsVn;
+              rhsU[id + 2*fieldOffset] = rhsWn;
             }
           }
         }
diff --git a/src/runTime.cpp b/src/runTime.cpp
index 62d09758d..b7547fc19 100644
--- a/src/runTime.cpp
+++ b/src/runTime.cpp
@@ -13,6 +13,8 @@ void makeq(ins_t *ins, dfloat time, occa::memory o_NS, occa::memory o_FS);
 void scalarSolve(ins_t *ins, dfloat time, dfloat dt, occa::memory o_S);
 void makef(ins_t *ins, dfloat time, occa::memory o_NU, occa::memory o_FU);
 void velocitySolve(ins_t *ins, dfloat time, dfloat dt, occa::memory o_U);
+void velocityStrongSubCycle(ins_t *ins, dfloat time, int Nstages, occa::memory o_U, occa::memory o_Ud);
+void scalarStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd);
 
 double etime0 = 0;
 
@@ -30,6 +32,14 @@ void runStep(ins_t *ins, dfloat time, int tstep)
     extbdfCoefficents(ins,tstep);
   }
 
+  // First extrapolate velocity to t^(n+1)
+  ins->subCycleExtKernel(mesh->Nelements,
+                         ins->ExplicitOrder,
+                         ins->fieldOffset,
+                         ins->o_extbdfC,
+                         ins->o_U,
+                         ins->o_Ue);
+
   if(ins->Nscalar) scalarSolve(ins, time, ins->dt, cds->o_S); 
   velocitySolve(ins, time, ins->dt, ins->o_U);
 
@@ -141,19 +151,10 @@ void makeq(ins_t *ins, dfloat time, occa::memory o_NS, occa::memory o_FS){
   cds_t *cds   = ins->cds; 
   mesh_t *mesh = ins->mesh;
 
-  if(cds->Nsubsteps) {
-    cdsStrongSubCycle(cds, time, cds->Nstages, ins->o_U, cds->o_S, o_NS);
-  } else {
-    // First extrapolate velocity to t^(n+1)
-     cds->subCycleExtKernel(mesh->Nelements,
-                            cds->ExplicitOrder,
-                            cds->vOffset,
-                            cds->o_extbdfC,
-                            ins->o_U,
-                            cds->o_Ue);
-     
+  if(cds->Nsubsteps)
+    scalarStrongSubCycle(cds, time, cds->Nstages, cds->o_U, cds->o_S, o_NS);
+  else
     cdsAdvection(cds, time, cds->o_Ue, cds->o_S, o_NS);
-  }
 
   ins->setScalarKernel(cds->Ntotal*cds->NSfields, 0.0, o_FS);
   
@@ -207,7 +208,7 @@ void makef(ins_t *ins, dfloat time, occa::memory o_NU, occa::memory o_FU)
   mesh_t *mesh = ins->mesh;
 
   if(ins->Nsubsteps) {
-    subCycle(ins, time, ins->Nstages, ins->o_U, o_NU);
+    velocityStrongSubCycle(ins, time, ins->Nstages, ins->o_U, o_NU);
   } else {
     if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
        ins->advectionStrongCubatureVolumeKernel(
@@ -219,7 +220,6 @@ void makef(ins_t *ins, dfloat time, occa::memory o_NU, occa::memory o_FU)
          mesh->o_cubProjectT,
          ins->fieldOffset,
          ins->o_U,
-         ins->o_cU,
          o_NU);
      else
        ins->advectionStrongVolumeKernel(
@@ -247,35 +247,260 @@ void velocitySolve(ins_t *ins, dfloat time, dfloat dt, occa::memory o_U)
 {
   makef(ins, time, ins->o_NU, ins->o_FU);
 
-  curlCurl(ins, time, o_U, ins->o_NC);
+  tombo::curlCurl(ins, time, ins->o_Ue, ins->o_NC);
+
+  occa::memory &o_wrk = ins->o_UH;
 
-  // o_rkU = sum_j (alpha_j U_j)- sum_j ( beta_j (FU_J + NU_j + NC_j) )
-  pressureRhs  (ins, time+dt);
-  pressureSolve(ins, time+dt, ins->o_rkP); 
-  ins->o_P.copyFrom(ins->o_rkP, ins->Ntotal*sizeof(dfloat)); 
+  tombo::pressureRhs  (ins, time+dt, ins->o_rhsP);
+  tombo::pressureSolve(ins, time+dt, ins->o_rhsP, o_wrk); 
+  ins->o_P.copyFrom(o_wrk, ins->Ntotal*sizeof(dfloat)); 
 
-  // rhsU^s = MM*1/nu*[ -(grad P) + sum_i ( (a_i) U^(n-i)/dt - b_i (NU+FU)^(n-i) )]
-  velocityRhs  (ins, time+dt);
-  velocitySolve(ins, time+dt, ins->o_rkU);
+  tombo::velocityRhs  (ins, time+dt, ins->o_rhsU);
+  tombo::velocitySolve(ins, time+dt, ins->o_rhsU, o_wrk);
 
   for (int s=ins->Nstages;s>1;s--)
     o_U.copyFrom(o_U, ins->Ntotal*ins->NVfields*sizeof(dfloat), 
      (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), 
      (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat));
 
-  o_U.copyFrom(ins->o_rkU, ins->NVfields*ins->Ntotal*sizeof(dfloat)); 
+  o_U.copyFrom(o_wrk, ins->NVfields*ins->Ntotal*sizeof(dfloat)); 
 
   for (int s=ins->Nstages;s>1;s--) {
     ins->o_NU.copyFrom(ins->o_NU, ins->Ntotal*ins->NVfields*sizeof(dfloat), 
      (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), 
      (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat));
     
-    ins->o_NC.copyFrom(ins->o_NC, ins->Ntotal*ins->NVfields*sizeof(dfloat), 
-     (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), 
-     (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat));
-    
     ins->o_FU.copyFrom(ins->o_FU, ins->Ntotal*ins->NVfields*sizeof(dfloat), 
      (s-1)*ins->Ntotal*ins->NVfields*sizeof(dfloat), 
      (s-2)*ins->Ntotal*ins->NVfields*sizeof(dfloat));
   }
 }
+
+void velocityStrongSubCycle(ins_t *ins, dfloat time, int Nstages, occa::memory o_U, occa::memory o_Ud)
+{
+  mesh_t *mesh = ins->mesh;
+
+  const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs);  
+
+  const dfloat tn0 = time - 0*ins->dt;
+  const dfloat tn1 = time - 1*ins->dt;
+  const dfloat tn2 = time - 2*ins->dt;
+
+  dfloat zero = 0.0, one = 1.0;
+  int izero = 0;
+
+  dfloat b, bScale=0;
+
+  // Solve for Each SubProblem
+  for (int torder=ins->ExplicitOrder-1; torder>=0; torder--){
+    
+    b=ins->extbdfB[torder];
+    bScale += b;
+
+    // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
+    dlong toffset = torder*ins->NVfields*ins->Ntotal;
+
+    if (torder==ins->ExplicitOrder-1) { //first substep
+      ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U, zero, izero, o_Ud);
+    } else { //add the next field
+      ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U,  one, izero, o_Ud);
+    }     
+
+    // SubProblem  starts from here from t^(n-torder)
+    const dfloat tsub = time - torder*ins->dt;
+    // Advance SubProblem to t^(n-torder+1) 
+    for(int ststep = 0; ststep<ins->Nsubsteps;++ststep){
+      const dfloat tstage = tsub + ststep*ins->sdt;     
+
+      for(int rk=0;rk<ins->SNrk;++rk){// LSERK4 stages
+        // Extrapolate velocity to subProblem stage time
+        dfloat t = tstage +  ins->sdt*ins->Srkc[rk]; 
+
+        switch(ins->ExplicitOrder){
+	case 1:
+	  ins->extC[0] = 1.f; ins->extC[1] = 0.f; ins->extC[2] = 0.f;
+	  break;
+	case 2:
+	  ins->extC[0] = (t-tn1)/(tn0-tn1);
+	  ins->extC[1] = (t-tn0)/(tn1-tn0);
+	  ins->extC[2] = 0.f; 
+	  break;
+	case 3:
+	  ins->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); 
+	  ins->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2));
+	  ins->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1));
+	  break;
+        }
+        ins->o_extC.copyFrom(ins->extC);
+
+        //compute advective velocity fields at time t
+        ins->subCycleExtKernel(NtotalElements,
+                               Nstages,
+                               ins->fieldOffset,
+                               ins->o_extC,
+                               o_U,
+                               ins->o_Ue);
+
+        // Compute Volume Contribution
+	if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")){        
+	  ins->subCycleStrongCubatureVolumeKernel(
+            mesh->Nelements,
+            mesh->o_vgeo,
+            mesh->o_cubvgeo,
+            mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices,
+            mesh->o_cubInterpT,
+            mesh->o_cubProjectT,
+            ins->o_invLumpedMassMatrix,
+            ins->fieldOffset,
+            ins->o_Ue,
+            o_Ud,
+            ins->o_rhsUd);
+	}else{
+	  ins->subCycleStrongVolumeKernel(mesh->Nelements,
+	    mesh->o_vgeo,
+	    mesh->o_Dmatrices,
+	    ins->fieldOffset,
+	    ins->o_Ue,
+	    o_Ud,
+	    ins->o_rhsUd);
+	}
+	
+        ogsGatherScatterMany(ins->o_rhsUd, ins->NVfields, ins->fieldOffset,
+                             ogsDfloat, ogsAdd, mesh->ogs);
+  
+
+        int nfield = ins->dim==2 ? 2:3; 
+        ins->invMassMatrixKernel(mesh->Nelements,
+				 ins->fieldOffset,
+				 nfield,
+				 mesh->o_vgeo,
+				 ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet
+				 ins->o_rhsUd);
+
+        ins->subCycleRKUpdateKernel(mesh->Nelements,
+				    ins->sdt,
+				    ins->Srka[rk],
+				    ins->Srkb[rk],
+				    ins->fieldOffset,
+				    ins->o_rhsUd,
+				    ins->o_resU, 
+				    o_Ud);
+      }
+    }
+  }
+}
+
+void scalarStrongSubCycle(cds_t *cds, dfloat time, int Nstages, occa::memory o_U, occa::memory o_S, occa::memory o_Sd){
+
+  mesh_t *mesh = cds->mesh;
+  const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs);  
+
+  const dfloat tn0 = time - 0*cds->dt;
+  const dfloat tn1 = time - 1*cds->dt;
+  const dfloat tn2 = time - 2*cds->dt;
+
+
+  dfloat zero = 0.0, one = 1.0;
+  int izero = 0;
+  dfloat b, bScale=0;
+
+   // Solve for Each SubProblem
+  for (int torder=(cds->ExplicitOrder-1); torder>=0; torder--){
+    
+    b=cds->extbdfB[torder];
+    bScale += b; 
+
+    // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
+    dlong toffset = torder*cds->NSfields*cds->Ntotal;
+
+    if (torder==cds->ExplicitOrder-1) { //first substep
+      cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S, zero, izero, o_Sd);
+    } else { //add the next field
+      cds->scaledAddKernel(cds->NSfields*cds->Ntotal, b, toffset, o_S,  one, izero, o_Sd);
+    }     
+
+    // SubProblem  starts from here from t^(n-torder)
+    const dfloat tsub = time - torder*cds->dt;
+    // Advance SubProblem to t^(n-torder+1) 
+    for(int ststep = 0; ststep<cds->Nsubsteps;++ststep){
+     
+      const dfloat tstage = tsub + ststep*cds->sdt;     
+     
+      for(int rk=0;rk<cds->SNrk;++rk){// LSERK4 stages
+        // Extrapolate velocity to subProblem stage time
+        dfloat t = tstage +  cds->sdt*cds->Srkc[rk]; 
+
+        switch(cds->ExplicitOrder){
+          case 1:
+            cds->extC[0] = 1.f; cds->extC[1] = 0.f; cds->extC[2] = 0.f;
+            break;
+          case 2:
+            cds->extC[0] = (t-tn1)/(tn0-tn1);
+            cds->extC[1] = (t-tn0)/(tn1-tn0);
+            cds->extC[2] = 0.f; 
+            break;
+          case 3:
+            cds->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); 
+            cds->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2));
+            cds->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1));
+            break;
+        }
+        cds->o_extC.copyFrom(cds->extC);
+
+        //compute advective velocity fields at time t
+        cds->subCycleExtKernel(NtotalElements,
+                               Nstages,
+                               cds->vOffset,
+                               cds->o_extC,
+                               o_U,
+                               cds->o_Ue);
+
+     
+        // Compute Volume Contribution
+        if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE")){
+          cds->subCycleStrongCubatureVolumeKernel(mesh->Nelements,
+                                            mesh->o_vgeo,
+                                            mesh->o_cubvgeo,
+                                            mesh->o_cubDiffInterpT, // mesh->o_cubDWmatrices,
+                                            mesh->o_cubInterpT,
+                                            mesh->o_cubProjectT,
+                                            cds->vOffset,
+                                            cds->sOffset,             
+                                            cds->o_Ue,
+                                                 o_Sd,
+                                            cds->o_rhsSd);
+        } else{
+          cds->subCycleStrongVolumeKernel(mesh->Nelements,
+                                    mesh->o_vgeo,
+                                    mesh->o_Dmatrices,
+                                    cds->vOffset,
+                                    cds->sOffset,           
+                                    cds->o_Ue,
+                                    o_Sd,
+                                    cds->o_rhsSd);
+
+        }
+
+        ogsGatherScatter(cds->o_rhsSd, ogsDfloat, ogsAdd, mesh->ogs);
+
+        // int nfield = ins->dim==2 ? 2:3; 
+        cds->invMassMatrixKernel(mesh->Nelements,
+                                 cds->sOffset,
+                                 cds->NSfields,
+                                 mesh->o_vgeo,
+                                 cds->o_InvM, // mesh->o_MM, // should be invMM for tri/tet
+                                 cds->o_rhsSd);
+
+        // Update Kernel
+        cds->subCycleRKUpdateKernel(mesh->Nelements,
+                                    cds->sdt,
+                                    cds->Srka[rk],
+                                    cds->Srkb[rk],
+                                    cds->sOffset,
+                                    cds->o_rhsSd,
+                                    cds->o_resS, 
+                                         o_Sd);
+      }
+    }
+  }
+}
diff --git a/src/tombo.cpp b/src/tombo.cpp
index d733bd185..d018aff7f 100644
--- a/src/tombo.cpp
+++ b/src/tombo.cpp
@@ -1,96 +1,14 @@
 #include "nekrs.hpp"
 
-// Compute divergence of the velocity field using physical boundary data at t = time. 
-void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU){
-
-  mesh_t *mesh = ins->mesh;
-
-  //  if (ins->vOptions.compareArgs("DISCRETIZATION","IPDG")) {
-  if(mesh->totalHaloPairs>0){
-
-    // make sure compute device is ready to perform halo extract
-    mesh->device.finish();
-    
-    // switch to data stream
-    mesh->device.setStream(mesh->dataStream);
-    
-    ins->haloGetKernel(mesh->totalHaloPairs,
-		       ins->NVfields,
-		       ins->fieldOffset,
-		       mesh->o_haloElementList,
-		       mesh->o_haloGetNodeIds,
-		       o_U,
-		       ins->o_vHaloBuffer);
-    
-    dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs;
-    
-    // copy extracted halo to HOST 
-    ins->o_vHaloBuffer.copyTo(ins->vSendBuffer, Ndata*sizeof(dfloat), 0, "async: true");// zero offset             
-
-    mesh->device.setStream(mesh->defaultStream);
-  }
-
-  // computes div u^(n+1) volume term
-  ins->divergenceVolumeKernel(mesh->Nelements,
-                             mesh->o_vgeo,
-                             mesh->o_Dmatrices,
-                             ins->fieldOffset,
-                             o_U,
-                             o_DU);
-  
-  if(mesh->totalHaloPairs>0){
-
-    // make sure compute device is ready to perform halo extract
-    mesh->device.setStream(mesh->dataStream);
-    mesh->device.finish();
-    
-    // start halo exchange
-    meshHaloExchangeStart(mesh,
-			  mesh->Nfp*(ins->NVfields)*sizeof(dfloat),
-			  ins->vSendBuffer,
-			  ins->vRecvBuffer);
-        
-    meshHaloExchangeFinish(mesh);
-    
-    dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs;
-    
-    ins->o_vHaloBuffer.copyFrom(ins->vRecvBuffer, Ndata*sizeof(dfloat), 0);  // zero offset
-    
-    ins->haloPutKernel(mesh->totalHaloPairs,
-		       ins->NVfields,
-		       ins->fieldOffset,
-		       mesh->o_haloElementList,
-		       mesh->o_haloPutNodeIds,
-		       ins->o_vHaloBuffer,
-		       o_U);
+void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP);
+void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU);
 
-    mesh->device.setStream(mesh->defaultStream);
-  }
-  
-  //computes div u^(n+1) surface term
-  const dfloat lambda = -ins->g0*ins->idt; 
-  ins->divergenceSurfaceKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_sgeo,
-    mesh->o_LIFTT,
-    mesh->o_vmapM,
-    mesh->o_vmapP,
-    ins->o_EToB,
-    time,
-    lambda, 
-    mesh->o_x,
-    mesh->o_y,
-    mesh->o_z,
-    ins->fieldOffset,
-    ins->o_Wrk,
-    o_U,
-    o_DU);
-}
+namespace tombo {
 
-void pressureRhs(ins_t *ins, dfloat time)
+void pressureRhs(ins_t *ins, dfloat time, occa::memory o_rhsP)
 {
   mesh_t *mesh = ins->mesh;
+  occa::memory &o_wrk = ins->o_UH;
 
   ins->pressureRhsKernel(mesh->Nelements,
                          mesh->o_vgeo,
@@ -105,11 +23,11 @@ void pressureRhs(ins_t *ins, dfloat time)
                          ins->o_NU,
                          ins->o_NC,
                          ins->o_FU,
-                         ins->o_rkU);
+                         o_wrk);
 
   // weak divergence term (not that no boundary contribution) 
   // -> (F, grad(phi))_sigma - g_0/dt*(n*U^n+1)_del_sigma
-  insDivergence(ins, time, ins->o_rkU, ins->o_rhsP);
+  insDivergence(ins, time, o_wrk, o_rhsP);
 
   // Add  -(grad P, grad phi) to rhsP
   const dfloat lambda = 0.0;
@@ -120,20 +38,20 @@ void pressureRhs(ins_t *ins, dfloat time)
                         mesh->o_MM,
                         lambda,
                         ins->o_P,
-                        ins->o_rhsP);
+                        o_rhsP);
 }
 
 
-void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rkP)
+void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rhsP, occa::memory o_rkP)
 {
   mesh_t *mesh = ins->mesh;
   elliptic_t *solver = ins->pSolver;
 
-  ogsGatherScatter(ins->o_rhsP, ogsDfloat, ogsAdd, mesh->ogs);
-  if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_rhsP);
+  ogsGatherScatter(o_rhsP, ogsDfloat, ogsAdd, mesh->ogs);
+  if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, o_rhsP);
   if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_PI);
 
-  ins->NiterP = ellipticSolve(solver, 0.0, ins->presTOL, ins->o_rhsP, ins->o_PI);
+  ins->NiterP = ellipticSolve(solver, 0.0, ins->presTOL, o_rhsP, ins->o_PI);
 
   ins->pressureAddBCKernel(mesh->Nelements,
                            time,
@@ -158,82 +76,13 @@ void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rkP)
                             o_rkP);
 }
 
-void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP)
-{
-  mesh_t *mesh = ins->mesh;
-
-  if(mesh->totalHaloPairs>0){
-
-    // make sure compute device is ready to perform halo extract
-    mesh->device.finish();
-    
-    // switch to data stream
-    mesh->device.setStream(mesh->dataStream);
-
-    int one = 1;
-    dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs;
-
-    ins->haloGetKernel(mesh->totalHaloPairs,
-		       one,
-		       ins->fieldOffset,
-		       mesh->o_haloElementList,
-		       mesh->o_haloGetNodeIds,
-		       o_P,
-		       ins->o_pHaloBuffer);   
-    
-    // copy extracted halo to HOST 
-    ins->o_pHaloBuffer.copyTo(ins->pSendBuffer,
-			      Ndata*sizeof(dfloat), 0, "async: true");
-    
-    mesh->device.setStream(mesh->defaultStream);
-  }
-  
-  // Compute Volume Contribution
-  ins->gradientVolumeKernel(mesh->Nelements,
-			    mesh->o_vgeo,
-			    mesh->o_Dmatrices,
-                            ins->fieldOffset,
-                            o_P,
-                            o_GP);
-
-  // COMPLETE HALO EXCHANGE
-  if(mesh->totalHaloPairs>0){
 
-    // make sure compute device is ready to perform halo extract
-    mesh->device.setStream(mesh->dataStream);
-    mesh->device.finish();
-
-    // start halo exchange
-    meshHaloExchangeStart(mesh,
-			  mesh->Nfp*sizeof(dfloat),
-			  ins->pSendBuffer,
-			  ins->pRecvBuffer);
-    
-    meshHaloExchangeFinish(mesh);
-    
-    int one = 1;
-    dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs;
-    
-    ins->o_pHaloBuffer.copyFrom(ins->pRecvBuffer, Ndata*sizeof(dfloat), 0);  // zero offset
-    
-    ins->haloPutKernel(mesh->totalHaloPairs,
-		       one,
-		       ins->fieldOffset,
-		       mesh->o_haloElementList,
-		       mesh->o_haloPutNodeIds,
-		       ins->o_pHaloBuffer,
-		       o_P);
-
-    mesh->device.finish();
-    mesh->device.setStream(mesh->defaultStream);
-    mesh->device.finish();
-  }
-}
-
-void velocityRhs(ins_t *ins, dfloat time) 
+void velocityRhs(ins_t *ins, dfloat time, occa::memory o_rhsU) 
 {
   mesh_t *mesh = ins->mesh;
-  insGradient (ins, time, ins->o_P, ins->o_rkGP);
+  occa::memory &o_wrk = ins->o_UH;
+
+  insGradient (ins, time, ins->o_P, o_wrk);
 
   ins->velocityRhsKernel(mesh->Nelements,
                          mesh->o_vgeo,
@@ -247,19 +96,27 @@ void velocityRhs(ins_t *ins, dfloat time)
                          ins->o_U,
                          ins->o_NU,
                          ins->o_FU,
-                         ins->o_rkGP,
-                         ins->o_rhsU,
-                         ins->o_rhsV,
-                         ins->o_rhsW);
+                         o_wrk,
+                         o_rhsU);
 }
 
-void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat) 
+void velocitySolve(ins_t *ins, dfloat time, occa::memory o_rhsU, occa::memory o_UH) 
 {
   mesh_t *mesh = ins->mesh;
+  dlong Ntotal = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np;
+
   elliptic_t *usolver = ins->uSolver;
   elliptic_t *vsolver = ins->vSolver;
   elliptic_t *wsolver = ins->wSolver;
 
+  occa::memory o_rhsu = o_rhsU.slice(0*ins->fieldOffset*sizeof(dfloat));
+  occa::memory o_rhsv = o_rhsU.slice(1*ins->fieldOffset*sizeof(dfloat));
+  occa::memory o_rhsw = o_rhsU.slice(2*ins->fieldOffset*sizeof(dfloat));
+
+  occa::memory o_uh = o_UH.slice(0*ins->fieldOffset*sizeof(dfloat));
+  occa::memory o_vh = o_UH.slice(1*ins->fieldOffset*sizeof(dfloat));
+  occa::memory o_wh = o_UH.slice(2*ins->fieldOffset*sizeof(dfloat));
+
   ins->velocityRhsBCKernel(mesh->Nelements,
                            ins->fieldOffset,
                            mesh->o_ggeo,
@@ -278,41 +135,29 @@ void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat)
                            ins->o_VmapB,
                            ins->o_Wrk,
                            ins->o_U,
-                           ins->o_rhsU,
-                           ins->o_rhsV,
-                           ins->o_rhsW);
+                           o_rhsU);
 
-  // TODO: change me to gs_many
-  ogsGatherScatter(ins->o_rhsU, ogsDfloat, ogsAdd, mesh->ogs);
-  ogsGatherScatter(ins->o_rhsV, ogsDfloat, ogsAdd, mesh->ogs);
-  if (ins->dim==3)
-    ogsGatherScatter(ins->o_rhsW, ogsDfloat, ogsAdd, mesh->ogs);
-
-  dlong Ntotal = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np;
+  ogsGatherScatterMany(o_rhsU, ins->NVfields, ins->fieldOffset,
+                       ogsDfloat, ogsAdd, mesh->ogs);
 
   // Use old velocity for velocity solver initial condition
-  // TODO: fure into one kernel
-  ins->o_UH.copyFrom(ins->o_U,Ntotal*sizeof(dfloat),0,0*ins->fieldOffset*sizeof(dfloat));
-  ins->o_VH.copyFrom(ins->o_U,Ntotal*sizeof(dfloat),0,1*ins->fieldOffset*sizeof(dfloat));
-  if (ins->dim==3)
-    ins->o_WH.copyFrom(ins->o_U,Ntotal*sizeof(dfloat),0,2*ins->fieldOffset*sizeof(dfloat));
-
-  // TODO: fuse into single kernel 
-  if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, ins->o_UH);
-  if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, ins->o_VH);
+  o_UH.copyFrom(ins->o_U,ins->NVfields*Ntotal*sizeof(dfloat));
+
+  // TODO: fuse with rhs into single kernel 
+  if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, o_uh);
+  if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, o_vh);
   if (ins->dim==3 && wsolver->Nmasked)
-    mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, ins->o_WH);
+    mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, o_wh);
 
-  // TODO: fuse into single kernel
-  if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, ins->o_rhsU);
-  if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, ins->o_rhsV);
+  if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, o_rhsu);
+  if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, o_rhsv);
   if (ins->dim==3 && wsolver->Nmasked)
-    mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, ins->o_rhsW);
+    mesh->maskKernel(wsolver->Nmasked, wsolver->o_maskIds, o_rhsw);
 
-  ins->NiterU = ellipticSolve(usolver, ins->lambda, ins->velTOL, ins->o_rhsU, ins->o_UH);
-  ins->NiterV = ellipticSolve(vsolver, ins->lambda, ins->velTOL, ins->o_rhsV, ins->o_VH);
+  ins->NiterU = ellipticSolve(usolver, ins->lambda, ins->velTOL, o_rhsu, o_uh);
+  ins->NiterV = ellipticSolve(vsolver, ins->lambda, ins->velTOL, o_rhsv, o_vh);
   if (ins->dim==3) 
-    ins->NiterW = ellipticSolve(wsolver, ins->lambda, ins->velTOL, ins->o_rhsW, ins->o_WH);
+    ins->NiterW = ellipticSolve(wsolver, ins->lambda, ins->velTOL, o_rhsw, o_wh);
   
   ins->velocityAddBCKernel(mesh->Nelements,
                            ins->fieldOffset,
@@ -325,239 +170,215 @@ void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat)
                            ins->o_VmapB,
                            ins->o_Wrk,
                            ins->o_U,
-                           ins->o_UH,
-                           ins->o_VH,
-                           ins->o_WH);
-
-  //copy into intermediate stage storage
-  // TODO: fuse into one copyTo
-  ins->o_UH.copyTo(o_Uhat,Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0);
-  ins->o_VH.copyTo(o_Uhat,Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0);
-  if (ins->dim==3)
-    ins->o_WH.copyTo(o_Uhat,Ntotal*sizeof(dfloat),2*ins->fieldOffset*sizeof(dfloat),0);
-}
-
-void subCycle(ins_t *ins, dfloat time, int Nstages, occa::memory o_U, occa::memory o_Ud)
-{
-  mesh_t *mesh = ins->mesh;
-
-  const dlong NtotalElements = (mesh->Nelements+mesh->totalHaloPairs);  
-
-  const dfloat tn0 = time - 0*ins->dt;
-  const dfloat tn1 = time - 1*ins->dt;
-  const dfloat tn2 = time - 2*ins->dt;
-
-  dfloat zero = 0.0, one = 1.0;
-  int izero = 0;
-
-  dfloat b, bScale=0;
-
-  // Solve for Each SubProblem
-  for (int torder=ins->ExplicitOrder-1; torder>=0; torder--){
-    
-    b=ins->extbdfB[torder];
-    bScale += b;
-
-    // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
-    dlong toffset = torder*ins->NVfields*ins->Ntotal;
-
-    if (torder==ins->ExplicitOrder-1) { //first substep
-      ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U, zero, izero, o_Ud);
-    } else { //add the next field
-      ins->scaledAddKernel(ins->NVfields*ins->Ntotal, b, toffset, o_U,  one, izero, o_Ud);
-    }     
-
-    // SubProblem  starts from here from t^(n-torder)
-    const dfloat tsub = time - torder*ins->dt;
-    // Advance SubProblem to t^(n-torder+1) 
-    for(int ststep = 0; ststep<ins->Nsubsteps;++ststep){
-      const dfloat tstage = tsub + ststep*ins->sdt;     
-
-      for(int rk=0;rk<ins->SNrk;++rk){// LSERK4 stages
-        // Extrapolate velocity to subProblem stage time
-        dfloat t = tstage +  ins->sdt*ins->Srkc[rk]; 
-
-        switch(ins->ExplicitOrder){
-	case 1:
-	  ins->extC[0] = 1.f; ins->extC[1] = 0.f; ins->extC[2] = 0.f;
-	  break;
-	case 2:
-	  ins->extC[0] = (t-tn1)/(tn0-tn1);
-	  ins->extC[1] = (t-tn0)/(tn1-tn0);
-	  ins->extC[2] = 0.f; 
-	  break;
-	case 3:
-	  ins->extC[0] = (t-tn1)*(t-tn2)/((tn0-tn1)*(tn0-tn2)); 
-	  ins->extC[1] = (t-tn0)*(t-tn2)/((tn1-tn0)*(tn1-tn2));
-	  ins->extC[2] = (t-tn0)*(t-tn1)/((tn2-tn0)*(tn2-tn1));
-	  break;
-        }
-        ins->o_extC.copyFrom(ins->extC);
-
-        //compute advective velocity fields at time t
-        ins->subCycleExtKernel(NtotalElements,
-                               Nstages,
-                               ins->fieldOffset,
-                               ins->o_extC,
-                               o_U,
-                               ins->o_Ue);
-
-        // Compute Volume Contribution
-	if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE")){        
-	  ins->subCycleStrongCubatureVolumeKernel(
-            mesh->Nelements,
-            mesh->o_vgeo,
-            mesh->o_cubvgeo,
-            mesh->o_cubDiffInterpT, //mesh->o_cubDWmatrices,
-            mesh->o_cubInterpT,
-            mesh->o_cubProjectT,
-            ins->o_invLumpedMassMatrix,
-            ins->fieldOffset,
-            ins->o_Ue,
-            o_Ud,
-            ins->o_cU,     
-            ins->o_cUd,     
-            ins->o_rhsUd);
-	}else{
-	  ins->subCycleStrongVolumeKernel(mesh->Nelements,
-	    mesh->o_vgeo,
-	    mesh->o_Dmatrices,
-	    ins->fieldOffset,
-	    ins->o_Ue,
-	    o_Ud,
-	    ins->o_rhsUd);
-	}
-	
-        // TODO: change me to gs_many
-	for(int k=0;k<ins->dim;++k){
-	  ogsGatherScatter(ins->o_rhsUd+k*ins->fieldOffset*sizeof(dfloat), 
-                           ogsDfloat, ogsAdd, mesh->ogs);
-	} 
-
-        int nfield = ins->dim==2 ? 2:3; 
-        ins->invMassMatrixKernel(mesh->Nelements,
-				 ins->fieldOffset,
-				 nfield,
-				 mesh->o_vgeo,
-				 ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet
-				 ins->o_rhsUd);
-
-        ins->subCycleRKUpdateKernel(mesh->Nelements,
-				    ins->sdt,
-				    ins->Srka[rk],
-				    ins->Srkb[rk],
-				    ins->fieldOffset,
-				    ins->o_rhsUd,
-				    ins->o_resU, 
-				    o_Ud);
-      }
-    }
-  }
+                           o_UH);
 }
 
 void curlCurl(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_NC)
 {
   mesh_t *mesh = ins->mesh;
 
-  // TODO: extrapolte velocity
+  occa::memory &o_wrk = ins->o_UH;
 
-  // No that UH, VH and WH is used as the temporary arrays to store curl(u)
-  // Note that multiplied with Mass Matrix i.e. JW to prepare smoothing !!!!!
   ins->curlKernel(mesh->Nelements,
                   mesh->o_vgeo,
                   mesh->o_Dmatrices,
                   mesh->o_MM, 
                   ins->fieldOffset,
                   o_U,
-                  ins->o_UH, // Wx
-                  ins->o_VH, // Wy
-                  ins->o_WH);// Wz
-
-  // TODO: change me to gs_many
-  if(ins->dim==3){    
-   ogsGatherScatter(ins->o_UH, ogsDfloat, ogsAdd, mesh->ogs);
-   ogsGatherScatter(ins->o_VH, ogsDfloat, ogsAdd, mesh->ogs);
-  } 
-   ogsGatherScatter(ins->o_WH, ogsDfloat, ogsAdd, mesh->ogs);
-
-  int nfield = 0; 
-  if (ins->dim==3){
-    nfield = 3; 
-    // TODO: fuse into single copyTo
-    ins->o_UH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0);
-    ins->o_VH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0);    
-    ins->o_WH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),2*ins->fieldOffset*sizeof(dfloat),0);    
-  }else{
-    nfield =1; 
-    // if quad or tri copy Wz to first place
-    ins->o_WH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0); 
-  }
-  
- 
-  // Multiply With Inverse Mass Matrix
+                  o_wrk);
+
+  ogsGatherScatterMany(o_wrk, ins->NVfields, ins->fieldOffset,
+                       ogsDfloat, ogsAdd, mesh->ogs);
+
   ins->invMassMatrixKernel(
     mesh->Nelements,
     ins->fieldOffset,
-    nfield,
+    ins->NVfields,
     mesh->o_vgeo,
     ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet
-    ins->o_rkU);
-
+    o_wrk);
 
-  if(ins->dim==2){
-  // Second curl on smoothed curl(u)
-  ins->curlBKernel(mesh->Nelements,
-                  mesh->o_vgeo,
-                  mesh->o_Dmatrices,
-                  mesh->o_MM, 
-                  ins->fieldOffset,
-                  ins->o_rkU,
-                  ins->o_UH, // Wx
-                  ins->o_VH, // Wy
-                  ins->o_WH);// Wz
-  }else{
-    ins->curlKernel(mesh->Nelements,
+  ins->curlKernel(mesh->Nelements,
                   mesh->o_vgeo,
                   mesh->o_Dmatrices,
                   mesh->o_MM, 
                   ins->fieldOffset,
-                  ins->o_rkU,
-                  ins->o_UH, // Wx
-                  ins->o_VH, // Wy
-                  ins->o_WH);// Wz
-  }
- 
-  // TODO: change me to gs_many
-  ogsGatherScatter(ins->o_UH, ogsDfloat, ogsAdd, mesh->ogs);
-  ogsGatherScatter(ins->o_VH, ogsDfloat, ogsAdd, mesh->ogs);
-  if(ins->dim==3)       
-    ogsGatherScatter(ins->o_WH, ogsDfloat, ogsAdd, mesh->ogs);
-
-  nfield = 0; 
-  if (ins->dim==3){
-    nfield = 3;
-    // TODO: fuse into single copyTo 
-    ins->o_UH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0);
-    ins->o_VH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0);    
-    ins->o_WH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),2*ins->fieldOffset*sizeof(dfloat),0);    
-  }else{
-    nfield =2; 
-    // TODO: fuse into single copyTo 
-    ins->o_UH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),0*ins->fieldOffset*sizeof(dfloat),0);
-    ins->o_VH.copyTo(ins->o_rkU,ins->Ntotal*sizeof(dfloat),1*ins->fieldOffset*sizeof(dfloat),0); 
-  }
+                  o_wrk,
+                  o_NC);
+
+  ogsGatherScatterMany(o_NC, ins->NVfields, ins->fieldOffset,
+                       ogsDfloat, ogsAdd, mesh->ogs);
 
   ins->invMassMatrixKernel(
     mesh->Nelements,
     ins->fieldOffset,
-    nfield,
+    ins->NVfields,
     mesh->o_vgeo,
     ins->o_InvM, // mesh->o_MM, // should be invMM for tri/tet
-    ins->o_rkU);
-   
-  ins->o_rkU.copyTo(o_NC,ins->NVfields*ins->Ntotal*sizeof(dfloat),
-                    0*ins->fieldOffset*sizeof(dfloat),0);
+    o_NC);
 }
 
 
 
+} // namespace
+
+void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP)
+{
+  mesh_t *mesh = ins->mesh;
+
+  if(mesh->totalHaloPairs>0){
+
+    // make sure compute device is ready to perform halo extract
+    mesh->device.finish();
+    
+    // switch to data stream
+    mesh->device.setStream(mesh->dataStream);
+
+    int one = 1;
+    dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs;
+
+    ins->haloGetKernel(mesh->totalHaloPairs,
+		       one,
+		       ins->fieldOffset,
+		       mesh->o_haloElementList,
+		       mesh->o_haloGetNodeIds,
+		       o_P,
+		       ins->o_pHaloBuffer);   
+    
+    // copy extracted halo to HOST 
+    ins->o_pHaloBuffer.copyTo(ins->pSendBuffer,
+			      Ndata*sizeof(dfloat), 0, "async: true");
+    
+    mesh->device.setStream(mesh->defaultStream);
+  }
+  
+  // Compute Volume Contribution
+  ins->gradientVolumeKernel(mesh->Nelements,
+			    mesh->o_vgeo,
+			    mesh->o_Dmatrices,
+                            ins->fieldOffset,
+                            o_P,
+                            o_GP);
+
+  // COMPLETE HALO EXCHANGE
+  if(mesh->totalHaloPairs>0){
+
+    // make sure compute device is ready to perform halo extract
+    mesh->device.setStream(mesh->dataStream);
+    mesh->device.finish();
+
+    // start halo exchange
+    meshHaloExchangeStart(mesh,
+			  mesh->Nfp*sizeof(dfloat),
+			  ins->pSendBuffer,
+			  ins->pRecvBuffer);
+    
+    meshHaloExchangeFinish(mesh);
+    
+    int one = 1;
+    dlong Ndata = one*mesh->Nfp*mesh->totalHaloPairs;
+    
+    ins->o_pHaloBuffer.copyFrom(ins->pRecvBuffer, Ndata*sizeof(dfloat), 0);  // zero offset
+    
+    ins->haloPutKernel(mesh->totalHaloPairs,
+		       one,
+		       ins->fieldOffset,
+		       mesh->o_haloElementList,
+		       mesh->o_haloPutNodeIds,
+		       ins->o_pHaloBuffer,
+		       o_P);
+
+    mesh->device.finish();
+    mesh->device.setStream(mesh->defaultStream);
+    mesh->device.finish();
+  }
+}
+
+// Compute divergence of the velocity field using physical boundary data at t = time. 
+void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU){
+
+  mesh_t *mesh = ins->mesh;
+
+  //  if (ins->vOptions.compareArgs("DISCRETIZATION","IPDG")) {
+  if(mesh->totalHaloPairs>0){
+
+    // make sure compute device is ready to perform halo extract
+    mesh->device.finish();
+    
+    // switch to data stream
+    mesh->device.setStream(mesh->dataStream);
+    
+    ins->haloGetKernel(mesh->totalHaloPairs,
+		       ins->NVfields,
+		       ins->fieldOffset,
+		       mesh->o_haloElementList,
+		       mesh->o_haloGetNodeIds,
+		       o_U,
+		       ins->o_vHaloBuffer);
+    
+    dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs;
+    
+    // copy extracted halo to HOST 
+    ins->o_vHaloBuffer.copyTo(ins->vSendBuffer, Ndata*sizeof(dfloat), 0, "async: true");// zero offset             
+
+    mesh->device.setStream(mesh->defaultStream);
+  }
+
+  // computes div u^(n+1) volume term
+  ins->divergenceVolumeKernel(mesh->Nelements,
+                             mesh->o_vgeo,
+                             mesh->o_Dmatrices,
+                             ins->fieldOffset,
+                             o_U,
+                             o_DU);
+  
+  if(mesh->totalHaloPairs>0){
+
+    // make sure compute device is ready to perform halo extract
+    mesh->device.setStream(mesh->dataStream);
+    mesh->device.finish();
+    
+    // start halo exchange
+    meshHaloExchangeStart(mesh,
+			  mesh->Nfp*(ins->NVfields)*sizeof(dfloat),
+			  ins->vSendBuffer,
+			  ins->vRecvBuffer);
+        
+    meshHaloExchangeFinish(mesh);
+    
+    dlong Ndata = ins->NVfields*mesh->Nfp*mesh->totalHaloPairs;
+    
+    ins->o_vHaloBuffer.copyFrom(ins->vRecvBuffer, Ndata*sizeof(dfloat), 0);  // zero offset
+    
+    ins->haloPutKernel(mesh->totalHaloPairs,
+		       ins->NVfields,
+		       ins->fieldOffset,
+		       mesh->o_haloElementList,
+		       mesh->o_haloPutNodeIds,
+		       ins->o_vHaloBuffer,
+		       o_U);
+
+    mesh->device.setStream(mesh->defaultStream);
+  }
+  
+  //computes div u^(n+1) surface term
+  const dfloat lambda = -ins->g0*ins->idt; 
+  ins->divergenceSurfaceKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_sgeo,
+    mesh->o_LIFTT,
+    mesh->o_vmapM,
+    mesh->o_vmapP,
+    ins->o_EToB,
+    time,
+    lambda, 
+    mesh->o_x,
+    mesh->o_y,
+    mesh->o_z,
+    ins->fieldOffset,
+    ins->o_Wrk,
+    o_U,
+    o_DU);
+}
+
diff --git a/src/tombo.hpp b/src/tombo.hpp
index 2ea3aa857..3c468745c 100644
--- a/src/tombo.hpp
+++ b/src/tombo.hpp
@@ -2,14 +2,15 @@
 #define nekrs_tombo_hpp_
 
 #include "nekrs.hpp"
-void pressureRhs(ins_t *ins, dfloat time);
-void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rkP);
-void insGradient(ins_t *ins, dfloat time, occa::memory o_P, occa::memory o_GP);
-void velocityRhs(ins_t *ins, dfloat time);
-void velocitySolve(ins_t *ins, dfloat time, occa::memory o_Uhat);
-void subCycle(ins_t *ins, dfloat time, int Nstages, occa::memory o_U, occa::memory o_Ud);
-void advection(ins_t *ins, dfloat time);
+
+namespace tombo {
+
+void pressureRhs(ins_t *ins, dfloat time, occa::memory o_rhsP);
+void pressureSolve(ins_t *ins, dfloat time, occa::memory o_rhsP, occa::memory o_rkP);
+void velocityRhs(ins_t *ins, dfloat time, occa::memory o_rhsU);
+void velocitySolve(ins_t *ins, dfloat time, occa::memory o_rhsU, occa::memory o_Uhat);
 void curlCurl(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_NC);
-void insDivergence(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_DU);
+
+}
 
 #endif