diff --git a/.gitignore b/.gitignore
index fb07c7f53..9c8dff4f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
 build/
-*.so
-*.o
+CMakeFiles
 *.tgz
diff --git a/3rd_party/gslib/ogs/include/ogstypes.h b/3rd_party/gslib/ogs/include/ogstypes.h
new file mode 100644
index 000000000..9e804bbb1
--- /dev/null
+++ b/3rd_party/gslib/ogs/include/ogstypes.h
@@ -0,0 +1,45 @@
+#if !defined(ogstypes_h)
+#define ogstypes_h 
+
+// Floating-point type: "#if 0" disables the single-precision set, so dfloat is double.
+#if 0
+#define DFLOAT_SINGLE
+#define dfloat float
+#define MPI_DFLOAT MPI_FLOAT
+#define dfloatFormat "%f"
+#define dfloatString "float"
+#else
+#define DFLOAT_DOUBLE
+#define dfloat double
+#define MPI_DFLOAT MPI_DOUBLE
+#define dfloatFormat "%lf"
+#define dfloatString "double"
+#endif
+
+// Host index type: "#if 0" disables the int set, so hlong is long long int.
+#if 0
+#define hlong int
+#define MPI_HLONG MPI_INT
+#define hlongFormat "%d"
+#define hlongString "int"
+#else
+#define hlong long long int
+#define MPI_HLONG MPI_LONG_LONG_INT
+#define hlongFormat "%lld"
+#define hlongString "long long int"
+#endif
+
+// Device index type: "#if 1" selects the int set, so dlong is int.
+#if 1
+#define dlong int
+#define MPI_DLONG MPI_INT
+#define dlongFormat "%d"
+#define dlongString "int"
+#else
+#define dlong long long int
+#define MPI_DLONG MPI_LONG_LONG_INT
+#define dlongFormat "%lld"
+#define dlongString "long long int"
+#endif
+
+#endif // ogstypes_h
diff --git a/3rd_party/gslib/ogs/ogs.hpp b/3rd_party/gslib/ogs/ogs.hpp
index 4e2f68d72..e79606be9 100644
--- a/3rd_party/gslib/ogs/ogs.hpp
+++ b/3rd_party/gslib/ogs/ogs.hpp
@@ -29,8 +29,8 @@ SOFTWARE.
 
   The code
   
-  	dlong N;
-    hlong id[N];    // the hlong and dlong types are defined in "types.h"
+  	int N;
+    long long int id[N];    // ids use the host index type (long long int); N uses the device index type (int)
     int   haloFlag[N];    
     ...
     struct ogs_t *ogs = ogsSetup(N, id, &comm, verbose);
@@ -118,7 +118,6 @@ SOFTWARE.
 #include <occa.hpp>
 
 #include "mpi.h"
-#include "types.h"
 
 #define ogsFloat  "float"
 #define ogsDouble "double"
@@ -127,7 +126,7 @@ SOFTWARE.
 #define ogsFloatCommHalf "floatCommHalf"
 #define ogsInt  "int"
 #define ogsLong "long long int"
-#define ogsDlong dlongString
+#define ogsDlong dlongString  // type-name string for the device index type; dlongString is defined in ogstypes.h
 #define ogsHlong hlongString
 
 #define ogsAdd "add"
@@ -141,21 +140,21 @@ typedef struct {
   MPI_Comm comm;
   occa::device device;
 
-  dlong         N;
-  dlong         Ngather;        //  total number of gather nodes
-  dlong         Nlocal;         //  number of local nodes
-  dlong         NlocalGather;   //  number of local gathered nodes 
-  dlong         Nhalo;          //  number of halo nodes
-  dlong         NhaloGather;    //  number of gathered nodes on halo
-  dlong         NownedHalo;     //  number of owned halo nodes
+  int         N;
+  int         Ngather;        //  total number of gather nodes
+  int         Nlocal;         //  number of local nodes
+  int         NlocalGather;   //  number of local gathered nodes 
+  int         Nhalo;          //  number of halo nodes
+  int         NhaloGather;    //  number of gathered nodes on halo
+  int         NownedHalo;     //  number of owned halo nodes
 
-  dlong         *localGatherOffsets;
-  dlong         *localGatherIds;
+  int         *localGatherOffsets;
+  int         *localGatherIds;
   occa::memory o_localGatherOffsets;  
   occa::memory o_localGatherIds;      
 
-  dlong         *haloGatherOffsets;
-  dlong         *haloGatherIds;
+  int         *haloGatherOffsets;
+  int         *haloGatherIds;
   occa::memory o_haloGatherOffsets;
   occa::memory o_haloGatherIds;    
 
@@ -171,7 +170,7 @@ typedef struct {
 }ogs_t;
 
 
-ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, 
+ogs_t *ogsSetup(int N, long long int *ids, MPI_Comm &comm, 
                 int verbose, occa::device device);
 
 void ogsFree(ogs_t* ogs);
@@ -179,51 +178,51 @@ void ogsFree(ogs_t* ogs);
 // Host array versions
 void ogsGatherScatter    (void  *v, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
 void ogsGatherScatterVec (void  *v, const int k, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
-void ogsGatherScatterMany(void  *v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
+void ogsGatherScatterMany(void  *v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
 
 void ogsGather    (void  *gv, void  *v, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherVec (void  *gv, void  *v, const int k, const char *type, const char *op, ogs_t *ogs);
-void ogsGatherMany(void  *gv, void  *v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs);
+void ogsGatherMany(void  *gv, void  *v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs);
 
 void ogsScatter    (void  *sv, void  *v, const char *type, const char *op, ogs_t *ogs);
 void ogsScatterVec (void  *sv, void  *v, const int k, const char *type, const char *op, ogs_t *ogs);
-void ogsScatterMany(void  *sv, void  *v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs);
+void ogsScatterMany(void  *sv, void  *v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs);
 
 
 // Synchronous device buffer versions
 void ogsGatherScatter    (occa::memory  o_v, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
 void ogsGatherScatterVec (occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
-void ogsGatherScatterMany(occa::memory  o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
+void ogsGatherScatterMany(occa::memory  o_v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call
 
 void ogsGather    (occa::memory  o_gv, occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherVec (occa::memory  o_gv, occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
-void ogsGatherMany(occa::memory  o_gv, occa::memory  o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs);
+void ogsGatherMany(occa::memory  o_gv, occa::memory  o_v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs);
 
 void ogsScatter    (occa::memory  o_sv, occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsScatterVec (occa::memory  o_sv, occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
-void ogsScatterMany(occa::memory  o_sv, occa::memory  o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs);
+void ogsScatterMany(occa::memory  o_sv, occa::memory  o_v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs);
 
 // Asynchronous device buffer versions
 void ogsGatherScatterStart     (occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherScatterFinish    (occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherScatterVecStart  (occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherScatterVecFinish (occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
-void ogsGatherScatterManyStart (occa::memory  o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs);
-void ogsGatherScatterManyFinish(occa::memory  o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs);
+void ogsGatherScatterManyStart (occa::memory  o_v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs);
+void ogsGatherScatterManyFinish(occa::memory  o_v, const int k, const int stride, const char *type, const char *op, ogs_t *ogs);
 
 void ogsGatherStart     (occa::memory  o_Gv, occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherFinish    (occa::memory  o_Gv, occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherVecStart  (occa::memory  o_Gv, occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
 void ogsGatherVecFinish (occa::memory  o_Gv, occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
-void ogsGatherManyStart (occa::memory  o_Gv, occa::memory  o_v, const int k, const dlong gstride, const dlong stride, const char *type, const char *op, ogs_t *ogs);
-void ogsGatherManyFinish(occa::memory  o_Gv, occa::memory  o_v, const int k, const dlong gstride, const dlong stride, const char *type, const char *op, ogs_t *ogs);
+void ogsGatherManyStart (occa::memory  o_Gv, occa::memory  o_v, const int k, const int gstride, const int stride, const char *type, const char *op, ogs_t *ogs);
+void ogsGatherManyFinish(occa::memory  o_Gv, occa::memory  o_v, const int k, const int gstride, const int stride, const char *type, const char *op, ogs_t *ogs);
 
 void ogsScatterStart     (occa::memory  o_Sv, occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsScatterFinish    (occa::memory  o_Sv, occa::memory  o_v, const char *type, const char *op, ogs_t *ogs);
 void ogsScatterVecStart  (occa::memory  o_Sv, occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
 void ogsScatterVecFinish (occa::memory  o_Sv, occa::memory  o_v, const int k, const char *type, const char *op, ogs_t *ogs);
-void ogsScatterManyStart (occa::memory  o_Sv, occa::memory  o_v, const int k, const dlong sstride, const dlong stride, const char *type, const char *op, ogs_t *ogs);
-void ogsScatterManyFinish(occa::memory  o_Sv, occa::memory  o_v, const int k, const dlong sstride, const dlong stride, const char *type, const char *op, ogs_t *ogs);
+void ogsScatterManyStart (occa::memory  o_Sv, occa::memory  o_v, const int k, const int sstride, const int stride, const char *type, const char *op, ogs_t *ogs);
+void ogsScatterManyFinish(occa::memory  o_Sv, occa::memory  o_v, const int k, const int sstride, const int stride, const char *type, const char *op, ogs_t *ogs);
 
 void *ogsHostMallocPinned(occa::device &device, size_t size, void *source, occa::memory &mem, occa::memory &h_mem);
 
@@ -255,12 +254,12 @@ typedef struct {
 
 namespace oogs{
 
-void start(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h);
-void finish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h);
-void startFinish(void *v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h);
-void startFinish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h);
-oogs_t *setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::function<void()> callback, oogs_mode gsMode);
-oogs_t *setup(dlong N, hlong *ids, const int k, const dlong stride, const char *type, MPI_Comm &comm,
+void start(occa::memory o_v, const int k, const int stride, const char *type, const char *op, oogs_t *h);
+void finish(occa::memory o_v, const int k, const int stride, const char *type, const char *op, oogs_t *h);
+void startFinish(void *v, const int k, const int stride, const char *type, const char *op, oogs_t *h);
+void startFinish(occa::memory o_v, const int k, const int stride, const char *type, const char *op, oogs_t *h);
+oogs_t *setup(ogs_t *ogs, int nVec, int stride, const char *type, std::function<void()> callback, oogs_mode gsMode);
+oogs_t *setup(int N, long long int *ids, const int k, const int stride, const char *type, MPI_Comm &comm,
               int verbose, occa::device device, std::function<void()> callback, oogs_mode mode);
 void destroy(oogs_t *h);
 
diff --git a/3rd_party/gslib/ogs/okl/gather.okl b/3rd_party/gslib/ogs/okl/gather.okl
index e86a72ec1..324ab75fb 100644
--- a/3rd_party/gslib/ogs/okl/gather.okl
+++ b/3rd_party/gslib/ogs/okl/gather.okl
@@ -94,15 +94,15 @@ SOFTWARE.
 @kernel void gather_longAdd(const dlong Ngather,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
 
     const dlong start = gatherStarts[g];
     const dlong end = gatherStarts[g+1];
      
-    long long int gq = 0;
+    hlong gq = 0;
     for(dlong n=start;n<end;++n){
       const dlong id = gatherIds[n];
       gq += q[id];
@@ -182,15 +182,15 @@ SOFTWARE.
 @kernel void gather_longMul(const dlong Ngather,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
 
     const dlong start = gatherStarts[g];
     const dlong end = gatherStarts[g+1];
      
-    long long int gq = 1.f;
+    hlong gq = 1;  // multiplicative identity for the integer product accumulated below
     for(dlong n=start;n<end;++n){
       const dlong id = gatherIds[n];
       gq *= q[id];
@@ -276,8 +276,8 @@ SOFTWARE.
 @kernel void gather_longMin(const dlong Ngather,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
 
@@ -285,7 +285,7 @@ SOFTWARE.
     const dlong end = gatherStarts[g+1];
      
     const dlong startId = gatherIds[start];
-    long long int gq = q[startId];
+    hlong gq = q[startId];
     for(dlong n=start+1;n<end;++n){
       const dlong id = gatherIds[n];
       gq = (q[id] < gq) ? q[id] : gq;
@@ -369,8 +369,8 @@ SOFTWARE.
 @kernel void gather_longMax(const dlong Ngather,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
 
@@ -378,7 +378,7 @@ SOFTWARE.
     const dlong end = gatherStarts[g+1];
      
     const dlong startId = gatherIds[start];
-    long long int gq = q[startId];
+    hlong gq = q[startId];
     for(dlong n=start+1;n<end;++n){
       const dlong id = gatherIds[n];
       gq = (q[id] >  gq) ? q[id] : gq;
diff --git a/3rd_party/gslib/ogs/okl/gatherMany.okl b/3rd_party/gslib/ogs/okl/gatherMany.okl
index a4afd55a2..be6aaf1ee 100644
--- a/3rd_party/gslib/ogs/okl/gatherMany.okl
+++ b/3rd_party/gslib/ogs/okl/gatherMany.okl
@@ -112,8 +112,8 @@ SOFTWARE.
                  const  dlong   gstride,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -122,7 +122,7 @@ SOFTWARE.
     const dlong start = gatherStarts[gid];
     const dlong end = gatherStarts[gid+1];
      
-    long long int gq = 0;
+    hlong gq = 0;
     for(dlong n=start;n<end;++n){
       const dlong id = gatherIds[n];
       gq += q[id+k*stride];
@@ -220,8 +220,8 @@ SOFTWARE.
                  const  dlong   gstride,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -230,7 +230,7 @@ SOFTWARE.
     const dlong start = gatherStarts[gid];
     const dlong end = gatherStarts[gid+1];
      
-    long long int gq = 1.f;
+    hlong gq = 1;  // multiplicative identity for the integer product accumulated below
     for(dlong n=start;n<end;++n){
       const dlong id = gatherIds[n];
       gq *= q[id+k*stride];
@@ -334,8 +334,8 @@ SOFTWARE.
                  const  dlong   gstride,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -345,7 +345,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
      
     const dlong startId = gatherIds[start];
-    long long int gq = q[startId+k*stride];
+    hlong gq = q[startId+k*stride];
     for(dlong n=start+1;n<end;++n){
       const dlong id = gatherIds[n];
       gq = (q[id+k*stride] < gq) ? q[id+k*stride] : gq;
@@ -447,8 +447,8 @@ SOFTWARE.
                  const  dlong   gstride,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -458,7 +458,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
      
     const dlong startId = gatherIds[start];
-    long long int gq = q[startId+k*stride];
+    hlong gq = q[startId+k*stride];
     for(dlong n=start+1;n<end;++n){
       const dlong id = gatherIds[n];
       gq = (q[id+k*stride] >  gq) ? q[id+k*stride] : gq;
diff --git a/3rd_party/gslib/ogs/okl/gatherScatter.okl b/3rd_party/gslib/ogs/okl/gatherScatter.okl
index 40c4e9b87..2a861ff5a 100644
--- a/3rd_party/gslib/ogs/okl/gatherScatter.okl
+++ b/3rd_party/gslib/ogs/okl/gatherScatter.okl
@@ -106,7 +106,7 @@ SOFTWARE.
 @kernel void gatherScatter_longAdd(const dlong Ngather,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
     
@@ -114,7 +114,7 @@ SOFTWARE.
     const dlong end = gatherStarts[g+1];
     if((start+1)!=end){
     
-      long long int gq = 0;
+      hlong gq = 0;
       
       for(dlong n=start;n<end;++n){
         const dlong id = gatherIds[n];
@@ -210,7 +210,7 @@ SOFTWARE.
 @kernel void gatherScatter_longMul(const dlong Ngather,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
     
@@ -218,7 +218,7 @@ SOFTWARE.
     const dlong end = gatherStarts[g+1];
     if((start+1)!=end){
     
-      long long int gq = 1;
+      hlong gq = 1;
       
       for(dlong n=start;n<end;++n){
         const dlong id = gatherIds[n];
@@ -318,7 +318,7 @@ SOFTWARE.
 @kernel void gatherScatter_longMin(const dlong Ngather,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
     
@@ -327,7 +327,7 @@ SOFTWARE.
     if((start+1)!=end){
     
       const dlong startId = gatherIds[start];
-      long long int gq = q[startId];
+      hlong gq = q[startId];
 
       for(dlong n=start+1;n<end;++n){
         const dlong id = gatherIds[n];
@@ -426,7 +426,7 @@ SOFTWARE.
 @kernel void gatherScatter_longMax(const dlong Ngather,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather;++g;@tile(256,@outer,@inner)){
     
@@ -435,7 +435,7 @@ SOFTWARE.
     if((start+1)!=end){
     
       const dlong startId = gatherIds[start];
-      long long int gq = q[startId];
+      hlong gq = q[startId];
 
       for(dlong n=start+1;n<end;++n){
         const dlong id = gatherIds[n];
@@ -448,4 +448,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/3rd_party/gslib/ogs/okl/gatherScatterMany.okl b/3rd_party/gslib/ogs/okl/gatherScatterMany.okl
index b37ee6d3c..844ddfc37 100644
--- a/3rd_party/gslib/ogs/okl/gatherScatterMany.okl
+++ b/3rd_party/gslib/ogs/okl/gatherScatterMany.okl
@@ -120,7 +120,7 @@ SOFTWARE.
                                     const dlong stride,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -130,7 +130,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
     if((start+1)!=end){
     
-      long long int gq = 0;
+      hlong gq = 0;
       
       for(dlong n=start;n<end;++n){
         const dlong id = gatherIds[n];
@@ -240,7 +240,7 @@ SOFTWARE.
                                     const dlong stride,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -250,7 +250,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
     if((start+1)!=end){
     
-      long long int gq = 1;
+      hlong gq = 1;
       
       for(dlong n=start;n<end;++n){
         const dlong id = gatherIds[n];
@@ -364,7 +364,7 @@ SOFTWARE.
                                     const dlong stride,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -375,7 +375,7 @@ SOFTWARE.
     if((start+1)!=end){
     
       const dlong startId = gatherIds[start];
-      long long int gq = q[startId+k*stride];
+      hlong gq = q[startId+k*stride];
       
       for(dlong n=start+1;n<end;++n){
         const dlong id = gatherIds[n];
@@ -488,7 +488,7 @@ SOFTWARE.
                                     const dlong stride,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -499,7 +499,7 @@ SOFTWARE.
     if((start+1)!=end){
     
       const dlong startId = gatherIds[start];
-      long long int gq = q[startId+k*stride];
+      hlong gq = q[startId+k*stride];
       
       for(dlong n=start+1;n<end;++n){
         const dlong id = gatherIds[n];
@@ -512,4 +512,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/3rd_party/gslib/ogs/okl/gatherScatterVec.okl b/3rd_party/gslib/ogs/okl/gatherScatterVec.okl
index 37d188f5c..72b064e0e 100644
--- a/3rd_party/gslib/ogs/okl/gatherScatterVec.okl
+++ b/3rd_party/gslib/ogs/okl/gatherScatterVec.okl
@@ -116,7 +116,7 @@ SOFTWARE.
                                     const int Nentries,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -126,7 +126,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
     if((start+1)!=end){
     
-      long long int gq = 0;
+      hlong gq = 0;
       
       for(dlong n=start;n<end;++n){
         const dlong id = gatherIds[n];
@@ -232,7 +232,7 @@ SOFTWARE.
                                     const int Nentries,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -242,7 +242,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
     if((start+1)!=end){
     
-      long long int gq = 1;
+      hlong gq = 1;
       
       for(dlong n=start;n<end;++n){
         const dlong id = gatherIds[n];
@@ -352,7 +352,7 @@ SOFTWARE.
                                     const int Nentries,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -363,7 +363,7 @@ SOFTWARE.
     if((start+1)!=end){
     
       const dlong startId = gatherIds[start];
-      long long int gq = q[startId*Nentries+k];
+      hlong gq = q[startId*Nentries+k];
       
       for(dlong n=start+1;n<end;++n){
         const dlong id = gatherIds[n];
@@ -472,7 +472,7 @@ SOFTWARE.
                                     const int Nentries,
                                     @restrict const  dlong *  gatherStarts,
                                     @restrict const  dlong *  gatherIds,
-                                    @restrict long long int *  q){
+                                    @restrict hlong *  q){
   
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
     
@@ -483,7 +483,7 @@ SOFTWARE.
     if((start+1)!=end){
     
       const dlong startId = gatherIds[start];
-      long long int gq = q[startId*Nentries+k];
+      hlong gq = q[startId*Nentries+k];
       
       for(dlong n=start+1;n<end;++n){
         const dlong id = gatherIds[n];
@@ -496,4 +496,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/3rd_party/gslib/ogs/okl/gatherVec.okl b/3rd_party/gslib/ogs/okl/gatherVec.okl
index f64876286..48d5d62f1 100644
--- a/3rd_party/gslib/ogs/okl/gatherVec.okl
+++ b/3rd_party/gslib/ogs/okl/gatherVec.okl
@@ -104,8 +104,8 @@ SOFTWARE.
                  const  int      Nentries,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -114,7 +114,7 @@ SOFTWARE.
     const dlong start = gatherStarts[gid];
     const dlong end = gatherStarts[gid+1];
      
-    long long int gq = 0;
+    hlong gq = 0;
     for(dlong n=start;n<end;++n){
       const dlong id = gatherIds[n];
       gq += q[id*Nentries+k];
@@ -204,8 +204,8 @@ SOFTWARE.
                  const  int      Nentries,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -214,7 +214,7 @@ SOFTWARE.
     const dlong start = gatherStarts[gid];
     const dlong end = gatherStarts[gid+1];
      
-    long long int gq = 1.f;
+    hlong gq = 1;  // multiplicative identity for the integer product accumulated below
     for(dlong n=start;n<end;++n){
       const dlong id = gatherIds[n];
       gq *= q[id*Nentries+k];
@@ -310,8 +310,8 @@ SOFTWARE.
                  const  int      Nentries,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -321,7 +321,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
      
     const dlong startId = gatherIds[start];
-    long long int gq = q[startId*Nentries+k];
+    hlong gq = q[startId*Nentries+k];
     for(dlong n=start+1;n<end;++n){
       const dlong id = gatherIds[n];
       gq = (q[id*Nentries+k] < gq) ? q[id*Nentries+k] : gq;
@@ -415,8 +415,8 @@ SOFTWARE.
                  const  int      Nentries,
        @restrict const  dlong *  gatherStarts,
        @restrict const  dlong *  gatherIds,
-       @restrict const  long long int *  q,
-       @restrict long long int *  gatherq){
+       @restrict const  hlong *  q,
+       @restrict hlong *  gatherq){
 
   for(dlong g=0;g<Ngather*Nentries;++g;@tile(256,@outer,@inner)){
 
@@ -426,7 +426,7 @@ SOFTWARE.
     const dlong end = gatherStarts[gid+1];
      
     const dlong startId = gatherIds[start];
-    long long int gq = q[startId*Nentries+k];
+    hlong gq = q[startId*Nentries+k];
     for(dlong n=start+1;n<end;++n){
       const dlong id = gatherIds[n];
       gq = (q[id*Nentries+k] >  gq) ? q[id*Nentries+k] : gq;
@@ -435,4 +435,4 @@ SOFTWARE.
     //contiguously packed
     gatherq[g] = gq;
   }
-}
\ No newline at end of file
+}
diff --git a/3rd_party/gslib/ogs/okl/scatter.okl b/3rd_party/gslib/ogs/okl/scatter.okl
index 336a2a25e..841419875 100644
--- a/3rd_party/gslib/ogs/okl/scatter.okl
+++ b/3rd_party/gslib/ogs/okl/scatter.okl
@@ -88,12 +88,12 @@ SOFTWARE.
 @kernel void scatter_long(const dlong Nscatter,
                    @restrict const  dlong *  scatterStarts,
                    @restrict const  dlong *  scatterIds,
-                   @restrict const  long long int *  q,
-                   @restrict long long int *  scatterq){
+                   @restrict const  hlong *  q,
+                   @restrict hlong *  scatterq){
 
   for(dlong s=0;s<Nscatter;++s;@tile(256,@outer,@inner)){
 
-    const long long int qs = q[s];
+    const hlong qs = q[s];
     
     const dlong start = scatterStarts[s];
     const dlong end = scatterStarts[s+1];
@@ -103,4 +103,4 @@ SOFTWARE.
       scatterq[id] = qs;
     }
   }
-}
\ No newline at end of file
+}
diff --git a/3rd_party/gslib/ogs/okl/scatterMany.okl b/3rd_party/gslib/ogs/okl/scatterMany.okl
index a39a9c290..a47e37171 100644
--- a/3rd_party/gslib/ogs/okl/scatterMany.okl
+++ b/3rd_party/gslib/ogs/okl/scatterMany.okl
@@ -109,15 +109,15 @@ SOFTWARE.
                              const  dlong sstride,
                    @restrict const  dlong *  scatterStarts,
                    @restrict const  dlong *  scatterIds,
-                   @restrict const  long long int *  q,
-                   @restrict long long int *  scatterq){
+                   @restrict const  hlong *  q,
+                   @restrict hlong *  scatterq){
 
   for(dlong s=0;s<Nscatter*Nentries;++s;@tile(256,@outer,@inner)){
 
     const dlong sid = s%Nscatter;
     const int k = s/Nscatter;
     
-    const long long int qs = q[sid+k*stride];
+    const hlong qs = q[sid+k*stride];
 
     const dlong start = scatterStarts[sid];
     const dlong end = scatterStarts[sid+1];
diff --git a/3rd_party/gslib/ogs/okl/scatterVec.okl b/3rd_party/gslib/ogs/okl/scatterVec.okl
index 96285c928..00d1e5c21 100644
--- a/3rd_party/gslib/ogs/okl/scatterVec.okl
+++ b/3rd_party/gslib/ogs/okl/scatterVec.okl
@@ -98,12 +98,12 @@ SOFTWARE.
                              const   int Nentries,
                    @restrict const  dlong *  scatterStarts,
                    @restrict const  dlong *  scatterIds,
-                   @restrict const  long long int *  q,
-                   @restrict long long int *  scatterq){
+                   @restrict const  hlong *  q,
+                   @restrict hlong *  scatterq){
 
   for(dlong s=0;s<Nscatter*Nentries;++s;@tile(256,@outer,@inner)){
 
-    const long long int qs = q[s];
+    const hlong qs = q[s];
     
     const dlong sid = s/Nentries;
     const int k = s%Nentries;
@@ -115,4 +115,4 @@ SOFTWARE.
       scatterq[id*Nentries+k] = qs;
     }
   }
-}
\ No newline at end of file
+}
diff --git a/3rd_party/gslib/ogs/src/ogsGather.cpp b/3rd_party/gslib/ogs/src/ogsGather.cpp
index f25864f3a..52fcebcc8 100644
--- a/3rd_party/gslib/ogs/src/ogsGather.cpp
+++ b/3rd_party/gslib/ogs/src/ogsGather.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsGatherMany.cpp b/3rd_party/gslib/ogs/src/ogsGatherMany.cpp
index b3378af29..93e8ce9c7 100644
--- a/3rd_party/gslib/ogs/src/ogsGatherMany.cpp
+++ b/3rd_party/gslib/ogs/src/ogsGatherMany.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp b/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp
index 3fb74eff2..b1384127d 100644
--- a/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp
+++ b/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp b/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp
index 16ef0305c..b2672967d 100644
--- a/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp
+++ b/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp b/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp
index 50f99933c..82edfdac1 100644
--- a/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp
+++ b/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsGatherVec.cpp b/3rd_party/gslib/ogs/src/ogsGatherVec.cpp
index 512feb6b9..72a55139b 100644
--- a/3rd_party/gslib/ogs/src/ogsGatherVec.cpp
+++ b/3rd_party/gslib/ogs/src/ogsGatherVec.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsHostSetup.c b/3rd_party/gslib/ogs/src/ogsHostSetup.c
index f48f6ae79..515825003 100644
--- a/3rd_party/gslib/ogs/src/ogsHostSetup.c
+++ b/3rd_party/gslib/ogs/src/ogsHostSetup.c
@@ -33,7 +33,7 @@ SOFTWARE.
 
 #include "gslib.h"
 
-#include "types.h"
+#include "ogstypes.h"
 
 void *ogsHostSetup(MPI_Comm meshComm,
                    dlong NuniqueBases,
diff --git a/3rd_party/gslib/ogs/src/ogsKernels.cpp b/3rd_party/gslib/ogs/src/ogsKernels.cpp
index 5d64db3ef..ba99973fc 100644
--- a/3rd_party/gslib/ogs/src/ogsKernels.cpp
+++ b/3rd_party/gslib/ogs/src/ogsKernels.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 
@@ -173,8 +174,8 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) {
   if(sizeof(dlong)==4){
    ogs::kernelInfo["defines/" "dlong"]="int";
   }
-  if(sizeof(dlong)==8){
-   ogs::kernelInfo["defines/" "dlong"]="long long int";
+  if(sizeof(hlong)==8){
+   ogs::kernelInfo["defines/" "hlong"]="long long int";
   }
 
   if(sizeof(dfloat) == sizeof(double)){
@@ -188,6 +189,10 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) {
 
   if(device.mode()=="OpenCL"){
    //ogs::kernelInfo["compiler_flags"] += "-cl-opt-disable";
+   ogs::kernelInfo["compiler_flags"] += " -cl-std=CL2.0 ";
+   ogs::kernelInfo["compiler_flags"] += " -cl-strict-aliasing ";
+   ogs::kernelInfo["compiler_flags"] += " -cl-mad-enable ";
+   ogs::kernelInfo["defines/" "hlong"]="long";
   }
 
   if(device.mode()=="CUDA"){ // add backend compiler optimization for CUDA
diff --git a/3rd_party/gslib/ogs/src/ogsMappedAlloc.cpp b/3rd_party/gslib/ogs/src/ogsMappedAlloc.cpp
index 0e876b20b..51ff5d63a 100644
--- a/3rd_party/gslib/ogs/src/ogsMappedAlloc.cpp
+++ b/3rd_party/gslib/ogs/src/ogsMappedAlloc.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 
 void *ogsHostMallocPinned(occa::device &device, size_t size, void *source, occa::memory &mem, occa::memory &h_mem){
diff --git a/3rd_party/gslib/ogs/src/ogsScatter.cpp b/3rd_party/gslib/ogs/src/ogsScatter.cpp
index ddef43245..3e8d40afc 100644
--- a/3rd_party/gslib/ogs/src/ogsScatter.cpp
+++ b/3rd_party/gslib/ogs/src/ogsScatter.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsScatterMany.cpp b/3rd_party/gslib/ogs/src/ogsScatterMany.cpp
index 1adf28ab3..53332ecae 100644
--- a/3rd_party/gslib/ogs/src/ogsScatterMany.cpp
+++ b/3rd_party/gslib/ogs/src/ogsScatterMany.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsScatterVec.cpp b/3rd_party/gslib/ogs/src/ogsScatterVec.cpp
index f3de2cb76..70f404345 100644
--- a/3rd_party/gslib/ogs/src/ogsScatterVec.cpp
+++ b/3rd_party/gslib/ogs/src/ogsScatterVec.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/ogsSetup.cpp b/3rd_party/gslib/ogs/src/ogsSetup.cpp
index 6c8568b1b..acfa0986a 100644
--- a/3rd_party/gslib/ogs/src/ogsSetup.cpp
+++ b/3rd_party/gslib/ogs/src/ogsSetup.cpp
@@ -24,6 +24,7 @@ SOFTWARE.
 
 */
 
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
diff --git a/3rd_party/gslib/ogs/src/oogs.cpp b/3rd_party/gslib/ogs/src/oogs.cpp
index 1539c5f61..3a543922e 100644
--- a/3rd_party/gslib/ogs/src/oogs.cpp
+++ b/3rd_party/gslib/ogs/src/oogs.cpp
@@ -1,9 +1,11 @@
 #include <limits>
+#include <list>
 #include <occa.hpp>
+
+#include "ogstypes.h"
 #include "ogs.hpp"
 #include "ogsKernels.hpp"
 #include "ogsInterface.h"
-#include <list>
 
 //#define DISABLE_OOGS
 //#define OGS_ENABLE_TIMER
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f506e8f56..93f6dc2a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,7 +161,9 @@ include(config/nek5000.cmake)
 # Definitions for libP, gslib, and blaslapack
 ###############################################################################
 
-include(config/libp.cmake)
+
+include(config/mesh.cmake)
+include(config/elliptic.cmake)
 include(config/gslib.cmake)
 include(config/blaslapack.cmake)
 
@@ -170,37 +172,43 @@ include(config/blaslapack.cmake)
 ###############################################################################
 
 set(SRC 
-    src/nekrs.cpp
+    src/lib/nekrs.cpp
+    src/io/writeFld.cpp
+    src/core/utils/mysort.cpp
+    src/core/utils/parallelSort.cpp
+    src/core/utils/occaHelpers.cpp
+    src/core/utils/tinyexpr.c
+    src/core/setupAide.cpp
     src/core/cfl.cpp
     src/core/filter.cpp
     src/core/bcMap.cpp
-    src/core/insSetup.cpp
-    src/core/runTime.cpp
-    src/core/tombo.cpp
-    src/core/cds.cpp
-    src/udf/udf.cpp
-    src/mesh/meshSetup.cpp
-    src/mesh/meshNekReader.cpp
-    src/mesh/meshPhysicalNodesHex3D.cpp
-    src/mesh/meshParallelConnectNodes.cpp
-    src/core/occaDeviceConfig.cpp
-    src/nekInterface/nekInterfaceAdapter.cpp
+    src/core/setup.cpp
+    src/timeStepper/runTime.cpp
+    src/lns/tombo.cpp
+    src/cds/cds.cpp
     src/core/parReader.cpp
     src/core/configReader.cpp
-    src/core/tinyexpr.c
     src/core/timer.cpp
     src/linAlg/linAlg.cpp
+    src/linAlg/matrixConditionNumber.cpp
+    src/linAlg/matrixInverse.cpp
+    src/linAlg/matrixEig.cpp
+    src/linAlg/matrixTranspose.cpp
+    src/linAlg/matrixRightSolve.cpp
     src/plugins/avg.cpp
     src/plugins/velRecycling.cpp
     src/plugins/RANSktau.cpp
     src/plugins/lowMach.cpp
-    ## To get fortran flags
-    src/dummy.f
-    ${BLASLAPACK_SOURCES}
-    ${OGS_SOURCES}
-    ${LIBP_SOURCES}
+    src/udf/udf.cpp
+    src/nekInterface/nekInterfaceAdapter.cpp
+    src/linearSolver/NBFPCG.cpp
+    src/linearSolver/NBPCG.cpp
+    src/linearSolver/PCG.cpp
+    ${MESH_SOURCES}
     ${PARALMOND_SOURCES}
     ${ELLIPTIC_SOURCES}
+    ${BLASLAPACK_SOURCES}
+    ${OGS_SOURCES}
 )
 
 add_library(nekrs-lib SHARED ${SRC})
@@ -210,26 +218,24 @@ target_link_libraries(nekrs-lib PUBLIC libocca PRIVATE HYPRE ${GSLIB})
 target_compile_definitions(nekrs-lib PUBLIC
     ${LIBP_DEFINES}
     -DDOGS="${CMAKE_INSTALL_PREFIX}/gatherScatter"
-    -DDHOLMES="${CMAKE_INSTALL_PREFIX}/libparanumal" 
-    -DDPARALMOND="${CMAKE_INSTALL_PREFIX}/parAlmond" 
-    -DDELLIPTIC="${CMAKE_INSTALL_PREFIX}/elliptic" 
     -DHYPRE)
 
 target_include_directories(nekrs-lib 
   PUBLIC 
-  src
-  src/mesh
   src/core
+  src/core/utils
+  src/io
   src/udf
   src/linAlg
+  src/timeStepper
+  src/lns
+  src/cds
+  ${MESH_SOURCE_DIR}
   ${NEKINTERFACEDIR}
   ${OGS_SOURCE_DIR}/include
   ${OGS_SOURCE_DIR}
-  ${LIBP_SOURCE_DIR}/include
   ${ELLIPTIC_SOURCE_DIR}
-  ${PARALMOND_SOURCE_DIR}/include
   ${PARALMOND_SOURCE_DIR}
-  ${PARALMOND_SOURCE_DIR}/hypre
   PRIVATE
   ${HYPRE_SOURCE_DIR}/src
   ${HYPRE_SOURCE_DIR}/src/utilities
@@ -244,15 +250,10 @@ target_include_directories(nekrs-lib
 target_link_libraries(nekrs-lib PUBLIC libocca PRIVATE gs HYPRE)
 
 add_executable(nekrs-bin src/main.cpp)
+target_include_directories(nekrs-bin PRIVATE src/lib)
 set_target_properties(nekrs-bin PROPERTIES LINKER_LANGUAGE CXX OUTPUT_NAME nekrs)
 target_link_libraries(nekrs-bin nekrs-lib)
 
-set(ELLIPTIC_MAIN ${ELLIPTIC_SOURCE_DIR}/src/ellipticMain.c)
-set_source_files_properties(${ELLIPTIC_MAIN} PROPERTIES LANGUAGE CXX)
-add_executable(ellipticMain ${ELLIPTIC_MAIN})
-set_target_properties(ellipticMain PROPERTIES LINKER_LANGUAGE CXX EXCLUDE_FROM_ALL 1)
-target_link_libraries(ellipticMain nekrs-lib)
-
 #################################################################################
 ### Install                                                                     #
 #################################################################################
@@ -267,8 +268,8 @@ install(TARGETS nekrs-lib nekrs-bin
 # Trailing slash prevents parent directory from being copied
 install(DIRECTORY scripts/ DESTINATION bin
   FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
-install(DIRECTORY okl/ DESTINATION okl FILES_MATCHING REGEX "\.okl$")
-install(DIRECTORY src/ DESTINATION include FILES_MATCHING REGEX "\.hpp$|\.h$")
+install(DIRECTORY okl/ DESTINATION okl FILES_MATCHING REGEX "\.okl$|\.c$")
+install(DIRECTORY src/ DESTINATION include FILES_MATCHING REGEX "\.hpp$|\.h$|\.tpp$")
 install(FILES src/udf/CMakeLists.txt DESTINATION udf)
 install(DIRECTORY src/nekInterface/ DESTINATION nekInterface REGEX "\.hpp$|\.cpp$" EXCLUDE)
 
diff --git a/README.md b/README.md
index 5ee131cad..1cd52f066 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![Build Status](https://travis-ci.org/Nek5000/nekRS.svg?branch=master)](https://travis-ci.org/Nek5000/nekRS)
 [![License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://opensource.org/licenses/BSD-3-Clause)
 
-**nekRS** is an open-source Navier Stokes solver based on the spectral element method targeting modern processors and accelerators. The code uses the [CEED](https://ceed.exascaleproject.org/) software products [OCCA](https://github.com/libocca/occa) and [libParanumal](https://github.com/paranumal/libparanumal).
+**nekRS** is an open-source Navier Stokes solver based on the spectral element method targeting classical processors and hardware accelerators like GPUs. The code is a fork of [libParanumal](https://github.com/paranumal/libparanumal) tailored to our needs. For portable programming [OCCA](https://github.com/libocca/occa) is used.  
 
 Capabilities:
 
@@ -18,7 +18,7 @@ Capabilities:
 * LES and RANS turbulence models
 * VisIt & Paraview support for data analysis and visualization
 
-Note, the code is an early prototype so it's very likely that you run into undiscovered issues. Moreover it's evolving quickly so things might change from one version to another without being backward compatible. 
+Note, the code is a prototype so it's very likely that you run into undiscovered issues. Moreover it's evolving quickly so things might change from one version to another without being backward compatible. 
 
 
 ## Build Instructions
@@ -67,14 +67,21 @@ then type `source $HOME/.bash_profile` in the current terminal window.
 
 ```sh
 cd $NEKRS_HOME/examples/ethier
+nrspre ethier 2 # optional step to JIT precompile
 nrsmpi ethier 2 # run on two MPI ranks
 ```
-## Contributing
+You may have to adjust the example launch scripts `nrsmpi/nrsbmpi` to your environment. 
+
+## Performance Considerations
+For good performance, a GPU with high memory bandwidth and FP64 support (in hardware) is required.
+Most desktop/gaming GPUs do not meet these requirements.
+It is recommended to pin MPI tasks (using e.g. numactl) to ensure the correct CPU/GPU binding.
+Moreover, a GPU-enabled MPI implementation should be used if available (set OGS_MPI_SUPPORT=1).
 
+## Contributing
 Our project is hosted on [GitHub](https://github.com/Nek5000/nekRS) and everbody is welcome to become a part of it. If you are planning a large contribution, we encourage you to discuss the concept here on GitHub and interact with us frequently to ensure that your effort is well-directed.
 
 ## Troubleshooting
-
 If you run into problems compiling, installing, or running nekRS, please send a message to the User's Group [mailing list](https://groups.google.com/forum/#!forum/nekRS). Please [sign up](https://groups.google.com/forum/#!forum/nekRS/join) to post your questions, concerns or suggestions.
 
 ## Reporting Bugs
diff --git a/RELEASE.md b/RELEASE.md
index a01311505..872a6a766 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -2,19 +2,24 @@
 
 ## What is new? 
 
-* Additive V-cycle with overlapping coarse grid solve
 * (Chebyshev accelerated) ASM and RAS smoother
-* Residual projection
+* Improved gs performance
+* Initial guess projection
 * Runtime averages
-* Improved CPU performance
+* Stress formulation
 * Various bug fixes 
 
 ## What you may have to change to be compatible 
 
 * common block SCRNS was replaced by pointer array NRSSCPTR (see ethier example) 
+* update boundary device function names and bc struct members in oudf (e.g. insVelocityDirichletConditions3D -> velocityDirichletConditions, bc->uP -> bc->u)
+* remove copyTo() call to get nek IC from UDF_Setup() 
+* call nek_ocopyTo(time) after nek_userchk() if you want to modify the solution before writing to file
 
 ## Known Bugs 
 
+* [166](https://github.com/Nek5000/nekRS/issues/166)
+
 ## Thanks to our Contributors
 
 We are grateful to all who added new features, filed issues or helped resolve them, 
diff --git a/config/elliptic.cmake b/config/elliptic.cmake
new file mode 100644
index 000000000..e13498ffc
--- /dev/null
+++ b/config/elliptic.cmake
@@ -0,0 +1,55 @@
+set(ELLIPTIC_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/elliptic)
+
+set(ELLIPTIC_SOURCES
+        ${ELLIPTIC_SOURCE_DIR}/ellipticBuildContinuous.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticBuildContinuousGalerkin.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticBuildIpdg.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticBuildJacobi.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticHaloExchange.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticKernelInfo.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticBuildMultigridLevelFine.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticBuildMultigridLevel.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridLevel.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridLevelSetup.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridSchwarz.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridSetup.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticOperator.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticPreconditioner.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticPreconditionerSetup.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticResidualProjection.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticScaledAdd.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticSolve.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticSolveSetup.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticUpdateNBFPCG.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticUpdateNBPCG.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticUpdatePCG.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticVectors.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticWeightedInnerProduct.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticWeightedNorm2.cpp
+        ${ELLIPTIC_SOURCE_DIR}/ellipticZeroMean.cpp)
+
+set(PARALMOND_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/elliptic/parAlmond)
+
+set(PARALMOND_SOURCES
+        ${PARALMOND_SOURCE_DIR}/crs_hypre.cpp
+        ${PARALMOND_SOURCE_DIR}/SpMV.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgLevel.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/agmgSetup.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/constructProlongation.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/formAggregates.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/galerkinProd.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/strongGraph.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/transpose.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSmoother.cpp
+        ${PARALMOND_SOURCE_DIR}/coarseSolver.cpp
+        ${PARALMOND_SOURCE_DIR}/kernels.cpp
+        ${PARALMOND_SOURCE_DIR}/level.cpp
+        ${PARALMOND_SOURCE_DIR}/matrix.cpp
+        ${PARALMOND_SOURCE_DIR}/multigrid.cpp
+        ${PARALMOND_SOURCE_DIR}/parAlmond.cpp
+        ${PARALMOND_SOURCE_DIR}/pcg.cpp
+        ${PARALMOND_SOURCE_DIR}/pgmres.cpp
+        ${PARALMOND_SOURCE_DIR}/solver.cpp
+        ${PARALMOND_SOURCE_DIR}/timer.cpp
+        ${PARALMOND_SOURCE_DIR}/utils.cpp
+        ${PARALMOND_SOURCE_DIR}/vector.cpp)
diff --git a/config/gslib.cmake b/config/gslib.cmake
index 15f069dd3..a55f243f4 100644
--- a/config/gslib.cmake
+++ b/config/gslib.cmake
@@ -67,6 +67,8 @@ set(OGS_SOURCES
         ${OGS_SOURCE_DIR}/src/ogsSetup.cpp
         ${OGS_SOURCE_DIR}/src/oogs.cpp)
 
+set(file_pattern "\.cu$|\.hip$|\.okl$|\.c$|\.hpp$|\.tpp$|\.h$")
+
 install(DIRECTORY
         ${OGS_SOURCE_DIR}/include
         ${OGS_SOURCE_DIR}/okl
diff --git a/config/libp.cmake b/config/libp.cmake
deleted file mode 100644
index 2d6df8f2b..000000000
--- a/config/libp.cmake
+++ /dev/null
@@ -1,161 +0,0 @@
-set(LIBP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/libP)
-set(PARALMOND_SOURCE_DIR ${LIBP_SOURCE_DIR}/parAlmond)
-set(ELLIPTIC_SOURCE_DIR ${LIBP_SOURCE_DIR}/solvers/elliptic)
-
-set(LIBP_SOURCES
-        ${LIBP_SOURCE_DIR}/src/hash.c
-        ${LIBP_SOURCE_DIR}/src/matrixConditionNumber.c
-        ${LIBP_SOURCE_DIR}/src/matrixInverse.c
-        ${LIBP_SOURCE_DIR}/src/matrixEig.cpp
-        ${LIBP_SOURCE_DIR}/src/matrixTranspose.cpp
-        ${LIBP_SOURCE_DIR}/src/matrixRightSolve.cpp
-        ${LIBP_SOURCE_DIR}/src/meshBasis1D.cpp
-        ${LIBP_SOURCE_DIR}/src/meshBasisHex3D.cpp
-        ${LIBP_SOURCE_DIR}/src/meshApplyElementMatrix.c
-        ${LIBP_SOURCE_DIR}/src/meshConnect.c
-        ${LIBP_SOURCE_DIR}/src/meshConnectBoundary.c
-        ${LIBP_SOURCE_DIR}/src/meshConnectFaceNodes2D.c
-        ${LIBP_SOURCE_DIR}/src/meshConnectFaceNodes3D.c
-        ${LIBP_SOURCE_DIR}/src/meshConnectPeriodicFaceNodes2D.c
-        ${LIBP_SOURCE_DIR}/src/meshConnectPeriodicFaceNodes3D.c
-        ${LIBP_SOURCE_DIR}/src/meshFree.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricFactorsHex3D.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricFactorsQuad2D.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricFactorsQuad3D.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricFactorsTet3D.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricFactorsTri2D.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricFactorsTri3D.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricPartition2D.c
-        ${LIBP_SOURCE_DIR}/src/meshGeometricPartition3D.c
-        ${LIBP_SOURCE_DIR}/src/meshHaloExchange.c
-        ${LIBP_SOURCE_DIR}/src/meshHaloExtract.c
-        ${LIBP_SOURCE_DIR}/src/meshHaloSetup.c
-        ${LIBP_SOURCE_DIR}/src/meshLoadReferenceNodesHex3D.c
-        ${LIBP_SOURCE_DIR}/src/meshLoadReferenceNodesQuad2D.c
-        ${LIBP_SOURCE_DIR}/src/meshLoadReferenceNodesTet3D.c
-        ${LIBP_SOURCE_DIR}/src/meshLoadReferenceNodesTri2D.c
-        ${LIBP_SOURCE_DIR}/src/meshOccaSetup2D.c
-        ${LIBP_SOURCE_DIR}/src/meshOccaSetup3D.c
-        ${LIBP_SOURCE_DIR}/src/meshOccaSetupQuad3D.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelConnectOpt.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelConsecutiveGlobalNumbering.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelGatherScatterSetup.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelReaderHex3D.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelReaderQuad2D.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelReaderQuad3D.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelReaderTet3D.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelReaderTri2D.c
-        ${LIBP_SOURCE_DIR}/src/meshParallelReaderTri3D.c
-        ${LIBP_SOURCE_DIR}/src/meshPartitionStatistics.c
-        ${LIBP_SOURCE_DIR}/src/meshPhysicalNodesQuad2D.c
-        ${LIBP_SOURCE_DIR}/src/meshPhysicalNodesQuad3D.c
-        ${LIBP_SOURCE_DIR}/src/meshPhysicalNodesTet3D.c
-        ${LIBP_SOURCE_DIR}/src/meshPhysicalNodesTri2D.c
-        ${LIBP_SOURCE_DIR}/src/meshPhysicalNodesTri3D.c
-        ${LIBP_SOURCE_DIR}/src/meshSurfaceGeometricFactorsHex3D.c
-        ${LIBP_SOURCE_DIR}/src/meshSurfaceGeometricFactorsQuad2D.c
-        ${LIBP_SOURCE_DIR}/src/meshSurfaceGeometricFactorsQuad3D.c
-        ${LIBP_SOURCE_DIR}/src/meshSurfaceGeometricFactorsTet3D.c
-        ${LIBP_SOURCE_DIR}/src/meshSurfaceGeometricFactorsTri2D.c
-        ${LIBP_SOURCE_DIR}/src/meshSurfaceGeometricFactorsTri3D.c
-        ${LIBP_SOURCE_DIR}/src/mysort.c
-        ${LIBP_SOURCE_DIR}/src/occaHostMallocPinned.c
-        ${LIBP_SOURCE_DIR}/src/parallelSort.c
-        ${LIBP_SOURCE_DIR}/src/readArray.c
-        ${LIBP_SOURCE_DIR}/src/setupAide.c
-        ${LIBP_SOURCE_DIR}/src/timer.c)
-
-set_source_files_properties(${LIBP_SOURCES} PROPERTIES LANGUAGE CXX)
-
-set(PARALMOND_SOURCES
-        ${PARALMOND_SOURCE_DIR}/hypre/hypre.c
-        ${PARALMOND_SOURCE_DIR}/src/SpMV.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgLevel.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgSetup/agmgSetup.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgSetup/constructProlongation.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgSetup/formAggregates.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgSetup/galerkinProd.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgSetup/strongGraph.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgSetup/transpose.cpp
-        ${PARALMOND_SOURCE_DIR}/src/agmgSmoother.cpp
-        ${PARALMOND_SOURCE_DIR}/src/coarseSolver.cpp
-        ${PARALMOND_SOURCE_DIR}/src/kernels.cpp
-        ${PARALMOND_SOURCE_DIR}/src/level.cpp
-        ${PARALMOND_SOURCE_DIR}/src/matrix.cpp
-        ${PARALMOND_SOURCE_DIR}/src/multigrid.cpp
-        ${PARALMOND_SOURCE_DIR}/src/parAlmond.cpp
-        ${PARALMOND_SOURCE_DIR}/src/pcg.cpp
-        ${PARALMOND_SOURCE_DIR}/src/pgmres.cpp
-        ${PARALMOND_SOURCE_DIR}/src/solver.cpp
-        ${PARALMOND_SOURCE_DIR}/src/timer.cpp
-        ${PARALMOND_SOURCE_DIR}/src/utils.cpp
-        ${PARALMOND_SOURCE_DIR}/src/vector.cpp)
-
-# ---------------------------------------------------------
-# libelliptic
-# ---------------------------------------------------------
-
-set(ELLIPTIC_SOURCES
-        ${ELLIPTIC_SOURCE_DIR}/src/NBFPCG.c
-        ${ELLIPTIC_SOURCE_DIR}/src/NBPCG.c
-        ${ELLIPTIC_SOURCE_DIR}/src/PCG.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticBuildContinuous.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticBuildContinuousGalerkin.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticBuildIpdg.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticBuildJacobi.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticHaloExchange.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticKernelInfo.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticBuildMultigridLevelFine.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticBuildMultigridLevel.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticMultiGridLevel.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticMultiGridLevelSetup.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticMultiGridSchwarz.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticMultiGridSetup.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticOperator.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticPreconditioner.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticPreconditionerSetup.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticResidualProjection.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticScaledAdd.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticSolve.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticSolveSetup.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticUpdateNBFPCG.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticUpdateNBPCG.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticUpdatePCG.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticVectors.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticWeightedInnerProduct.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticWeightedNorm2.c
-        ${ELLIPTIC_SOURCE_DIR}/src/ellipticZeroMean.c)
-
-set_source_files_properties(${ELLIPTIC_SOURCES} PROPERTIES LANGUAGE CXX)
-
-# ---------------------------------------------------------
-# install
-# ---------------------------------------------------------
-
-set(file_pattern "\.cu$|\.hip$|\.okl$|\.c$|\.hpp$|\.tpp$|\.h$|hex.*\.dat$")
-
-install(DIRECTORY 
-  ${LIBP_SOURCE_DIR}/include 
-  ${LIBP_SOURCE_DIR}/nodes
-  ${LIBP_SOURCE_DIR}/okl 
-  DESTINATION libparanumal
-  FILES_MATCHING REGEX ${file_pattern})
-
-install(DIRECTORY
-  ${PARALMOND_SOURCE_DIR}/include
-  ${PARALMOND_SOURCE_DIR}/okl
-  DESTINATION parAlmond
-  FILES_MATCHING REGEX ${file_pattern})
-install(FILES ${PARALMOND_SOURCE_DIR}/parAlmond.hpp DESTINATION gatherScatter)
-
-install(DIRECTORY
-  ${ELLIPTIC_SOURCE_DIR}/data
-  ${ELLIPTIC_SOURCE_DIR}/okl
-  DESTINATION elliptic
-  FILES_MATCHING REGEX ${file_pattern})
-install(FILES 
-  ${ELLIPTIC_SOURCE_DIR}/elliptic.h
-  ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGrid.h
-  ${ELLIPTIC_SOURCE_DIR}/ellipticResidualProjection.h
-  ${ELLIPTIC_SOURCE_DIR}/ellipticPrecon.h
-  DESTINATION gatherScatter)
diff --git a/config/mesh.cmake b/config/mesh.cmake
new file mode 100644
index 000000000..cae926aad
--- /dev/null
+++ b/config/mesh.cmake
@@ -0,0 +1,25 @@
+set(MESH_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/mesh)
+
+set(MESH_SOURCES
+    ${MESH_SOURCE_DIR}/meshSetup.cpp
+    ${MESH_SOURCE_DIR}/meshNekReader.cpp
+    ${MESH_SOURCE_DIR}/meshPhysicalNodesHex3D.cpp
+    ${MESH_SOURCE_DIR}/meshParallelConnectNodes.cpp
+    ${MESH_SOURCE_DIR}/meshBasis1D.cpp
+    ${MESH_SOURCE_DIR}/meshBasisHex3D.cpp
+    ${MESH_SOURCE_DIR}/meshApplyElementMatrix.cpp
+    ${MESH_SOURCE_DIR}/meshConnect.cpp
+    ${MESH_SOURCE_DIR}/meshConnectBoundary.cpp
+    ${MESH_SOURCE_DIR}/meshConnectFaceNodes3D.cpp
+    ${MESH_SOURCE_DIR}/meshConnectPeriodicFaceNodes3D.cpp
+    ${MESH_SOURCE_DIR}/meshFree.cpp
+    ${MESH_SOURCE_DIR}/meshGeometricFactorsHex3D.cpp
+    ${MESH_SOURCE_DIR}/meshHaloExchange.cpp
+    ${MESH_SOURCE_DIR}/meshHaloExtract.cpp
+    ${MESH_SOURCE_DIR}/meshHaloSetup.cpp
+    ${MESH_SOURCE_DIR}/meshLoadReferenceNodesHex3D.cpp
+    ${MESH_SOURCE_DIR}/meshOccaSetup3D.cpp
+    ${MESH_SOURCE_DIR}/meshParallelConsecutiveGlobalNumbering.cpp
+    ${MESH_SOURCE_DIR}/meshParallelGatherScatterSetup.cpp
+    ${MESH_SOURCE_DIR}/meshSurfaceGeometricFactorsHex3D.cpp
+    ${MESH_SOURCE_DIR}/meshParallelConnectOpt.cpp)
diff --git a/config/parAlmond.cmake b/config/parAlmond.cmake
new file mode 100644
index 000000000..ee339db9e
--- /dev/null
+++ b/config/parAlmond.cmake
@@ -0,0 +1,25 @@
+set(PARALMOND_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/parAlmond)
+
+set(PARALMOND_SOURCES
+        ${PARALMOND_SOURCE_DIR}/crs_hypre.cpp
+        ${PARALMOND_SOURCE_DIR}/SpMV.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgLevel.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/agmgSetup.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/constructProlongation.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/formAggregates.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/galerkinProd.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/strongGraph.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSetup/transpose.cpp
+        ${PARALMOND_SOURCE_DIR}/agmgSmoother.cpp
+        ${PARALMOND_SOURCE_DIR}/coarseSolver.cpp
+        ${PARALMOND_SOURCE_DIR}/kernels.cpp
+        ${PARALMOND_SOURCE_DIR}/level.cpp
+        ${PARALMOND_SOURCE_DIR}/matrix.cpp
+        ${PARALMOND_SOURCE_DIR}/multigrid.cpp
+        ${PARALMOND_SOURCE_DIR}/parAlmond.cpp
+        ${PARALMOND_SOURCE_DIR}/pcg.cpp
+        ${PARALMOND_SOURCE_DIR}/pgmres.cpp
+        ${PARALMOND_SOURCE_DIR}/solver.cpp
+        ${PARALMOND_SOURCE_DIR}/timer.cpp
+        ${PARALMOND_SOURCE_DIR}/utils.cpp
+        ${PARALMOND_SOURCE_DIR}/vector.cpp)
diff --git a/examples/conj_ht/conj_ht_ci.h b/examples/conj_ht/ci.inc
similarity index 87%
rename from examples/conj_ht/conj_ht_ci.h
rename to examples/conj_ht/ci.inc
index 088e0909e..f621f56f6 100644
--- a/examples/conj_ht/conj_ht_ci.h
+++ b/examples/conj_ht/ci.inc
@@ -1,5 +1,7 @@
 #include <math.h>
 
+static int ciMode = 0;
+
 #define PASS { if (rank == 0) printf("TESTS passed \n"); MPI_Finalize(); exit(0); }
 #define FAIL { if (rank == 0) printf("TESTS failed!\n"); MPI_Finalize(); exit(2); }
 
@@ -10,7 +12,7 @@ void ciSetup(MPI_Comm comm, setupAide &options)
   options.setArgs("POLYNOMIAL DEGREE", string("7"));
   options.setArgs("RESTART FROM FILE", string("0"));
   options.setArgs("TSTEPS FOR SOLUTION OUTPUT", "0");
-  options.setArgs("FINAL TIME", string("10"));
+  options.setArgs("END TIME", string("10"));
   options.setArgs("DT", string("2e-2"));
   options.setArgs("SUBCYCLING STEPS", string("0"));
   if (ciMode == 2) options.setArgs("SUBCYCLING STEPS", string("1"));
@@ -22,11 +24,11 @@ void ciSetup(MPI_Comm comm, setupAide &options)
   options.setArgs("VARIABLEPROPERTIES", "TRUE");
 }
 
-void ciTestErrors(ins_t *ins, dfloat time, int tstep)
+void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (tstep != ins->NtimeSteps) return;
+  if (!nrs->lastStep) return;
  
-  const int rank = ins->mesh->rank;
+  const int rank = nrs->mesh->rank;
 
   nek_ocopyFrom(time, tstep);
   nek_userchk();
diff --git a/examples/conj_ht/conj_ht.oudf b/examples/conj_ht/conj_ht.oudf
index 924c8a107..d48ac59b8 100644
--- a/examples/conj_ht/conj_ht.oudf
+++ b/examples/conj_ht/conj_ht.oudf
@@ -1,14 +1,14 @@
 // Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
+void velocityDirichletConditions(bcData *bc)
 {                                                                        
-  bc->uP = 4.0*bc->y*(1. - bc->y); 
-  bc->vP = 0.0; 
-  bc->wP = 0.0;
+  bc->u = 4.0*bc->y*(1. - bc->y); 
+  bc->v = 0.0; 
+  bc->w = 0.0;
 }
 
-void cdsDirichletConditions3D(bcData *bc)
+void scalarDirichletConditions(bcData *bc)
 {
-  bc->sP = 0.0;
+  bc->s = 0.0;
 }
 
 
diff --git a/examples/conj_ht/conj_ht.udf b/examples/conj_ht/conj_ht.udf
index e8f100f0e..0712f443a 100644
--- a/examples/conj_ht/conj_ht.udf
+++ b/examples/conj_ht/conj_ht.udf
@@ -4,29 +4,28 @@
 #include <math.h>
 #include "udf.hpp"
 
-static int ciMode = 0;
-#include "conj_ht_ci.h"
+#include "ci.inc"
 
 static occa::kernel cFillKernel;
 
 static int updateProperties = 1;
 
-void userq(ins_t *ins, dfloat time, occa::memory o_S, occa::memory o_FS)
+void userq(nrs_t *nrs, dfloat time, occa::memory o_S, occa::memory o_FS) // scalar source callback (set as udf.sEqnSource); writes o_FS, does not read time or o_S
 {
-  cds_t *cds   = ins->cds;
+  cds_t *cds   = nrs->cds;
   mesh_t *mesh = cds->mesh;
   const dfloat qvolFluid = 0.0;
   const dfloat qvolSolid = 1.0;
-  cFillKernel(mesh->Nelements, qvolFluid, qvolSolid, ins->o_elementInfo, o_FS);
+  cFillKernel(mesh->Nelements, qvolFluid, qvolSolid, nrs->o_elementInfo, o_FS); // presumably o_elementInfo picks the fluid or solid value per element -- confirm against the cFill kernel
 }
 
-void uservp(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_S,
+void uservp(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_S,
             occa::memory o_UProp, occa::memory o_SProp)
 {
-  cds_t *cds   = ins->cds;
+  cds_t *cds   = nrs->cds;
 
   if(updateProperties) {
-    if(ins->mesh->rank == 0) cout << "updating properties" << "\n";
+    if(nrs->mesh->rank == 0) cout << "updating properties" << "\n";
     const dfloat rho = 1.0;
     const dfloat mue = 1/1000.0;
     const dfloat rhoCpFluid = rho*1.0;
@@ -35,24 +34,24 @@ void uservp(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_S,
     const dfloat conSolid = 10*conFluid;
 
     // velocity 
-    const occa::memory o_mue = o_UProp.slice(0*ins->fieldOffset*sizeof(dfloat));
-    const occa::memory o_rho = o_UProp.slice(1*ins->fieldOffset*sizeof(dfloat));
-    cFillKernel(ins->mesh->Nelements, mue, 0, ins->o_elementInfo, o_mue);
-    cFillKernel(ins->mesh->Nelements, rho, 0, ins->o_elementInfo, o_rho);
+    const occa::memory o_mue = o_UProp.slice(0*nrs->fieldOffset*sizeof(dfloat));
+    const occa::memory o_rho = o_UProp.slice(1*nrs->fieldOffset*sizeof(dfloat));
+    cFillKernel(nrs->mesh->Nelements, mue, 0, nrs->o_elementInfo, o_mue);
+    cFillKernel(nrs->mesh->Nelements, rho, 0, nrs->o_elementInfo, o_rho);
     // temperature 
     const occa::memory o_con   = o_SProp.slice(0*cds->fieldOffset*sizeof(dfloat));
     const occa::memory o_rhoCp = o_SProp.slice(1*cds->fieldOffset*sizeof(dfloat));
-    cFillKernel(cds->mesh->Nelements, conFluid, conSolid, ins->o_elementInfo, o_con);
-    cFillKernel(cds->mesh->Nelements, rhoCpFluid, rhoCpSolid, ins->o_elementInfo, o_rhoCp);
+    cFillKernel(cds->mesh->Nelements, conFluid, conSolid, nrs->o_elementInfo, o_con);
+    cFillKernel(cds->mesh->Nelements, rhoCpFluid, rhoCpSolid, nrs->o_elementInfo, o_rhoCp);
     updateProperties = 0;
   }
 }
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs) // builds the "cFill" kernel and stores it in the file-static cFillKernel
 {
-  cFillKernel = udfBuildKernel(ins, "cFill");
+  cFillKernel = udfBuildKernel(nrs, "cFill"); // cFillKernel is what userq and uservp launch
 }
 
 void UDF_Setup0(MPI_Comm comm, setupAide &options)
@@ -61,20 +60,17 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options)
   if (ciMode) ciSetup(comm, options);
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs) // registers userq as udf.sEqnSource and uservp as udf.properties; nrs is not used in this body
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-
   udf.sEqnSource = &userq;
   udf.properties = &uservp;
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) // on output steps calls nek_ocopyFrom + nek_userchk; with ciMode nonzero also calls ciTestErrors
 {
-  if (ins->isOutputStep) {
+  if (nrs->isOutputStep) { // only when the current step is flagged as an output step
     nek_ocopyFrom(time, tstep);
     nek_userchk();
   }
-  if (ciMode) ciTestErrors(ins, time, tstep);
+  if (ciMode) ciTestErrors(nrs, time, tstep); // ciTestErrors returns immediately unless nrs->lastStep is set
 }
diff --git a/examples/eddy/eddy.box b/examples/eddy/eddy.box
deleted file mode 100644
index e835c3e86..000000000
--- a/examples/eddy/eddy.box
+++ /dev/null
@@ -1,20 +0,0 @@
-one3d.rea
--3                     spatial dimension
-1                     number of fields
-#========================================================
-#
-#    This gives a 9 x 1 box, periodic in x,
-#    here used for Rayleigh Benard convection.
-#
-#    Note that number of fields < 0 implies that box.rea
-#    will be ascii.
-#
-#========================================================
-#
-Box
--16 -16 -1          nelx,nely,nelz for Box)
-0 1 1               x0 x1 ratio  !! Rescaled in usrdat2().
-0 1 1               y0 y1 ratio 
-0 1 1               z0 z1 ratio 
-#P  ,P  ,P  ,P  ,E  ,E   BCs (3 characters each!)
-v  ,v  ,v  ,v  ,v  ,v   BCs (3 characters each!)
diff --git a/examples/eddy/eddy.co2 b/examples/eddy/eddy.co2
deleted file mode 100644
index 0bce2d3dc..000000000
Binary files a/examples/eddy/eddy.co2 and /dev/null differ
diff --git a/examples/eddy/eddy.oudf b/examples/eddy/eddy.oudf
deleted file mode 100644
index 54d69c9ef..000000000
--- a/examples/eddy/eddy.oudf
+++ /dev/null
@@ -1,88 +0,0 @@
-void exact(dfloat *uu, dfloat *vv, const dfloat xx, const dfloat yy,
-           const dfloat time, const dfloat visc)
-{
-      const dfloat u0 = 1.0; 
-      const dfloat v0 = 0.3;
-
-      dfloat a [2][5] = {
-        {-0.2, 0.25, 0, 0, 0},
-        {-0.2,  0.0, 0, 0, 0}
-      };
-
-      dfloat cpsi[2][5] = {
-        {0.0, 3.0, 0, 0, 0},
-        {5.0, 4.0, 0, 0, 0}
-      };
-
-      dfloat pi, aa, arg, e;
-      dfloat x, u, cx, sx;
-      dfloat c1, c2;
-      dfloat s1x, c1x, s2x, c2x;
-      dfloat s1y, c1y, s2y, c2y;
-      dfloat y, v, cy, sy;
-
-      int i,k;
-
-      pi    = M_PI;
-
-      aa    = cpsi[1][0]*cpsi[1][0];
-      arg   = -visc*time*aa;
-      e     = exp(arg);
-
-      x = xx - u0*time;
-      y = yy - v0*time;
-
-      sx = sin(cpsi[1][0]*x);
-      cx = cos(cpsi[1][0]*x);
-      sy = sin(cpsi[1][0]*y);
-      cy = cos(cpsi[1][0]*y);
-      u  = a[0][0]*cpsi[1][0]*cy; 
-      v  = a[1][0]*cpsi[1][0]*sx;
-
-      for(k=1; k<5; ++k){
-         s1x = sin(cpsi[0][k]*x);
-         c1x = cos(cpsi[0][k]*x);
-         s2x = sin(cpsi[1][k]*x);
-         c2x = cos(cpsi[1][k]*x);
-
-         s1y = sin(cpsi[0][k]*y);
-         c1y = cos(cpsi[0][k]*y);
-         s2y = sin(cpsi[1][k]*y);
-         c2y = cos(cpsi[1][k]*y);
-         
-         c1  = cpsi[0][k];
-         c2  = cpsi[1][k];
-
-         if (k==1) u = u + a[0][k]*s1x*c2y*c2;
-         if (k==1) v = v - a[0][k]*c1x*s2y*c1;
-         if (k==1) u = u - a[1][k]*s2x*c1y*c1;
-         if (k==1) v = v + a[1][k]*c2x*s1y*c2;
-
-         if (k==2) u = u - a[0][k]*s1x*c2y*c2;
-         if (k==2) v = v + a[0][k]*c1x*s2y*c1;
-         if (k==2) u = u - a[1][k]*c2x*c1y*c1;
-         if (k==2) v = v - a[1][k]*s2x*s1y*c2;
-
-         if (k==3) u = u + a[0][k]*c1x*c2y*c2;
-         if (k==3) v = v + a[0][k]*s1x*s2y*c1;
-         if (k==3) u = u + a[1][k]*c2x*c1y*c1;
-         if (k==3) v = v + a[1][k]*s2x*s1y*c2;
-
-         if (k==4) u = u - a[0][k]*s1x*c2y*c2;
-         if (k==4) v = v + a[0][k]*c1x*s2y*c1;
-         if (k==4) u = u - a[1][k]*s2x*c1y*c1;
-         if (k==4) v = v + a[1][k]*c2x*s1y*c2;
-      }
-      *uu = u*e + u0;
-      *vv = v*e + v0;
-}
-
-// Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
-{                                                                        
-  dfloat u, v;
-  exact(&u, &v, bc->x, bc->y, bc->time, p_nu);
-  bc->uP = u;
-  bc->vP = v;
-  bc->wP = 0;
-}
diff --git a/examples/eddy/eddy.par b/examples/eddy/eddy.par
deleted file mode 100644
index cdb596a2b..000000000
--- a/examples/eddy/eddy.par
+++ /dev/null
@@ -1,20 +0,0 @@
-[OCCA]
-backend = CUDA
-deviceNumber = LOCAL-RANK
-
-[GENERAL] 
-polynomialOrder = 7
-#startFrom = restart.fld
-numSteps = 1000
-dt = 1e-04
-timeStepper = tombo2
-writeInterval = 100
-
-[PRESSURE]
-residualTol = 1e-09
-
-[VELOCITY]
-boundaryTypeMap = inlet
-residualTol = 1e-12
-density = 1.0
-viscosity = 0.05 
diff --git a/examples/eddy/eddy.re2 b/examples/eddy/eddy.re2
deleted file mode 100644
index 958b484cb..000000000
Binary files a/examples/eddy/eddy.re2 and /dev/null differ
diff --git a/examples/eddy/eddy.udf b/examples/eddy/eddy.udf
deleted file mode 100644
index e490550b1..000000000
--- a/examples/eddy/eddy.udf
+++ /dev/null
@@ -1,33 +0,0 @@
-//
-// nekRS User Defined File
-//
-
-#include <math.h>
-#include "udf.hpp"
-
-/* UDF Functions */                                                      
-
-void UDF_LoadKernels(ins_t *ins)
-{
-  occa::properties& kernelInfo = *ins->kernelInfo;
-  setupAide &options = ins->options;
-
-  dfloat mue, rho;
-  options.getArgs("VISCOSITY", mue);
-  options.getArgs("DENSITY", rho); 
-  kernelInfo["defines/p_nu"] = mue/rho;
-}
-
-void UDF_Setup(ins_t *ins)
-{
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-}
-
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
-{
-  if (ins->isOutputStep) {
-    nek_ocopyFrom(time, tstep);
-    nek_userchk();
-  }
-}
diff --git a/examples/eddy/eddy.usr b/examples/eddy/eddy.usr
deleted file mode 100644
index ab7e0fead..000000000
--- a/examples/eddy/eddy.usr
+++ /dev/null
@@ -1,318 +0,0 @@
-c-----------------------------------------------------------------------
-      subroutine exact(uu,vv,xx,yy,n,time,visc,u0,v0)
-c
-c     This routine creates initial conditions for an exact solution
-c     to the Navier-Stokes equations based on the paper of Walsh,
-c     with an additional translational velocity (u0,v0).
-c     
-c     The computational domain is [0,2pi]^2 with doubly-periodic 
-c     boundary conditions.
-c
-c
-      include 'SIZE'
-      include 'INPUT'
-c
-      real uu(n),vv(n),xx(n),yy(n)
-c
-      real cpsi(2,5), a(2,5)
-      save cpsi     , a
-
-c     data a / .4,.45 , .4,.2 , -.2,-.1 , .2,.05, -.09,-.1 / ! See eddy.m
-c     data cpsi / 0,65 , 16,63 , 25,60 , 33,56 , 39,52 /     ! See squares.f
-c     data cpsi / 0,85 , 13,84 , 36,77 , 40,75 , 51,68 /
-
-
-c     This data from Walsh's Figure 1 [1]:
-
-      data a / -.2,-.2, .25,0.,   0,0  ,  0,0  ,  0,0  /
-      data cpsi / 0, 5 ,  3, 4 ,  0,0  ,  0,0  ,  0,0  /
-
-      one   = 1.
-      pi    = 4.*atan(one)
-
-      aa    = cpsi(2,1)**2
-      arg   = -visc*time*aa  ! domain is [0:2pi]
-      e     = exp(arg)
-c
-c     ux = psi_y,  uy = -psi_x
-c
-      do i=1,n
-         x = xx(i) - u0*time
-         y = yy(i) - v0*time
-
-         sx = sin(cpsi(2,1)*x)
-         cx = cos(cpsi(2,1)*x)
-         sy = sin(cpsi(2,1)*y)
-         cy = cos(cpsi(2,1)*y)
-         u  =  a(1,1)*cpsi(2,1)*cy 
-         v  =  a(2,1)*cpsi(2,1)*sx
-
-         do k=2,5
-            s1x = sin(cpsi(1,k)*x)
-            c1x = cos(cpsi(1,k)*x)
-            s2x = sin(cpsi(2,k)*x)
-            c2x = cos(cpsi(2,k)*x)
-
-            s1y = sin(cpsi(1,k)*y)
-            c1y = cos(cpsi(1,k)*y)
-            s2y = sin(cpsi(2,k)*y)
-            c2y = cos(cpsi(2,k)*y)
-            
-            c1  = cpsi(1,k)
-            c2  = cpsi(2,k)
-
-            if (k.eq.2) u = u + a(1,k)*s1x*c2y*c2
-            if (k.eq.2) v = v - a(1,k)*c1x*s2y*c1
-            if (k.eq.2) u = u - a(2,k)*s2x*c1y*c1
-            if (k.eq.2) v = v + a(2,k)*c2x*s1y*c2
-
-            if (k.eq.3) u = u - a(1,k)*s1x*c2y*c2
-            if (k.eq.3) v = v + a(1,k)*c1x*s2y*c1
-            if (k.eq.3) u = u - a(2,k)*c2x*c1y*c1
-            if (k.eq.3) v = v - a(2,k)*s2x*s1y*c2
-
-            if (k.eq.4) u = u + a(1,k)*c1x*c2y*c2
-            if (k.eq.4) v = v + a(1,k)*s1x*s2y*c1
-            if (k.eq.4) u = u + a(2,k)*c2x*c1y*c1
-            if (k.eq.4) v = v + a(2,k)*s2x*s1y*c2
-
-            if (k.eq.5) u = u - a(1,k)*s1x*c2y*c2
-            if (k.eq.5) v = v + a(1,k)*c1x*s2y*c1
-            if (k.eq.5) u = u - a(2,k)*s2x*c1y*c1
-            if (k.eq.5) v = v + a(2,k)*c2x*s1y*c2
-         enddo
-         uu(i) = u*e + u0
-         vv(i) = v*e + v0
-      enddo
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine exactp(pe,x2,y2,n2,time,visc,u0,v0)
-
-c     This routine, complementary to the exact routine above, returns
-c     the exact pressure, given the pressure counterpart to the
-c     arguments for the exact routine i.e., xx -> x2, yy -> y2, n -> n2
-c
-c     Brandon E. Merrill, Yulia T. Peet, Paul F. Fischer, and
-c     James W. Lottes. "A Spectrally Accurate Method for Overlapping
-c     Grid Solution of Incompressible Navier-Stokes Equations." Journal
-c     of Computational Physics 307 (2016), 60--93.
-
-      real x2(n2),y2(n2),pe(n2)
-
-      e = exp(-50*time*visc)
-
-      do i=1,n2
-         x = x2(i) - u0*time
-         y = y2(i) - v0*time
-
-         pe(i) = (1.0/64.0)*e*(16*cos(6*x) + 8*cos(8*x-4*y)
-     $         - 32*cos(2*(x-2*y)) + 9*cos(8*y) - 8*cos(4*(2*x+y))
-     $         + 32*cos(2*(x+2*y)) - 4*sin(3*(x-3*y)) + 32*sin(5*(x-y))
-     $         + 36*sin(3*x-y) - 32*sin(5*(x+y)) + 36*sin(3*x+y)
-     $         - 4*sin(3*(x+3*y)))
-      enddo
-
-      call ortho(pe)
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine uservp (ix,iy,iz,ieg)
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-C
-      udiff =0.
-      utrans=0.
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine userf  (ix,iy,iz,ieg)
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-C
-      ffx = 0.0
-      ffy = 0.0
-      ffz = 0.0
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine userq  (ix,iy,iz,ieg)
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-C
-      qvol   = 0.0
-      source = 0.0
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine userchk
-      include 'SIZE'  
-      include 'TOTAL' 
-c
-      common /exacu/ ue(lx1,ly1,lz1,lelt),ve(lx1,ly1,lz1,lelt)
-      common /exacp/ pe(lx2,ly2,lz2,lelt)
-      common /exacd/ ud(lx1,ly1,lz1,lelt),vd(lx1,ly1,lz1,lelt)
-     $              ,pd(lx2,ly2,lz2,lelt)
-      common /mybc/ u_mybc(lx1,ly1,lz1,lelt),v_mybc(lx1,ly1,lz1,lelt)
-
-      common /SCNRS/ wrk(lx1*ly1*lz1*lelt*3)
-
-      ifield = 1  ! for outpost
-
-      n    = nx1*ny1*nz1*nelv
-      n2   = nx2*ny2*nz2*nelv
-      visc = param(2)
-      u0   = 1.0 !param(96)
-      v0   = 0.3 !param(97)
-
-      call ortho  (pr)
-
-      call exact  (ue,ve,xm1,ym1,n,time,visc,u0,v0)
-      call exactp (pe,xm2,ym2,n2,time,visc,u0,v0)
-
-      if (istep.eq.0) then        !  Reset velocity & pressure to eliminate
-         if(nid.eq.0) write(6,*) 'setting vx,vy,pr ', istep, time, visc 
-         call copy (vx,ue,n)      !  start-up contributions to
-         call copy (vy,ve,n)      !  temporal-accuracy behavior.
-         call copy (pr,pe,n2)
-      endif
-
-      if (istep.eq.0) call outpost(ue,ve,vx,pe,t,'   ')
-
-      call sub3   (ud,ue,vx,n)
-      call sub3   (vd,ve,vy,n)
-      call sub3   (pd,pe,pr,n2)
-
-      if (istep.eq.nsteps) call outpost(ud,vd,vx,pd,t,'err')
-
-      umx = glamax(vx,n)
-      vmx = glamax(vy,n)
-      pmx = glamax(pr,n2)
-      uex = glamax(ue,n)
-      vex = glamax(ve,n)
-      pex = glamax(pe,n2)
-      udx = glamax(ud,n)
-      vdx = glamax(vd,n)
-      pdx = glamax(pd,n2)
-
-      wrk(1) = udx
-      wrk(2) = vdx
-      wrk(3) = pdx
-
-      if (nid.eq.0) then
-         write(6,11) istep,time,udx,umx,uex,u0,'  X err'
-         write(6,11) istep,time,vdx,vmx,vex,v0,'  Y err'
-         write(6,12) istep,time,pdx,pmx,pex,'                P err'
-   11    format(i5,1p5e14.6,a7)
-   12    format(i5,1p4e14.6,a21)
-      endif
-
-c      future = time + dt   ! Assumes constant DT
-c      call exact  (u_mybc,v_mybc,xm1,ym1,n,future,visc,u0,v0)
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine userbc (i,j,k,f,eg)
-c     NOTE ::: This subroutine MAY NOT be called by every process
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-
-      call exitt
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine useric (ix,iy,iz,ieg)
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-
-      common /exacu/ ue(lx1,ly1,lz1,lelt),ve(lx1,ly1,lz1,lelt)
-      common /exacp/ pe(lx2,ly2,lz2,lelt)
-      common /exacd/ ud(lx1,ly1,lz1,lelt),vd(lx1,ly1,lz1,lelt)
-     $              ,pd(lx2,ly2,lz2,lelt)
-
-      integer icalld
-      save    icalld
-      data    icalld  /0/
-
-      n = nx1*ny1*nz1*nelv
-      if (icalld.eq.0) then
-         icalld = icalld + 1
-         time = 0.
-         u0   = 1.0 !param(96)
-         v0   = 0.3 !param(97)
-         call exact (ue,ve,xm1,ym1,n,time,visc,u0,v0)
-      endif
-
-      ie = gllel(ieg)
-      ux=ue(ix,iy,iz,ie)
-      uy=ve(ix,iy,iz,ie)
-      uz=0.0
-      temp=0
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine usrdat
-      include 'SIZE'
-      include 'TOTAL'
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine usrdat3
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine usrdat2
-      include 'SIZE'
-      include 'TOTAL'
-
-      one   = 1.0
-      zero  = 0.
-      zmax  = 1.0
-      twopi = 8.*atan(one)
-
-      call rescale_x(xm1,zero,twopi)
-      call rescale_x(ym1,zero,twopi)
-      call rescale_x(zm1,zero,zmax)
-
-      do iel=1,nelt
-      do ifc=5,6
-         cbc(ifc,iel,1) = 'P  ' ! required for kludge (see below)
-      enddo
-      enddo
-
-      do iel=1,nelt
-      do ifc=1,2*ndim
-         if (cbc(ifc,iel,1) .eq. 'v  ') boundaryID(ifc,iel) = 1
-      enddo
-      enddo
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine usrsetvert(glo_num,nel,nx,ny,nz) ! to modify glo_num
-      integer*8 glo_num(1)
-
-      ! kludge for periodic bc in z
-      nxy  = nx*ny
-      nxyz = nx*ny*nz
-      do iel = 1,nel
-         ioff = nxyz*(iel-1)
-         do ixy = 1,nxy
-            glo_num(ioff + nxy*(nz-1) + ixy) = glo_num(ioff + ixy)
-         enddo
-      enddo
-
-      return
-      end
-c-----------------------------------------------------------------------
diff --git a/examples/eddyPeriodic/eddy.udf b/examples/eddyPeriodic/eddy.udf
index 94235434f..ad6731b00 100644
--- a/examples/eddyPeriodic/eddy.udf
+++ b/examples/eddyPeriodic/eddy.udf
@@ -8,20 +8,14 @@
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs) // no-op: this example builds no user kernels
 {
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs) // no-op: the body is empty and registers no callbacks
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) // no-op: the body is empty, nothing runs per step
 {
-  if (ins->isOutputStep) {
-    nek_ocopyFrom(time, tstep);
-    nek_userchk();
-  }
 }
diff --git a/examples/ethier/ethier_ci.h b/examples/ethier/ci.inc
similarity index 80%
rename from examples/ethier/ethier_ci.h
rename to examples/ethier/ci.inc
index d573c98d5..9ec77e509 100644
--- a/examples/ethier/ethier_ci.h
+++ b/examples/ethier/ci.inc
@@ -1,5 +1,7 @@
 #include <math.h>
 
+static int ciMode = 0;
+
 #define PASS { if (rank == 0) printf("TESTS passed \n"); MPI_Finalize(); exit(0); }
 #define FAIL { if (rank == 0) printf("TESTS failed!\n"); MPI_Finalize(); exit(2); }
 
@@ -17,10 +19,15 @@ void ciSetup(MPI_Comm comm, setupAide &options)
   options.setArgs("SCALAR00 DENSITY", string("1"));
   options.setArgs("SCALAR01 DIFFUSIVITY", string("0.01"));
   options.setArgs("SCALAR01 DENSITY", string("1"));
-  options.setArgs("FINAL TIME", string("0.2"));
+  options.setArgs("END TIME", string("0.2"));
   options.setArgs("DT", string("2e-3"));
   options.setArgs("SUBCYCLING STEPS", string("0"));
   options.setArgs("PRESSURE RESIDUAL PROJECTION", "FALSE");
+
+  options.setArgs("VELOCITY BLOCK SOLVER", "FALSE");
+  options.setArgs("SCALAR INITIAL GUESS DEFAULT","PREVIOUS");
+  options.setArgs("VELOCITY INITIAL GUESS DEFAULT","PREVIOUS");
+
   if (ciMode == 2) {
     options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
     options.setArgs("SUBCYCLING STEPS", string("1"));
@@ -35,11 +42,11 @@ void ciSetup(MPI_Comm comm, setupAide &options)
   options.setArgs("VARIABLEPROPERTIES", "FALSE");
 }
 
-void ciTestErrors(ins_t *ins, dfloat time, int tstep)
+void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (tstep != ins->NtimeSteps) return;
+  if (!nrs->lastStep) return;
  
-  const int rank = ins->mesh->rank;
+  const int rank = nrs->mesh->rank;
  
   nek_ocopyFrom(time, tstep);
   nek_userchk();
@@ -54,15 +61,15 @@ void ciTestErrors(ins_t *ins, dfloat time, int tstep)
   int velIterErr;
 
   switch (ciMode) {
-    case 1 : velIterErr = abs(ins->NiterU - 10);
+    case 1 : velIterErr = abs(nrs->NiterU - 10);
              s1Err = abs((err[2] - 1.00E-11)/err[2]);
              s2Err = abs((err[3] - 1.31E-11)/err[3]);
-             pIterErr = abs(ins->NiterP - 4);
+             pIterErr = abs(nrs->NiterP - 4);
              break;
-    case 2 : velIterErr = abs(ins->NiterU - 10);
+    case 2 : velIterErr = abs(nrs->NiterU - 10);
              s1Err = abs((err[2] - 1.71E-11)/err[2]);
              s2Err = abs((err[3] - 2.00E-11)/err[3]);
-             pIterErr = abs(ins->NiterP - 1);
+             pIterErr = abs(nrs->NiterP - 1);
              break;
   }
 
diff --git a/examples/ethier/ethier.oudf b/examples/ethier/ethier.oudf
index d1385ca7d..22b035fb8 100644
--- a/examples/ethier/ethier.oudf
+++ b/examples/ethier/ethier.oudf
@@ -142,24 +142,25 @@ void grad_u(dfloat *grad_u_x, dfloat *grad_u_y, dfloat *grad_u_z,
 
 
 // Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
+void velocityDirichletConditions(bcData *bc) // sets bc->u, bc->v, bc->w from exact() evaluated at (bc->x, bc->y, bc->z, bc->time)
 {                                                                        
   dfloat u, v, w, p;
   exact(&u, &v, &w, &p, bc->x, bc->y, bc->z, bc->time, p_nu);
-  bc->uP = u;
-  bc->vP = v;
-  bc->wP = w;
+  bc->u = u;
+  bc->v = v;
+  bc->w = w; // the p value returned by exact() is not used in this function
 }
 
-void cdsDirichletConditions3D(bcData *bc)
+void scalarDirichletConditions(bcData *bc) // sets bc->s from exact() evaluated at (bc->x, bc->y, bc->z, bc->time)
 {                                                                        
   dfloat u, v, w, p;
   exact(&u, &v, &w, &p, bc->x, bc->y, bc->z, bc->time, p_nu);
-  bc->sP = u;
+  bc->s = u; // the scalar boundary value is the u component of exact(); v, w, p are unused
 }
-void cdsNeumannConditions3D(bcData *bc)
+
+void scalarNeumannConditions(bcData *bc) // sets bc->flux from the gradient returned by grad_u() and the normal (bc->nx, bc->ny, bc->nz)
 {                                                                        
   dfloat grad_u_x, grad_u_y, grad_u_z;
   grad_u(&grad_u_x, &grad_u_y, &grad_u_z, bc->x, bc->y, bc->z, bc->time, p_nu);
-  bc->sF = p_nu * (grad_u_x * bc->nx + grad_u_y * bc->ny + grad_u_z * bc->nz);
+  bc->flux = p_nu * (grad_u_x * bc->nx + grad_u_y * bc->ny + grad_u_z * bc->nz); // p_nu times (grad_u dot n)
 }
diff --git a/examples/ethier/ethier.udf b/examples/ethier/ethier.udf
index 53ddb8d75..e2120f4e4 100644
--- a/examples/ethier/ethier.udf
+++ b/examples/ethier/ethier.udf
@@ -6,28 +6,27 @@
 #include "udf.hpp"
 #include "casedata.h"
 
-static int ciMode = 0;
-#include "ethier_ci.h"
+#include "ci.inc"
 
 /* UDF Functions */   
 
 occa::kernel dpdxKernel; 
 occa::kernel exactUVWPKernel; 
 
-void userq(ins_t *ins, dfloat time, occa::memory o_S, occa::memory o_FS)
+void userq(nrs_t *nrs, dfloat time, occa::memory o_S, occa::memory o_FS) // scalar source callback (set as udf.sEqnSource); launches dpdxKernel on o_FS twice, o_S is not read here
 {
-  mesh_t *mesh = ins->mesh; 
-  cds_t *cds   = ins->cds; 
-  const dlong Nlocal = ins->mesh->Nelements * ins->mesh->Np;
+  mesh_t *mesh = nrs->mesh; 
+  cds_t *cds   = nrs->cds; 
+  const dlong Nlocal = nrs->mesh->Nelements * nrs->mesh->Np; // Nelements * Np; presumably the local point count -- confirm
 
   dpdxKernel(Nlocal, time, mesh->o_x, mesh->o_y, mesh->o_z, 0*cds->fieldOffset, o_FS);
   dpdxKernel(Nlocal, time, mesh->o_x, mesh->o_y, mesh->o_z, 1*cds->fieldOffset, o_FS);
 }                                                   
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
-  occa::properties& kernelInfo = *ins->kernelInfo;
-  setupAide &options = ins->options;
+  occa::properties& kernelInfo = *nrs->kernelInfo;
+  setupAide &options = nrs->options;
 
   dfloat mue, rho;
   options.getArgs("VISCOSITY", mue);
@@ -38,8 +37,8 @@ void UDF_LoadKernels(ins_t *ins)
   kernelInfo["defines/p_W0"] = P_W0;
   kernelInfo["defines/p_A"]  = P_A0 * M_PI;
   kernelInfo["defines/p_D"]  = P_D0 * M_PI;
-  dpdxKernel = udfBuildKernel(ins, "dpdx");
-  exactUVWPKernel = udfBuildKernel(ins, "exactUVWP"); 
+  dpdxKernel = udfBuildKernel(nrs, "dpdx");
+  exactUVWPKernel = udfBuildKernel(nrs, "exactUVWP"); 
 }
 
 void UDF_Setup0(MPI_Comm comm, setupAide &options)
@@ -48,28 +47,26 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options)
   if (ciMode) ciSetup(comm, options);
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs) // registers userq as udf.sEqnSource; nrs is not used in this body
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
   udf.sEqnSource = &userq;
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) // for tstep <= 5 reloads fields via exactUVWPKernel; on output steps runs nek_userchk; with ciMode nonzero runs ciTestErrors
 {
-  mesh_t *mesh = ins->mesh;
-  cds_t *cds = ins->cds;
+  mesh_t *mesh = nrs->mesh;
+  cds_t *cds = nrs->cds;
 
   if (tstep <= 5) {
-    exactUVWPKernel(ins->Nlocal, time, mesh->o_x, mesh->o_y, mesh->o_z, ins->fieldOffset, ins->o_P, ins->o_U);
-    ellipticZeroMean(ins->pSolver, ins->o_P);
-    cds->o_S.copyFrom(ins->o_U, ins->Nlocal*sizeof(dfloat), 0*cds->fieldOffset*sizeof(dfloat));
-    cds->o_S.copyFrom(ins->o_U, ins->Nlocal*sizeof(dfloat), 1*cds->fieldOffset*sizeof(dfloat));
+    exactUVWPKernel(nrs->Nlocal, time, mesh->o_x, mesh->o_y, mesh->o_z, nrs->fieldOffset, nrs->o_P, nrs->o_U); // presumably overwrites o_P and o_U with the exact solution at this time -- confirm against the exactUVWP kernel
+    ellipticZeroMean(nrs->pSolver, nrs->o_P); // presumably removes the mean from the pressure field -- confirm
+    cds->o_S.copyFrom(nrs->o_U, nrs->Nlocal*sizeof(dfloat), 0*cds->fieldOffset*sizeof(dfloat)); // copies Nlocal dfloats from the start of o_U; third argument is presumably the destination byte offset (scalar 0) -- confirm
+    cds->o_S.copyFrom(nrs->o_U, nrs->Nlocal*sizeof(dfloat), 1*cds->fieldOffset*sizeof(dfloat)); // same source data, offset by one cds->fieldOffset (scalar 1)
   }
 
-  if (ins->isOutputStep) {
+  if (nrs->isOutputStep) { // only when the current step is flagged as an output step
     nek_ocopyFrom(time, tstep);
     nek_userchk();
   }
-  if (ciMode) ciTestErrors(ins, time, tstep);
+  if (ciMode) ciTestErrors(nrs, time, tstep); // ciTestErrors returns immediately unless nrs->lastStep is set
 }
diff --git a/examples/gabls1/gabls.oudf b/examples/gabls1/gabls.oudf
index d074f4623..56e8ae952 100644
--- a/examples/gabls1/gabls.oudf
+++ b/examples/gabls1/gabls.oudf
@@ -23,20 +23,20 @@
 }
 
 // Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
+void velocityDirichletConditions(bcData *bc) // writes a constant Dirichlet velocity (1, 0, 0) into bc->u, bc->v, bc->w
 {                                                                        
-  bc->uP = 1.0;
-  bc->vP = 0;
-  bc->wP = 0;
+  bc->u = 1.0;
+  bc->v = 0;
+  bc->w = 0;
 }
+  bc->w = 0;
 }
 
-void cdsDirichletConditions3D(bcData *bc)
+void scalarDirichletConditions(bcData *bc) // sets bc->s = p_T0s + p_sCoolRate * bc->time - 1.0
 {
-  bc->sP = p_T0s;
-  bc->sP += p_sCoolRate * bc->time - 1.0;
+  bc->s = p_T0s;
+  bc->s += p_sCoolRate * bc->time - 1.0; // term linear in time plus a constant -1.0 shift
 }
 
-void cdsNeumannConditions3D(bcData *bc)
+void scalarNeumannConditions(bcData *bc) // writes the Neumann value into bc->flux
 {
-  bc->sF = p_cond*p_TsGrad;
+  bc->flux = p_cond*p_TsGrad; // product of two kernel defines; no dependence on position or time
 }
diff --git a/examples/gabls1/gabls.udf b/examples/gabls1/gabls.udf
index c410d3cea..b5069b302 100644
--- a/examples/gabls1/gabls.udf
+++ b/examples/gabls1/gabls.udf
@@ -10,24 +10,24 @@
 
 occa::kernel coriolisKernel; 
 
-void userf(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_FU)
+void userf(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_FU) // momentum source callback (set as udf.uEqnSource); launches coriolisKernel with o_FU as last argument; time is unused
 {
   coriolisKernel(
-    ins->Nlocal, 
-    ins->fieldOffset, 
-    ins->mesh->o_y,
-    ins->cds->o_S,
-    ins->o_U,
+    nrs->Nlocal, 
+    nrs->fieldOffset, 
+    nrs->mesh->o_y,
+    nrs->cds->o_S,
+    nrs->o_U, // NOTE(review): passes nrs->o_U rather than the o_U parameter -- confirm both refer to the same buffer
     o_FU);
 }
 
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
-  occa::properties& kernelInfo = *ins->kernelInfo;
-  setupAide &options = ins->options;
+  occa::properties& kernelInfo = *nrs->kernelInfo;
+  setupAide &options = nrs->options;
 
   dfloat cond;
   options.getArgs("SCALAR00 DIFFUSIVITY", cond);
@@ -43,21 +43,19 @@ void UDF_LoadKernels(ins_t *ins)
   kernelInfo["defines/p_sCoolRate"] = SCR * LREF/TREF/UREF/3600;
   kernelInfo["defines/p_YLEN"]      = YLEN;
 
-  coriolisKernel = udfBuildKernel(ins, "coriolis");
+  coriolisKernel = udfBuildKernel(nrs, "coriolis");
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs) // registers userf as udf.uEqnSource; nrs is not used in this body
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-
   udf.uEqnSource = &userf;
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) // on output steps: nek_ocopyFrom, nek_userchk, then nek_ocopyTo
 {
-  if (ins->isOutputStep) {
+  if (nrs->isOutputStep) { // only when the current step is flagged as an output step
     nek_ocopyFrom(time, tstep);
     nek_userchk();
+    nek_ocopyTo(time); // presumably copies the fields back after userchk so its modifications are kept -- confirm against nek_ocopyTo
   }
 }
diff --git a/examples/hemi/hemi.oudf b/examples/hemi/hemi.oudf
index 4a2900e0b..be907561f 100644
--- a/examples/hemi/hemi.oudf
+++ b/examples/hemi/hemi.oudf
@@ -1,21 +1,21 @@
 // Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
+void velocityDirichletConditions(bcData *bc) // writes a z-dependent u profile into bc->u and zero into bc->v, bc->w
 {                                                                        
   const dfloat arg = bc->z * M_PI * 0.5/0.6;
   if (arg > 0.5*M_PI) 
-    bc->uP = 1.0;
+    bc->u = 1.0; // arg > pi/2, i.e. bc->z > 0.6
   else
-    bc->uP = sin(arg);
+    bc->u = sin(arg); // sin((pi/2) * z/0.6), which reaches 1.0 at z = 0.6
 
-  bc->vP = 0.0;
-  bc->wP = 0.0;
+  bc->v = 0.0;
+  bc->w = 0.0;
 }
 
 // Stabilized outflow (Dong et al)
-void insPressureDirichletConditions3D(bcData *bc)
+void pressureDirichletConditions(bcData *bc) // writes the Dirichlet pressure into bc->p from bc->u/v/w and the normal bc->nx/ny/nz
 {
   const dfloat iU0delta = 10.0;
-  const dfloat un = bc->uM*bc->nx + bc->vM*bc->ny + bc->wM*bc->nz;
+  const dfloat un = bc->u*bc->nx + bc->v*bc->ny + bc->w*bc->nz; // velocity component along the normal (u dot n)
   const dfloat s0 = 0.5 * (1.0 - tanh(un*iU0delta)); 
-  bc->pP = -0.5 * (bc->uM*bc->uM + bc->vM*bc->vM + bc->wM*bc->wM) * s0;
+  bc->p = -0.5 * (bc->u*bc->u + bc->v*bc->v + bc->w*bc->w) * s0; // -0.5*|u|^2 weighted by s0; s0 tends to 1 for un << 0 and to 0 for un >> 0
 }
diff --git a/examples/hemi/hemi.udf b/examples/hemi/hemi.udf
index 8998541b9..b9c6ec098 100644
--- a/examples/hemi/hemi.udf
+++ b/examples/hemi/hemi.udf
@@ -6,20 +6,19 @@
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs) // no-op: this example builds no user kernels
 {
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs) // no-op: the body is empty and registers no callbacks
 {
-  // get IC from nek
-  nek_copyTo(ins->startTime);
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) // on output steps: nek_ocopyFrom, nek_userchk, then nek_ocopyTo
 {
-  if (ins->isOutputStep) {
+  if (nrs->isOutputStep) { // only when the current step is flagged as an output step
     nek_ocopyFrom(time, tstep);
-    nek_userchk();
+    nek_userchk(); // compute lambda2
+    nek_ocopyTo(time); // presumably copies the fields back after userchk so its result is kept -- confirm against nek_ocopyTo
   }
 }
diff --git a/examples/ktauChannel/channel.oudf b/examples/ktauChannel/channel.oudf
index 41b13e3ea..92d08a71b 100644
--- a/examples/ktauChannel/channel.oudf
+++ b/examples/ktauChannel/channel.oudf
@@ -11,8 +11,8 @@
   }
 }
 
-void cdsDirichletConditions3D(bcData *bc)
+void scalarDirichletConditions(bcData *bc) // writes the Dirichlet scalar value into bc->s (zero on every path)
 {
-  bc->sP = 0;
-  if(bc->scalarId == 0) bc->sP = 0; 
+  bc->s = 0;
+  if(bc->scalarId == 0) bc->s = 0; // NOTE(review): assigns the same value as the line above, so this branch currently has no effect
 }
diff --git a/examples/ktauChannel/channel.udf b/examples/ktauChannel/channel.udf
index 7351b89b7..b34b4f1a9 100644
--- a/examples/ktauChannel/channel.udf
+++ b/examples/ktauChannel/channel.udf
@@ -10,66 +10,59 @@
 static dfloat rho, mueLam;
 occa::kernel userfKernel; 
 
-void userf(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_FU)
+void userf(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_FU)
 {
   const dfloat Re_tau = 2000.0; 
   const dfloat Re_b   = rho/mueLam; 
   const dfloat DPDX   = (Re_tau/Re_b)*(Re_tau/Re_b);
-  userfKernel(ins->Nlocal, 0*ins->fieldOffset, DPDX, o_FU);
+  userfKernel(nrs->Nlocal, 0*nrs->fieldOffset, DPDX, o_FU);
 }
 
-void userq(ins_t *ins, dfloat time, occa::memory o_S, occa::memory o_FS)
+void userq(nrs_t *nrs, dfloat time, occa::memory o_S, occa::memory o_FS)
 {
-  mesh_t *mesh = ins->mesh;
-  cds_t *cds   = ins->cds;
+  mesh_t *mesh = nrs->mesh;
+  cds_t *cds   = nrs->cds;
 
   RANSktau::updateSourceTerms();
 }
 
-void uservp(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_S,
+void uservp(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_S,
             occa::memory o_UProp, occa::memory o_SProp)
 {
-  mesh_t *mesh = ins->mesh;
-  cds_t *cds   = ins->cds;
+  mesh_t *mesh = nrs->mesh;
+  cds_t *cds   = nrs->cds;
 
   RANSktau::updateProperties();
 
   dfloat conductivity;
-  ins->options.getArgs("SCALAR00 DIFFUSIVITY", conductivity);
+  nrs->options.getArgs("SCALAR00 DIFFUSIVITY", conductivity);
   const dfloat Pr_t = 0.7;
   occa::memory o_mue_t = RANSktau::o_mue_t();
   occa::memory o_temp_mue = cds->o_diff + 0*cds->fieldOffset*sizeof(dfloat);
-  ins->scalarScaledAddKernel(ins->Nlocal, mueLam, 1/Pr_t, o_mue_t, o_temp_mue); 
+  nrs->scalarScaledAddKernel(nrs->Nlocal, mueLam, 1/Pr_t, o_mue_t, o_temp_mue); 
 }
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
-  userfKernel = udfBuildKernel(ins, "cfill");
-  RANSktau::buildKernel(ins);
+  userfKernel = udfBuildKernel(nrs, "cfill");
+  RANSktau::buildKernel(nrs);
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs)
 {
-  mesh_t *mesh = ins->mesh;
-  cds_t *cds = ins->cds;
-
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
+  mesh_t *mesh = nrs->mesh;
+  cds_t *cds = nrs->cds;
 
   udf.properties = &uservp;
   udf.uEqnSource = &userf;
   udf.sEqnSource = &userq;
 
   const int scalarFieldStart = 1;
-  ins->options.getArgs("VISCOSITY", mueLam); 
-  ins->options.getArgs("DENSITY", rho); 
-  RANSktau::setup(ins, mueLam, rho, scalarFieldStart);
+  nrs->options.getArgs("VISCOSITY", mueLam); 
+  nrs->options.getArgs("DENSITY", rho); 
+  RANSktau::setup(nrs, mueLam, rho, scalarFieldStart);
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (ins->isOutputStep) {
-    nek_ocopyFrom(time, tstep);
-    nek_userchk();
-  }
 }
diff --git a/examples/ktauChannel/channel.usr b/examples/ktauChannel/channel.usr
index 221058124..745b1b0b6 100644
--- a/examples/ktauChannel/channel.usr
+++ b/examples/ktauChannel/channel.usr
@@ -46,9 +46,6 @@ c-----------------------------------------------------------------------
       include 'SIZE'
       include 'TOTAL'
 
-c      ubar  = glsc2(vx,bm1,nx1*ny1*nz1*nelt)/volvm1
-c      if (nid.eq.0) write(6,*) 'ubar=', ubar
-
       return
       end
 c-----------------------------------------------------------------------
diff --git a/examples/lowMach/lowMach_ci.h b/examples/lowMach/ci.inc
similarity index 82%
rename from examples/lowMach/lowMach_ci.h
rename to examples/lowMach/ci.inc
index 9c579bad1..398f03cb0 100644
--- a/examples/lowMach/lowMach_ci.h
+++ b/examples/lowMach/ci.inc
@@ -1,5 +1,7 @@
 #include <math.h>
 
+static int ciMode = 0;
+
 #define PASS { if (rank == 0) printf("TESTS passed \n"); MPI_Finalize(); exit(0); }
 #define FAIL { if (rank == 0) printf("TESTS failed!\n"); MPI_Finalize(); exit(2); }
 
@@ -10,7 +12,7 @@ void ciSetup(MPI_Comm comm, setupAide &options)
   options.setArgs("POLYNOMIAL DEGREE", string("7"));
   options.setArgs("RESTART FROM FILE", string("0"));
   options.setArgs("TSTEPS FOR SOLUTION OUTPUT", "0");
-  options.setArgs("FINAL TIME", string("0.3"));
+  options.setArgs("END TIME", string("0.3"));
   options.setArgs("DT", string("1e-3"));
   options.setArgs("SUBCYCLING STEPS", string("0"));
   if (ciMode == 2) options.setArgs("SUBCYCLING STEPS", string("1"));
@@ -21,15 +23,15 @@ void ciSetup(MPI_Comm comm, setupAide &options)
   options.setArgs("SCALAR01 SOLVER TOLERANCE", string("1e-12"));
 }
 
-void ciTestErrors(ins_t *ins, dfloat time, int tstep)
+void ciTestErrors(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (tstep != ins->NtimeSteps) return;
+  if (!nrs->lastStep) return;
  
-  const int rank = ins->mesh->rank;
+  const int rank = nrs->mesh->rank;
 
-  ins->o_div.copyTo(ins->div);
-  dlong Nlocal = ins->mesh->Nelements * ins->mesh->Np;
-  memcpy(nekData.qtl, ins->div, sizeof(dfloat)*Nlocal);
+  nrs->o_div.copyTo(nrs->div);
+  dlong Nlocal = nrs->mesh->Nelements * nrs->mesh->Np;
+  memcpy(nekData.qtl, nrs->div, sizeof(dfloat)*Nlocal);
  
   nek_ocopyFrom(time, tstep);
   nek_userchk();
diff --git a/examples/lowMach/lowMach.oudf b/examples/lowMach/lowMach.oudf
index af14efc77..dadeee3c5 100644
--- a/examples/lowMach/lowMach.oudf
+++ b/examples/lowMach/lowMach.oudf
@@ -1,14 +1,14 @@
 // Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
+void velocityDirichletConditions(bcData *bc)
 {
-  bc->uP = 0.5*(3.0 + tanh(bc->x/p_DELTA)); 
-  bc->vP = 0.0; 
-  bc->wP = 0.0;
+  bc->u = 0.5*(3.0 + tanh(bc->x/p_DELTA)); 
+  bc->v = 0.0; 
+  bc->w = 0.0;
 }
 
-void cdsDirichletConditions3D(bcData *bc)
+void scalarDirichletConditions(bcData *bc)
 {
-  bc->sP = 0.5*(3.0 + tanh(bc->x/p_DELTA));
+  bc->s = 0.5*(3.0 + tanh(bc->x/p_DELTA));
 }
 
 
diff --git a/examples/lowMach/lowMach.udf b/examples/lowMach/lowMach.udf
index 3ac6200a0..fc80a1469 100644
--- a/examples/lowMach/lowMach.udf
+++ b/examples/lowMach/lowMach.udf
@@ -5,37 +5,36 @@
 #include "udf.hpp"
 #include "plugins/lowMach.hpp"
 #
-static int ciMode = 0;
-#include "lowMach_ci.h"
+#include "ci.inc"
 #include "casedata.h"
 
 static occa::kernel userQKernel;
 static occa::kernel userVpKernel;
 
-void userq(ins_t *ins, dfloat time, occa::memory o_S, occa::memory o_FS)
+void userq(nrs_t *nrs, dfloat time, occa::memory o_S, occa::memory o_FS)
 {
-  cds_t *cds   = ins->cds;
+  cds_t *cds   = nrs->cds;
   mesh_t *mesh = cds->mesh;
   userQKernel(mesh->Nelements, mesh->o_x, o_FS);
 }
 
-void uservp(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_S,
+void uservp(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_S,
             occa::memory o_UProp, occa::memory o_SProp)
 {
-  mesh_t *mesh = ins->mesh;
+  mesh_t *mesh = nrs->mesh;
 
-  userVpKernel(mesh->Nelements, ins->fieldOffset, ins->cds->fieldOffset, 
+  userVpKernel(mesh->Nelements, nrs->fieldOffset, nrs->cds->fieldOffset, 
                o_S, o_UProp, o_SProp);
 }
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
-  occa::properties& kernelInfo = *ins->kernelInfo;
+  occa::properties& kernelInfo = *nrs->kernelInfo;
   kernelInfo["defines/p_DELTA"] = P_DELTA;
-  userQKernel = udfBuildKernel(ins, "userQ");
-  userVpKernel = udfBuildKernel(ins, "userVp");
+  userQKernel = udfBuildKernel(nrs, "userQ");
+  userVpKernel = udfBuildKernel(nrs, "userVp");
 }
 
 void UDF_Setup0(MPI_Comm comm, setupAide &options)
@@ -44,22 +43,19 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options)
   if (ciMode) ciSetup(comm, options);
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs)
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-
   udf.sEqnSource = &userq;
   udf.properties = &uservp;
 
-  lowMach::setup(ins);
+  lowMach::setup(nrs);
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (ins->isOutputStep) {
+  if (nrs->isOutputStep) {
     nek_ocopyFrom(time, tstep);
     nek_userchk();
   }
-  if (ciMode) ciTestErrors(ins, time, tstep);
+  if (ciMode) ciTestErrors(nrs, time, tstep);
 }
diff --git a/examples/pb146/pb.oudf b/examples/pb146/pb.oudf
index 06473ef67..42cea9741 100644
--- a/examples/pb146/pb.oudf
+++ b/examples/pb146/pb.oudf
@@ -1,27 +1,26 @@
 // Boundary conditions
-/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5, z-slip 6 */
-void insVelocityDirichletConditions3D(bcData *bc)
+void velocityDirichletConditions(bcData *bc)
 {                                                                        
-  bc->uP = 0.0;
-  bc->vP = 0.0;
-  bc->wP = 1.0;
+  bc->u = 0.0;
+  bc->v = 0.0;
+  bc->w = 1.0;
 }
 
-void cdsDirichletConditions3D(bcData *bc)
-{                                                                        
-  bc->sP = 0.0;
+// Stabilized outflow (Dong et al)
+void pressureDirichletConditions(bcData *bc)
+{
+  const dfloat iU0delta = 20.0;
+  const dfloat un = bc->u*bc->nx + bc->v*bc->ny + bc->w*bc->nz;
+  const dfloat s0 = 0.5 * (1.0 - tanh(un*iU0delta)); 
+  bc->p = -0.5 * (bc->u*bc->u + bc->v*bc->v + bc->w*bc->w) * s0;
 }
 
-void cdsNeumannConditions3D(bcData *bc)
+void scalarDirichletConditions(bcData *bc)
 {                                                                        
-  bc->sF = 1.0;
+  bc->s = 0.0;
 }
 
-// Stabilized outflow (Dong et al)
-void insPressureDirichletConditions3D(bcData *bc)
-{
-  const dfloat iU0delta = 20.0;
-  const dfloat un = bc->uM*bc->nx + bc->vM*bc->ny + bc->wM*bc->nz;
-  const dfloat s0 = 0.5 * (1.0 - tanh(un*iU0delta)); 
-  bc->pP = -0.5 * (bc->uM*bc->uM + bc->vM*bc->vM + bc->wM*bc->wM) * s0;
+void scalarNeumannConditions(bcData *bc)
+{                                                                        
+  bc->flux = 1.0;
 }
diff --git a/examples/pb146/pb.par b/examples/pb146/pb.par
index de044666e..b71fc1070 100644
--- a/examples/pb146/pb.par
+++ b/examples/pb146/pb.par
@@ -25,7 +25,6 @@ filterModes = 2
 residualTol = 1e-04
 
 [VELOCITY]
-solver = pcg+block
 boundaryTypeMap = inlet, outlet, wall, wall
 density = 1.0
 viscosity = -5000.0
@@ -36,6 +35,3 @@ boundaryTypeMap = inlet, outlet, insulated, flux
 residualTol = 1e-06
 rhoCp = 1.0
 conductivity = -5000
-
-[BOOMERAMG]
-iterations = 1
diff --git a/examples/pb146/pb.udf b/examples/pb146/pb.udf
index c4b683d82..da3c8265e 100644
--- a/examples/pb146/pb.udf
+++ b/examples/pb146/pb.udf
@@ -7,20 +7,14 @@
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs)
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (ins->isOutputStep) {
-    nek_ocopyFrom(time, tstep);
-    nek_userchk();
-  }
 }
diff --git a/examples/rbc/rbc.oudf b/examples/rbc/rbc.oudf
index 37e370288..917650dfa 100644
--- a/examples/rbc/rbc.oudf
+++ b/examples/rbc/rbc.oudf
@@ -1,7 +1,7 @@
 // Boundary conditions
 
-void cdsDirichletConditions3D(bcData *bc) {
-  bc->sP = 1.f - bc->z; 
+void scalarDirichletConditions(bcData *bc) {
+  bc->s = 1.f - bc->z; 
 }
 
 // Kernels
diff --git a/examples/rbc/rbc.udf b/examples/rbc/rbc.udf
index 67e41814f..7aca9912e 100644
--- a/examples/rbc/rbc.udf
+++ b/examples/rbc/rbc.udf
@@ -4,31 +4,28 @@
 #include <math.h>
 #include "udf.hpp"
 
-occa::kernel fillFUKernel; 
+static occa::kernel fillFUKernel; 
 
-void userf(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_FU)
+void userf(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_FU)
 {
-  const dlong Nlocal = ins->mesh->Nelements * ins->mesh->Np;
-  fillFUKernel(Nlocal, ins->fieldOffset, ins->cds->o_S, ins->o_FU);
+  const dlong Nlocal = nrs->mesh->Nelements * nrs->mesh->Np;
+  fillFUKernel(Nlocal, nrs->fieldOffset, nrs->cds->o_S, nrs->o_FU);
 }
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
-  fillFUKernel = udfBuildKernel(ins, "fillFU");
+  fillFUKernel = udfBuildKernel(nrs, "fillFU");
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs)
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-
   // assign function pointer to drive flow by constant mean pressure gradient
   udf.uEqnSource = &userf;
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (ins->isOutputStep) {
+  if (nrs->isOutputStep) {
     nek_ocopyFrom(time, tstep);
     nek_userchk();
   }
diff --git a/examples/tgv/tgv.udf b/examples/tgv/tgv.udf
index f0a247282..1d9a01362 100644
--- a/examples/tgv/tgv.udf
+++ b/examples/tgv/tgv.udf
@@ -7,27 +7,27 @@
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs)
 {
  // set IC
- mesh_t *mesh = ins->mesh;
+ mesh_t *mesh = nrs->mesh;
  for (int n=0; n<mesh->Np*mesh->Nelements; n++) {
    dfloat x = mesh->x[n];
    dfloat y = mesh->y[n];
    dfloat z = mesh->z[n];
-   ins->U[n+0*ins->fieldOffset] = sin(x)*cos(y)*cos(z);;
-   ins->U[n+1*ins->fieldOffset] = -cos(x)*sin(y)*cos(z); 
-   ins->U[n+2*ins->fieldOffset] = 0; 
+   nrs->U[n+0*nrs->fieldOffset] = sin(x)*cos(y)*cos(z);;
+   nrs->U[n+1*nrs->fieldOffset] = -cos(x)*sin(y)*cos(z); 
+   nrs->U[n+2*nrs->fieldOffset] = 0; 
  }
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (ins->isOutputStep) {
+  if (nrs->isOutputStep) {
     nek_ocopyFrom(time, tstep);
     nek_userchk();
   }
diff --git a/examples/turbPipe/turbPipe.oudf b/examples/turbPipe/turbPipe.oudf
index 3ddfb3f63..a2433017a 100644
--- a/examples/turbPipe/turbPipe.oudf
+++ b/examples/turbPipe/turbPipe.oudf
@@ -1,16 +1,16 @@
 // Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
+void velocityDirichletConditions(bcData *bc)
 {                                                                        
-  bc->uP = bc->wrk[bc->idM + 0*bc->fieldOffset];
-  bc->vP = bc->wrk[bc->idM + 1*bc->fieldOffset];
-  bc->wP = bc->wrk[bc->idM + 2*bc->fieldOffset];
+  bc->u = bc->wrk[bc->idM + 0*bc->fieldOffset];
+  bc->v = bc->wrk[bc->idM + 1*bc->fieldOffset];
+  bc->w = bc->wrk[bc->idM + 2*bc->fieldOffset];
 }
 
 // Stabilized outflow (Dong et al)
-void insPressureDirichletConditions3D(bcData *bc)
+void pressureDirichletConditions(bcData *bc)
 {
   const dfloat iU0delta = 20.0;
-  const dfloat un = bc->uM*bc->nx + bc->vM*bc->ny + bc->wM*bc->nz;
+  const dfloat un = bc->u*bc->nx + bc->v*bc->ny + bc->w*bc->nz;
   const dfloat s0 = 0.5 * (1.0 - tanh(un*iU0delta)); 
-  bc->pP = -0.5 * (bc->uM*bc->uM + bc->vM*bc->vM + bc->wM*bc->wM) * s0;
+  bc->p = -0.5 * (bc->u*bc->u + bc->v*bc->v + bc->w*bc->w) * s0;
 }
diff --git a/examples/turbPipe/turbPipe.par b/examples/turbPipe/turbPipe.par
index 77792e5ad..4bde73a91 100644
--- a/examples/turbPipe/turbPipe.par
+++ b/examples/turbPipe/turbPipe.par
@@ -39,6 +39,6 @@ residualTol = 1e-06
 #coarsenType = 10
 #interpolationType = 6
 #smootherType = -1
-iterations = 1
+#iterations = 1
 #strongThreshold = 0.25
 #nonGalerkinTol = 0.1
diff --git a/examples/turbPipe/turbPipe.udf b/examples/turbPipe/turbPipe.udf
index 378179331..ae3adbf09 100644
--- a/examples/turbPipe/turbPipe.udf
+++ b/examples/turbPipe/turbPipe.udf
@@ -10,44 +10,39 @@
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
-  velRecycling::buildKernel(ins);
-  avg::buildKernel(ins);
+  velRecycling::buildKernel(nrs);
+  avg::buildKernel(nrs);
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs)
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-
   // setup scratch space
-  ins->o_usrwrk = ins->mesh->device.malloc(ins->NVfields*ins->fieldOffset*sizeof(dfloat));
+  nrs->o_usrwrk = nrs->mesh->device.malloc(nrs->NVfields*nrs->fieldOffset*sizeof(dfloat));
 
   // recycling
   const dfloat wbar  = 1.0;
   const int bID      = 1; 
   dfloat zRecycLayer = 0.25*ZLENGTH;
   const hlong offset = NELSLAB * round(NSLABS * zRecycLayer/ZLENGTH); 
-  velRecycling::setup(ins, ins->o_usrwrk, offset, bID, wbar);
+  velRecycling::setup(nrs, nrs->o_usrwrk, offset, bID, wbar);
 
-  avg::setup(ins);
+  avg::setup(nrs);
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep)
 {
-  mesh_t *mesh = ins->mesh;
+  mesh_t *mesh = nrs->mesh;
 
   velRecycling::copy();
   avg::run(time);
 
-  if (ins->isOutputStep) {
-    occa::memory o_UZ = ins->o_U + 2*ins->fieldOffset * sizeof(dfloat);
-    const dfloat ubar = ins->linAlg->innerProd(ins->Nlocal, o_UZ, mesh->o_LMM, mesh->comm)/mesh->volume;
+  if (nrs->isOutputStep) {
+    occa::memory o_UZ = nrs->o_U + 2*nrs->fieldOffset * sizeof(dfloat);
+    const dfloat ubar = nrs->linAlg->innerProd(nrs->Nlocal, o_UZ, mesh->o_LMM, mesh->comm)/mesh->volume;
     if (mesh->rank == 0) printf(" uBulk: %g\n", ubar);
 
     avg::outfld();
-    nek_ocopyFrom(time, tstep);
-    nek_userchk();
   }
 }
diff --git a/examples/turbPipe/turbPipe.usr b/examples/turbPipe/turbPipe.usr
index e31074aae..54130fdd5 100644
--- a/examples/turbPipe/turbPipe.usr
+++ b/examples/turbPipe/turbPipe.usr
@@ -6,52 +6,6 @@ c-----------------------------------------------------------------------
       include 'SIZE'
       include 'TOTAL'
 
-      data icalld /0/
-      save icalld
-
-      real x0(3)
-      data x0 /0.0, 0.0, 0.0/
-      save x0
-
-      integer bIDs(1)
-      save iobj_wall
-
-      save atime,timel,drag_avg
-
-      if (icalld.eq.0) then
-        bIDs(1) = 1
-        call create_obj(iobj_wall,bIDs,1)
-        drag_avg = 0
-        atime    = 0
-        timel    = time
-        icalld   = 1
-        call cfill(vdiff,param(2),nx1*ny1*nz1*nelt)
-      endif
-
-      dtime = time - timel
-      atime = atime + dtime
-
-      ! averaging over time
-      if (atime.ne.0. .and. dtime.ne.0.) then
-        beta      = dtime / atime
-        alpha     = 1. - beta
-
-        call torque_calc(1.0,x0,.false.,.false.) ! compute wall shear
-        drag_avg = alpha*drag_avg + beta*dragz(iobj_wall)
-
-        rho    = 1
-        dnu    = param(2)
-        A_w    = 2*pi * 0.5 * ZLENGTH
-        tw     = drag_avg / A_w
- 
-        u_tau  = sqrt(abs(tw) / rho)
-        Re_tau = u_tau * 0.5 / dnu
-        ubar  = glsc2(vz,bm1,nx1*ny1*nz1*nelt)/volvm1
-
-        if (nid.eq.0) write(6,*) 'Re_tau', Re_tau, 'U_b', ubar 
-        timel = time
-      endif
-
       return
       end
 c-----------------------------------------------------------------------
diff --git a/examples/turbPipePeriodic/turbPipe.udf b/examples/turbPipePeriodic/turbPipe.udf
index 9cc60cfb6..6298cc4a7 100644
--- a/examples/turbPipePeriodic/turbPipe.udf
+++ b/examples/turbPipePeriodic/turbPipe.udf
@@ -7,38 +7,31 @@
 
 /* User Functions */
 
-void userf(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_FU)
+void userf(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_FU)
 {
   dfloat mue, rho;
-  ins->options.getArgs("VISCOSITY", mue);
-  ins->options.getArgs("DENSITY", rho);
+  nrs->options.getArgs("VISCOSITY", mue);
+  nrs->options.getArgs("DENSITY", rho);
   const dfloat RE_B = rho/mue; 
   const dfloat DPDZ = 4 * (2*RE_TAU/RE_B)*(2*RE_TAU/RE_B);
 
-  occa::memory o_FUz = o_FU + 2*ins->fieldOffset*sizeof(dfloat);
-  ins->linAlg->fill(ins->Nlocal, DPDZ, o_FUz);
+  occa::memory o_FUz = o_FU + 2*nrs->fieldOffset*sizeof(dfloat);
+  nrs->linAlg->fill(nrs->Nlocal, DPDZ, o_FUz);
 }
 
 
 /* UDF Functions */                                                      
 
-void UDF_LoadKernels(ins_t *ins)
+void UDF_LoadKernels(nrs_t *nrs)
 {
 }
 
-void UDF_Setup(ins_t *ins)
+void UDF_Setup(nrs_t *nrs)
 {
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-
   // assign function pointer to drive flow by constant mean pressure gradient
   udf.uEqnSource = &userf;
 }
 
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
+void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep)
 {
-  if (ins->isOutputStep) {
-    nek_ocopyFrom(time, tstep);
-    nek_userchk();
-  }
 }
diff --git a/examples/turbPipePeriodic/turbPipe.usr b/examples/turbPipePeriodic/turbPipe.usr
index 4e2cbf258..9e0a3ed8e 100644
--- a/examples/turbPipePeriodic/turbPipe.usr
+++ b/examples/turbPipePeriodic/turbPipe.usr
@@ -7,52 +7,6 @@ c-----------------------------------------------------------------------
       include 'SIZE'
       include 'TOTAL'
 
-      data icalld /0/
-      save icalld
-
-      real x0(3)
-      data x0 /0.0, 0.0, 0.0/
-      save x0
-
-      integer bIDs(1)
-      save iobj_wall
-
-      save atime,timel,drag_avg
-
-      if (icalld.eq.0) then
-        bIDs(1) = 1
-        call create_obj(iobj_wall,bIDs,1)
-        drag_avg = 0
-        atime    = 0
-        timel    = time
-        icalld   = 1
-        call cfill(vdiff,param(2),nx1*ny1*nz1*nelt)
-      endif
-
-      dtime = time - timel
-      atime = atime + dtime
-
-      ! averaging over time
-      if (atime.ne.0. .and. dtime.ne.0.) then
-        beta      = dtime / atime
-        alpha     = 1. - beta
-
-        call torque_calc(1.0,x0,.false.,.false.) ! compute wall shear
-        drag_avg = alpha*drag_avg + beta*dragz(iobj_wall)
-
-        rho    = 1
-        dnu    = param(2)
-        A_w    = 2*pi * 0.5 * ZLENGTH
-        tw     = drag_avg / A_w
- 
-        u_tau  = sqrt(abs(tw) / rho)
-        Re_tau = u_tau * 0.5 / dnu
-        ubar  = glsc2(vz,bm1,nx1*ny1*nz1*nelt)/volvm1
-
-        if (nid.eq.0) write(6,*) 'Re_tau', Re_tau, 'U_b', ubar 
-        timel = time
-      endif
-
       return
       end
 c-----------------------------------------------------------------------
diff --git a/examples/vortexOutflow/vortex.box b/examples/vortexOutflow/vortex.box
deleted file mode 100644
index 607dc144e..000000000
--- a/examples/vortexOutflow/vortex.box
+++ /dev/null
@@ -1,15 +0,0 @@
-base.rea
--3                      spatial dimension
-1                      number of fields
-#========================================================
-#
-# Build an E1 x E1 box for the convecting cone problem
-#
-#========================================================
-#
-Box
--8  -8  -1          nelx,nely,nelz for Box)
-0.0    1.0    1.0   x0 x1 ratio 
-0.0    1.0    1.0   y0 y1 ratio 
-0.0    1.0    1.0   z0 z1 ratio 
-v  ,o  ,P  ,P  ,v  ,v     bc's  (3 characters each, including blanks!)
diff --git a/examples/vortexOutflow/vortex.co2 b/examples/vortexOutflow/vortex.co2
deleted file mode 100644
index 7c649eff3..000000000
Binary files a/examples/vortexOutflow/vortex.co2 and /dev/null differ
diff --git a/examples/vortexOutflow/vortex.oudf b/examples/vortexOutflow/vortex.oudf
deleted file mode 100644
index 25e83beb0..000000000
--- a/examples/vortexOutflow/vortex.oudf
+++ /dev/null
@@ -1,16 +0,0 @@
-// Boundary conditions
-void insVelocityDirichletConditions3D(bcData *bc)
-{                                                                        
-  bc->uP = 1.0;
-  bc->vP = 0.0;
-  bc->wP = 0.0;
-}
-
-void insPressureDirichletConditions3D(bcData *bc)
-{
-  // Stabilized outflow (Dong et al)
-  const dfloat iU0delta = 10.0;
-  const dfloat s0 = 0.5 * (1.0 - 
-        tanh((bc->uM*bc->nx + bc->vM*bc->ny + bc->wM*bc->nz)*iU0delta));
-  bc->pP = -0.5 * (bc->uM*bc->uM + bc->vM*bc->vM + bc->wM*bc->wM) * s0;
-}
diff --git a/examples/vortexOutflow/vortex.par b/examples/vortexOutflow/vortex.par
deleted file mode 100644
index 93add8f09..000000000
--- a/examples/vortexOutflow/vortex.par
+++ /dev/null
@@ -1,21 +0,0 @@
-[OCCA]
-backend = CUDA
-deviceNumber = LOCAL-RANK
-
-[GENERAL] 
-polynomialOrder = 7
-#startFrom = restart.fld
-stopAt = endTime
-endTime = 15
-dt = 2e-03
-timeStepper = tombo2
-writeInterval = 500
-
-[PRESSURE]
-residualTol = 1e-05
-
-[VELOCITY]
-boundaryTypeMap = inlet, outlet
-residualTol = 1e-08
-density = 1.0
-viscosity = 1/1000 
diff --git a/examples/vortexOutflow/vortex.re2 b/examples/vortexOutflow/vortex.re2
deleted file mode 100644
index f774f9d91..000000000
Binary files a/examples/vortexOutflow/vortex.re2 and /dev/null differ
diff --git a/examples/vortexOutflow/vortex.udf b/examples/vortexOutflow/vortex.udf
deleted file mode 100644
index 5b2c5e90f..000000000
--- a/examples/vortexOutflow/vortex.udf
+++ /dev/null
@@ -1,30 +0,0 @@
-//
-// nekRS User Defined File
-//
-
-#include <math.h>
-#include "udf.hpp"
-
-/* UDF Functions */                                                      
-
-void UDF_LoadKernels(ins_t *ins)
-{
-}
-
-void UDF_Setup0(MPI_Comm comm, setupAide &options)
-{
-}
-
-void UDF_Setup(ins_t *ins)
-{
-  // get IC from nek
-  if (!ins->readRestartFile) nek_copyTo(ins->startTime);
-}
-
-void UDF_ExecuteStep(ins_t *ins, dfloat time, int tstep)
-{
-  if (ins->isOutputStep) {
-    nek_ocopyFrom(time, tstep);
-    nek_userchk();
-  }
-}
diff --git a/examples/vortexOutflow/vortex.usr b/examples/vortexOutflow/vortex.usr
deleted file mode 100644
index 07d127ba7..000000000
--- a/examples/vortexOutflow/vortex.usr
+++ /dev/null
@@ -1,159 +0,0 @@
-#define LENGTH 10.0
-#define VSTRENGTH 10.0 
-
-c-----------------------------------------------------------------------
-c
-c  User specified routines:
-c
-c     - boundary conditions
-c     - initial conditions
-c     - variable properties
-c     - forcing function for fluid (f)
-c     - forcing function for passive scalar (q)
-c     - general purpose routine for checking errors etc.
-c
-c-----------------------------------------------------------------------
-      subroutine uservp (ix,iy,iz,ieg)
-      include 'SIZE'
-c     include 'TSTEP'
-      include 'TOTAL'
-      include 'NEKUSE'
-c
-      udiff =0.
-      utrans=0.
-      return
-      end
-c
-c-----------------------------------------------------------------------
-      subroutine userf  (ix,iy,iz,ieg)
-      include 'SIZE'
-      include 'TSTEP'
-c     include 'TOTAL'
-      include 'NEKUSE'
-c
-      FFX = 0.0
-      FFY = 0.0
-      FFZ = 0.0
-      return
-      end
-c
-c-----------------------------------------------------------------------
-      subroutine userq  (ix,iy,iz,ieg)
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-c
-      QVOL   = 0.0
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine userchk
-      include 'SIZE'
-      include 'TOTAL'
-
-      common /SCRNS/ wo1(lx1,ly1,lz1,lelv)
-     &              ,wo2(lx1,ly1,lz1,lelv)
-     &              ,omg(lx1*ly1*lz1*lelv,ldim)
-
-      call comp_vort3(omg,wo1,wo2,vx,vy,vz)
-      call copy(t,omg(1,3),nx1*ny1*nz1*nelv)
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine userbc (ix,iy,iz,iside,ieg)
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine useric (ix,iy,iz,ieg)
-      include 'SIZE'
-      include 'TOTAL'
-      include 'NEKUSE'
-
-      xt = x - LENGTH/2 
-      yt = y - LENGTH/2
-      r2 = xt*xt + yt*yt
-
-      aa= VSTRENGTH * exp(-0.5*r2)
-      ux = 1.0 - aa * yt
-      uy = aa * xt 
-      uz = 0.0
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine usrdat
-      include 'SIZE'
-      include 'TOTAL'
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine usrdat2
-      include 'SIZE'
-      include 'TOTAL'
-
-      xmin = 0
-      xmax = LENGTH
-      call rescale_x(xm1,xmin,xmax) 
-      call rescale_x(ym1,xmin,xmax)
-
-      do iel=1,nelt
-        do ifc=1,2*ndim
-          if (cbc(ifc,iel,1) .eq. 'v  ') boundaryID(ifc,iel) = 1
-          if (cbc(ifc,iel,1) .eq. 'o  ') boundaryID(ifc,iel) = 2
-        enddo
-        cbc(5,iel,1) = 'P  '
-        cbc(6,iel,1) = 'P  '
-      enddo
-
-      ifto = .true.
-
-      return
-      end
-c-----------------------------------------------------------------------
-      subroutine usrdat3
-      return
-      end
-c-----------------------------------------------------------------------
-      function dongOutflow(ix,iy,iz,iel,iside,u0,delta)
-
-      include 'SIZE'
-      include 'SOLN'
-      include 'GEOM'
-
-      real sn(3)
-
-      ux = vx(ix,iy,iz,iel)
-      uy = vy(ix,iy,iz,iel)
-      uz = vz(ix,iy,iz,iel)
-
-      call getSnormal(sn,ix,iy,iz,iside,iel)
-      vn = ux*sn(1) + uy*sn(2) + uz*sn(3) 
-      S0 = 0.5*(1.0 - tanh(vn/u0/delta))
-
-      dongOutflow = -0.5*(ux*ux+uy*uy+uz*uz)*S0
-
-      return
-      end
-C-----------------------------------------------------------------------
-      subroutine usrsetvert(glo_num,nel,nx,ny,nz) ! to modify glo_num
-      integer*8 glo_num(1)
-
-      ! kludge for periodic bc in z
-      nxy  = nx*ny
-      nxyz = nx*ny*nz
-      do iel = 1,nel
-         ioff = nxyz*(iel-1)
-         do ixy = 1,nxy
-            glo_num(ioff + nxy*(nz-1) + ixy) = glo_num(ioff + ixy)
-         enddo
-      enddo
-
-      return
-      end
diff --git a/makenrs b/makenrs
index 8a1fafc53..714ddbec6 100755
--- a/makenrs
+++ b/makenrs
@@ -9,8 +9,8 @@
 : ${NEKRS_CC:="mpicc"}
 : ${NEKRS_CXX:="mpic++"}
 : ${NEKRS_FC:="mpif77"}
-: ${NEKRS_CXXFLAGS:=""}
 
+# compiler settings for CPU backend
 : ${OCCA_CXX:="g++"}
 : ${OCCA_CXXFLAGS:="-O2 -ftree-vectorize -funroll-loops -march=native -mtune=native"}
 
@@ -113,6 +113,8 @@ fi
 : ${OCCA_OPENCL_ENABLED:=0}
 : ${OCCA_METAL_ENABLED:=0}
 
+: ${NEKRS_CXXFLAGS:=""}
+
 USE_OCCA_MEM_BYTE_ALIGN="64" 
 
 if uname -a | grep 'ppc64'; then
@@ -122,7 +124,6 @@ fi
 
 NEKRS_CFLAGS="${NEKRS_CXXFLAGS}"
 NEKRS_FFLAGS="${NEKRS_CXXFLAGS}"
-NEKRS_LIBP_DEFINES="-DUSE_NULL_PROJECTION=1"
 NEKRS_NEK5000_PPLIST="PARRSB DPROCMAP"
 
 export OCCA_CUDA_ENABLED
diff --git a/src/libP/okl/addScalar.okl b/okl/core/addScalar.okl
similarity index 100%
rename from src/libP/okl/addScalar.okl
rename to okl/core/addScalar.okl
diff --git a/okl/core/cdsHelmholtzBCHex3D.okl b/okl/core/cdsHelmholtzBCHex3D.okl
index eae026866..545208a97 100644
--- a/okl/core/cdsHelmholtzBCHex3D.okl
+++ b/okl/core/cdsHelmholtzBCHex3D.okl
@@ -37,17 +37,17 @@
     bc.y   = y[bc.idM];                                     \
     bc.z   = z[bc.idM];                                     \
     bc.fieldOffset = offset;                                \
-    bc.sM = S[bc.idM];                                      \
+    bc.s = S[bc.idM];                                       \
     bc.wrk = W;                                             \
                                                             \
     const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];        \
-    const dlong bcType = mapB[bc.idM];                      \
-    bc.sF = 0.f; bc.sP = 0.f;                               \
+    const dlong bcType = EToB[face + p_Nfaces * e];         \
+    bc.flux = 0.f;                                          \
     bc.scalarId = scalarId;                                 \
     if(bcType == 3) {                                       \
-      cdsNeumannConditions3D(&bc);                          \
+      scalarNeumannConditions(&bc);                         \
     }                                                       \
-    s_ndU[j][i] = -WsJ * (bc.sF);                           \
+    s_ndU[j][i] = -WsJ * (bc.flux);                         \
   }
 
 //RHS contributions for continuous solver
@@ -63,6 +63,7 @@
                                  @restrict const dfloat*  y,
                                  @restrict const dfloat*  z,
                                  @restrict const dfloat*  S,
+                                 @restrict const int*  EToB,
                                  @restrict const int*  mapB,
                                  @restrict const dfloat*  W,
                                  @restrict dfloat*  rhsU)
@@ -258,7 +259,7 @@
           bc.x  = x[idM];
           bc.y  = y[idM];
           bc.z  = z[idM];
-          bc.sM = S[idM];
+          bc.s  = S[idM];
           bc.wrk = W;
           bc.fieldOffset = offset;
           bc.scalarId = scalarId;
@@ -266,9 +267,8 @@
           bc.nx = sgeo[sk * p_Nsgeo + p_NXID];
           bc.ny = sgeo[sk * p_Nsgeo + p_NYID];
           bc.nz = sgeo[sk * p_Nsgeo + p_NZID];
-          bc.sP = 0.f;
-          cdsDirichletConditions3D(&bc);
-          S[idM] = bc.sP;
+          scalarDirichletConditions(&bc);
+          S[idM] = bc.s;
         }
       }
     }
diff --git a/src/libP/okl/copyDfloatToPfloat.okl b/okl/core/copyDfloatToPfloat.okl
similarity index 97%
rename from src/libP/okl/copyDfloatToPfloat.okl
rename to okl/core/copyDfloatToPfloat.okl
index 93f452b89..1fda07c82 100644
--- a/src/libP/okl/copyDfloatToPfloat.okl
+++ b/okl/core/copyDfloatToPfloat.okl
@@ -30,7 +30,7 @@ SOFTWARE.
   
   for(dlong n=0;n<N;++n;@tile(256,@outer,@inner)){
     if(n<N){
-      y[n]=static_cast<pfloat>(x[n]);
+      y[n]=x[n];
     }
   }
 }
diff --git a/src/libP/okl/copyPfloatToDfloat.okl b/okl/core/copyPfloatToDfloat.okl
similarity index 97%
rename from src/libP/okl/copyPfloatToDfloat.okl
rename to okl/core/copyPfloatToDfloat.okl
index d42d396dd..75ac7e6cc 100644
--- a/src/libP/okl/copyPfloatToDfloat.okl
+++ b/okl/core/copyPfloatToDfloat.okl
@@ -30,7 +30,7 @@ SOFTWARE.
   
   for(dlong n=0;n<N;++n;@tile(256,@outer,@inner)){
     if(n<N){
-      x[n]=static_cast<dfloat>(y[n]);
+      x[n]=y[n];
     }
   }
 }
diff --git a/src/libP/okl/dotDivide.okl b/okl/core/dotDivide.okl
similarity index 100%
rename from src/libP/okl/dotDivide.okl
rename to okl/core/dotDivide.okl
diff --git a/src/libP/okl/dotMultiply.okl b/okl/core/dotMultiply.okl
similarity index 100%
rename from src/libP/okl/dotMultiply.okl
rename to okl/core/dotMultiply.okl
diff --git a/src/libP/okl/dotMultiplyAdd.okl b/okl/core/dotMultiplyAdd.okl
similarity index 100%
rename from src/libP/okl/dotMultiplyAdd.okl
rename to okl/core/dotMultiplyAdd.okl
diff --git a/src/libP/okl/fill.okl b/okl/core/fill.okl
similarity index 100%
rename from src/libP/okl/fill.okl
rename to okl/core/fill.okl
diff --git a/src/libP/okl/gather.okl b/okl/core/gather.okl
similarity index 100%
rename from src/libP/okl/gather.okl
rename to okl/core/gather.okl
diff --git a/src/libP/okl/gatherNodes.okl b/okl/core/gatherNodes.okl
similarity index 100%
rename from src/libP/okl/gatherNodes.okl
rename to okl/core/gatherNodes.okl
diff --git a/src/libP/okl/gatherScatter.okl b/okl/core/gatherScatter.okl
similarity index 100%
rename from src/libP/okl/gatherScatter.okl
rename to okl/core/gatherScatter.okl
diff --git a/src/libP/okl/get.okl b/okl/core/get.okl
similarity index 100%
rename from src/libP/okl/get.okl
rename to okl/core/get.okl
diff --git a/src/libP/okl/innerProduct.okl b/okl/core/innerProduct.okl
similarity index 100%
rename from src/libP/okl/innerProduct.okl
rename to okl/core/innerProduct.okl
diff --git a/src/libP/okl/mask.okl b/okl/core/mask.okl
similarity index 100%
rename from src/libP/okl/mask.okl
rename to okl/core/mask.okl
diff --git a/src/libP/okl/meshGeometricFactorsHex3D.okl b/okl/core/meshGeometricFactorsHex3D.okl
similarity index 100%
rename from src/libP/okl/meshGeometricFactorsHex3D.okl
rename to okl/core/meshGeometricFactorsHex3D.okl
diff --git a/src/libP/okl/meshHaloExtract2D.okl b/okl/core/meshHaloExtract2D.okl
similarity index 100%
rename from src/libP/okl/meshHaloExtract2D.okl
rename to okl/core/meshHaloExtract2D.okl
diff --git a/src/libP/okl/meshHaloExtract3D.okl b/okl/core/meshHaloExtract3D.okl
similarity index 100%
rename from src/libP/okl/meshHaloExtract3D.okl
rename to okl/core/meshHaloExtract3D.okl
diff --git a/src/libP/okl/meshHaloGet.okl b/okl/core/meshHaloGet.okl
similarity index 100%
rename from src/libP/okl/meshHaloGet.okl
rename to okl/core/meshHaloGet.okl
diff --git a/src/libP/okl/meshHaloPut.okl b/okl/core/meshHaloPut.okl
similarity index 100%
rename from src/libP/okl/meshHaloPut.okl
rename to okl/core/meshHaloPut.okl
diff --git a/src/libP/okl/meshIsoSurface3D.okl b/okl/core/meshIsoSurface3D.okl
similarity index 100%
rename from src/libP/okl/meshIsoSurface3D.okl
rename to okl/core/meshIsoSurface3D.okl
diff --git a/src/libP/okl/multiInnerProduct.okl b/okl/core/multiInnerProduct.okl
similarity index 100%
rename from src/libP/okl/multiInnerProduct.okl
rename to okl/core/multiInnerProduct.okl
diff --git a/src/libP/okl/multiScaledAdd.okl b/okl/core/multiScaledAdd.okl
similarity index 100%
rename from src/libP/okl/multiScaledAdd.okl
rename to okl/core/multiScaledAdd.okl
diff --git a/src/libP/okl/multiWeightedInnerProduct.okl b/okl/core/multiWeightedInnerProduct.okl
similarity index 100%
rename from src/libP/okl/multiWeightedInnerProduct.okl
rename to okl/core/multiWeightedInnerProduct.okl
diff --git a/src/libP/okl/norm2.okl b/okl/core/norm2.okl
similarity index 100%
rename from src/libP/okl/norm2.okl
rename to okl/core/norm2.okl
diff --git a/okl/core/insAdvectionHex3D.okl b/okl/core/nrsAdvectionHex3D.okl
similarity index 99%
rename from okl/core/insAdvectionHex3D.okl
rename to okl/core/nrsAdvectionHex3D.okl
index ef1e99361..58a3d295f 100644
--- a/okl/core/insAdvectionHex3D.okl
+++ b/okl/core/nrsAdvectionHex3D.okl
@@ -23,7 +23,7 @@
    SOFTWARE.
 
  */
-@kernel void insStrongAdvectionVolumeHex3D(const dlong Nelements,
+@kernel void nrsStrongAdvectionVolumeHex3D(const dlong Nelements,
                                            @restrict const dfloat*  vgeo,
                                            @restrict const dfloat*  D,
                                            const dlong offset,
@@ -120,7 +120,7 @@
   }
 }
 
-@kernel void insStrongAdvectionCubatureVolumeHex3D(const dlong Nelements,
+@kernel void nrsStrongAdvectionCubatureVolumeHex3D(const dlong Nelements,
                                                    @restrict const dfloat*  vgeo,
                                                    @restrict const dfloat*  cubvgeo,
                                                    @restrict const dfloat*  cubD,
diff --git a/okl/core/insCflHex3D.okl b/okl/core/nrsCflHex3D.okl
similarity index 98%
rename from okl/core/insCflHex3D.okl
rename to okl/core/nrsCflHex3D.okl
index a159c93c4..310214615 100644
--- a/okl/core/insCflHex3D.okl
+++ b/okl/core/nrsCflHex3D.okl
@@ -25,7 +25,7 @@
  */
 
 // Compute Gradient in weak Form
-@kernel void insCflHex3D(const dlong Nelements,
+@kernel void nrsCflHex3D(const dlong Nelements,
                          const dfloat dt,
                          @restrict const dfloat*  vgeo,
                          @restrict const dfloat*  idH,
diff --git a/okl/core/insCurlHex3D.okl b/okl/core/nrsCurlHex3D.okl
similarity index 98%
rename from okl/core/insCurlHex3D.okl
rename to okl/core/nrsCurlHex3D.okl
index 41402173a..d1b03e25f 100644
--- a/okl/core/insCurlHex3D.okl
+++ b/okl/core/nrsCurlHex3D.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insCurlHex3D(const dlong Nelements,
+@kernel void nrsCurlHex3D(const dlong Nelements,
                           @restrict const dfloat*  vgeo,
                           @restrict const dfloat*  const D,
                           const dlong offset,
diff --git a/okl/core/insDivergenceHex3D.okl b/okl/core/nrsDivergenceHex3D.okl
similarity index 77%
rename from okl/core/insDivergenceHex3D.okl
rename to okl/core/nrsDivergenceHex3D.okl
index 82cf660eb..50fab0fc6 100644
--- a/okl/core/insDivergenceHex3D.okl
+++ b/okl/core/nrsDivergenceHex3D.okl
@@ -23,7 +23,7 @@
 
  */
 
-@kernel void insDivergenceVolumeHex3D(const dlong Nelements,
+@kernel void nrsDivergenceVolumeHex3D(const dlong Nelements,
                                       @restrict const dfloat*  vgeo,
                                       @restrict const dfloat*  D,
                                       const dlong offset,
@@ -114,52 +114,35 @@
 
 // adding dp/dn -n\dot F 
 // if slip (dp/dn == 0) -n\dot F else -g0/dt n \dot u^(n+1)
-#define surfaceTerms(sk,face,m, i, j)                                                   \
-  {                                                                                     \
-    struct bcData bc;                                                                   \
-    const dlong idM = vmapM[sk];                                                        \
-                                                                                        \
-    bc.time = time;                                                                     \
-    bc.nx  = sgeo[sk * p_Nsgeo + p_NXID];                                               \
-    bc.ny  = sgeo[sk * p_Nsgeo + p_NYID];                                               \
-    bc.nz  = sgeo[sk * p_Nsgeo + p_NZID];                                               \
-    bc.x   = x[idM];                                                                    \
-    bc.y   = y[idM];                                                                    \
-    bc.z   = z[idM];                                                                    \
-    bc.idM = idM;                                                                       \
-    bc.fieldOffset = offset;                                                            \
-    const dfloat WSJ = sgeo[sk * p_Nsgeo + p_WSJID];                                    \
-                                                                                        \
-    bc.uM  = U[idM + 0 * offset];                                                       \
-    bc.vM  = U[idM + 1 * offset];                                                       \
-    bc.wM  = U[idM + 2 * offset];                                                       \
-    bc.wrk = W;                                                                         \
-    bc.id  = EToBM[face + p_Nfaces * e];                                                \
-    const dlong bcType = EToB[face + p_Nfaces * e];                                     \
-    bc.uP = bc.uM; bc.vP = bc.vM; bc.wP = bc.wM;                                        \
-    dfloat flux = coef * (bc.nx * bc.uP + bc.ny * bc.vP + bc.nz * bc.wP);               \
-    if(bcType == 4 || bcType == 5 || bcType == 6 ) {                                    \
-      const dfloat fx = F[idM + 0 * offset];                                            \
-      const dfloat fy = F[idM + 1 * offset];                                            \
-      const dfloat fz = F[idM + 2 * offset];                                            \
-      flux = bc.nx * fx + bc.ny * fy + bc.nz * fz;                                      \
-    }                                                                                   \
-    s_fluxDiv[m][j][i] = -WSJ*flux;                                                     \
+#define surfaceTerms(sk,face,m, i, j)                                                  \
+  { /* flux = coef*(n.U), or n.F for bcType 4/5/6; stored as -WSJ*flux */              \
+    struct bcData bc;  /* scratch: only nx, ny, nz, u, v, w are set and read here */   \
+    const dlong idM = vmapM[sk];  /* node index used for U and F reads */              \
+    /* relies on e, coef, offset, sgeo, vmapM, EToB, U, F, s_fluxDiv in scope */       \
+    bc.nx = sgeo[sk * p_Nsgeo + p_NXID];  /* presumably the face normal - confirm */   \
+    bc.ny = sgeo[sk * p_Nsgeo + p_NYID];                                               \
+    bc.nz = sgeo[sk * p_Nsgeo + p_NZID];                                               \
+    bc.u  = U[idM + 0 * offset];  /* three fields of U, offset apart */                \
+    bc.v  = U[idM + 1 * offset];                                                       \
+    bc.w  = U[idM + 2 * offset];                                                       \
+    const dfloat WSJ = sgeo[sk * p_Nsgeo + p_WSJID];  /* scales the stored flux */     \
+    const dlong bcType = EToB[face + p_Nfaces * e];  /* one entry per element face */  \
+    dfloat flux = coef * (bc.nx * bc.u + bc.ny * bc.v + bc.nz * bc.w);                 \
+    if(bcType == 4 || bcType == 5 || bcType == 6 ) {                                   \
+      const dfloat fx = F[idM + 0 * offset];                                           \
+      const dfloat fy = F[idM + 1 * offset];                                           \
+      const dfloat fz = F[idM + 2 * offset];                                           \
+      flux = bc.nx * fx + bc.ny * fy + bc.nz * fz;                                     \
+    }                                                                                  \
+    s_fluxDiv[m][j][i] = -WSJ*flux;                                                    \
+  }
 
-@kernel void insDivergenceSurfaceTOMBOHex3D(const dlong Nelements,
-                                            @restrict const dfloat*  vgeo,
+@kernel void nrsDivergenceSurfaceTOMBOHex3D(const dlong Nelements,
                                             @restrict const dfloat*  sgeo,
                                             @restrict const dlong*  vmapM,
-                                            @restrict const int*  EToBM,
                                             @restrict const int*  EToB,
-                                            const dfloat time,
                                             const dfloat coef,
-                                            @restrict const dfloat*  x,
-                                            @restrict const dfloat*  y,
-                                            @restrict const dfloat*  z,
                                             const dlong offset,
-                                            @restrict const dfloat*  W,
                                             @restrict const dfloat*  F,
                                             @restrict const dfloat*  U,
                                             @restrict dfloat*  divU)
diff --git a/okl/core/insExtrapolate.okl b/okl/core/nrsExtrapolate.okl
similarity index 96%
rename from okl/core/insExtrapolate.okl
rename to okl/core/nrsExtrapolate.okl
index 21bf1118a..b11edb94f 100644
--- a/okl/core/insExtrapolate.okl
+++ b/okl/core/nrsExtrapolate.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insMultiExtrapolate(const dlong Nelements,
+@kernel void nrsMultiExtrapolate(const dlong Nelements,
                                  const int Nfields,
                                  const int Nstages,
                                  const dlong fieldOffset,
@@ -51,7 +51,7 @@
   }
 }
 
-@kernel void insExtrapolate(const dlong Nelements,
+@kernel void nrsExtrapolate(const dlong Nelements,
                             const int Nstages,
                             const dlong fieldOffset,
                             const dlong stageOffset,
diff --git a/okl/core/insFilterRTHex3D.okl b/okl/core/nrsFilterRTHex3D.okl
similarity index 98%
rename from okl/core/insFilterRTHex3D.okl
rename to okl/core/nrsFilterRTHex3D.okl
index 614d70b16..30a75dd9d 100644
--- a/okl/core/insFilterRTHex3D.okl
+++ b/okl/core/nrsFilterRTHex3D.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insFilterRTHex3D(const dlong Nelements,
+@kernel void nrsFilterRTHex3D(const dlong Nelements,
                               @restrict const dfloat* fMT,
                               const dfloat filterS,
                               const dlong offset,
@@ -159,7 +159,7 @@
   }
 }
 
-@kernel void insSFilterRTHex3D(const dlong Nelements,
+@kernel void nrsSFilterRTHex3D(const dlong Nelements,
                                @restrict const dfloat* fMT,
                                const dfloat filterS,
                                const dlong offset,
diff --git a/okl/core/insGradientHex3D.okl b/okl/core/nrsGradientHex3D.okl
similarity index 98%
rename from okl/core/insGradientHex3D.okl
rename to okl/core/nrsGradientHex3D.okl
index 99ad089ec..584548dbd 100644
--- a/okl/core/insGradientHex3D.okl
+++ b/okl/core/nrsGradientHex3D.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insGradientVolumeHex3D(const dlong Nelements,
+@kernel void nrsGradientVolumeHex3D(const dlong Nelements,
                                     @restrict const dfloat*  vgeo,
                                     @restrict const dfloat*  D,
                                     const dlong offset,
@@ -88,7 +88,7 @@
   }
 }
 
-@kernel void inswGradientVolumeHex3D(const dlong Nelements,
+@kernel void nrswGradientVolumeHex3D(const dlong Nelements,
                                      @restrict const dfloat*  vgeo,
                                      @restrict const dfloat*  D,
                                      const dlong offset,
diff --git a/okl/core/insMassMatrix.okl b/okl/core/nrsMassMatrix.okl
similarity index 96%
rename from okl/core/insMassMatrix.okl
rename to okl/core/nrsMassMatrix.okl
index 90f1dee6b..35a841589 100644
--- a/okl/core/insMassMatrix.okl
+++ b/okl/core/nrsMassMatrix.okl
@@ -25,7 +25,7 @@
  */
 
 // Computes volume contribution of div(UI)
-@kernel void insMassMatrixHex3D(const dlong Nelements,
+@kernel void nrsMassMatrixHex3D(const dlong Nelements,
                                 const dlong offset,
                                 const int nfield,
                                 @restrict const dfloat*  vgeo,
@@ -50,7 +50,7 @@
 }
 
 // //
-@kernel void insInvMassMatrixHex3D(const dlong Nelements,
+@kernel void nrsInvMassMatrixHex3D(const dlong Nelements,
                                    const dlong offset,
                                    const int nfield,
                                    @restrict const dfloat*  vgeo,
diff --git a/okl/core/insMueDiv.okl b/okl/core/nrsMueDiv.okl
similarity index 89%
rename from okl/core/insMueDiv.okl
rename to okl/core/nrsMueDiv.okl
index 4392dc7cc..845795668 100644
--- a/okl/core/insMueDiv.okl
+++ b/okl/core/nrsMueDiv.okl
@@ -1,4 +1,4 @@
-@kernel void insMueDiv(const dlong N,
+@kernel void nrsMueDiv(const dlong N,
                        const dfloat scale,
                        @restrict const  dfloat *  MUE,
                        @restrict const  dfloat *  DIV,
diff --git a/okl/core/insPQ.okl b/okl/core/nrsPQ.okl
similarity index 91%
rename from okl/core/insPQ.okl
rename to okl/core/nrsPQ.okl
index e988f2f01..d4b0cc09c 100644
--- a/okl/core/insPQ.okl
+++ b/okl/core/nrsPQ.okl
@@ -1,4 +1,4 @@
-@kernel void insPQ(const dlong N,
+@kernel void nrsPQ(const dlong N,
                    const dfloat scale,
                    @restrict const  dfloat *  MUE,
                    @restrict const  dfloat *  DIV,
diff --git a/okl/core/insPressureAddQtl.okl b/okl/core/nrsPressureAddQtl.okl
similarity index 96%
rename from okl/core/insPressureAddQtl.okl
rename to okl/core/nrsPressureAddQtl.okl
index 6aa824ea3..04d57d47b 100644
--- a/okl/core/insPressureAddQtl.okl
+++ b/okl/core/nrsPressureAddQtl.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insPressureAddQtl(const dlong Nelements,
+@kernel void nrsPressureAddQtl(const dlong Nelements,
                                @restrict const dfloat* vgeo,
                                const dfloat lambda,
                                @restrict const dfloat* qtl,
diff --git a/okl/core/insPressureAxHex3D.okl b/okl/core/nrsPressureAxHex3D.okl
similarity index 98%
rename from okl/core/insPressureAxHex3D.okl
rename to okl/core/nrsPressureAxHex3D.okl
index 3e6879c6d..e57aa804d 100644
--- a/okl/core/insPressureAxHex3D.okl
+++ b/okl/core/nrsPressureAxHex3D.okl
@@ -25,7 +25,7 @@
  */
 
 // Computes local [lap(u) + lambda*u] = [-(grad(u), grad(phi)) + lambda*u] operation
-@kernel void insPressureAxHex3D(const dlong Nelements,
+@kernel void nrsPressureAxHex3D(const dlong Nelements,
                                 const dlong offset,
                                 @restrict const dfloat*  ggeo,
                                 @restrict const dfloat*  D,
diff --git a/okl/core/insPressureBCHex3D.okl b/okl/core/nrsPressureBCHex3D.okl
similarity index 89%
rename from okl/core/insPressureBCHex3D.okl
rename to okl/core/nrsPressureBCHex3D.okl
index 01ee05363..a997868a6 100644
--- a/okl/core/insPressureBCHex3D.okl
+++ b/okl/core/nrsPressureBCHex3D.okl
@@ -25,9 +25,8 @@
  */
 
 // We are solving for Pressure Difference
-@kernel void insPressureDirichletBCHex3D(const dlong Nelements,
+@kernel void nrsPressureDirichletBCHex3D(const dlong Nelements,
                                          const dfloat time,
-                                         const dfloat dt,
                                          const dlong offset,
                                          @restrict const dfloat*  sgeo,
                                          @restrict const dfloat*  x,
@@ -61,15 +60,14 @@
         bc.ny = sgeo[sk * p_Nsgeo + p_NYID];
         bc.nz = sgeo[sk * p_Nsgeo + p_NZID];
         // Get Current Solution i.e. Un
-        bc.uM = U[idM + 0 * offset];
-        bc.vM = U[idM + 1 * offset];
-        bc.wM = U[idM + 2 * offset];
-        bc.pM = P[idM];
+        bc.u = U[idM + 0 * offset];
+        bc.v = U[idM + 1 * offset];
+        bc.w = U[idM + 2 * offset];
         bc.wrk = W;
 
-        bc.pP = 0.f;
-        insPressureDirichletConditions3D(&bc);
-        PI[idM] = bc.pP;
+        bc.p = 0.f;
+        pressureDirichletConditions(&bc);
+        PI[idM] = bc.p;
       }
     }
 }
diff --git a/okl/core/insPressureRhsHex3D.okl b/okl/core/nrsPressureRhsHex3D.okl
similarity index 95%
rename from okl/core/insPressureRhsHex3D.okl
rename to okl/core/nrsPressureRhsHex3D.okl
index 69e3b311a..be363d69f 100644
--- a/okl/core/insPressureRhsHex3D.okl
+++ b/okl/core/nrsPressureRhsHex3D.okl
@@ -1,4 +1,4 @@
-@kernel void insPressureRhsTOMBOHex3D(const dlong N,
+@kernel void nrsPressureRhsTOMBOHex3D(const dlong N,
                                       const dlong fieldOffset,
                                       @restrict const dfloat*  MUE,
                                       @restrict const dfloat*  iRHO,
diff --git a/okl/core/insPressureStressHex3D.okl b/okl/core/nrsPressureStressHex3D.okl
similarity index 98%
rename from okl/core/insPressureStressHex3D.okl
rename to okl/core/nrsPressureStressHex3D.okl
index 6d01ea041..6b2abde3f 100644
--- a/okl/core/insPressureStressHex3D.okl
+++ b/okl/core/nrsPressureStressHex3D.okl
@@ -1,4 +1,4 @@
-@kernel void insPressureStressHex3D(const dlong Nelements,
+@kernel void nrsPressureStressHex3D(const dlong Nelements,
                                     @restrict const dfloat* vgeo,
                                     @restrict const dfloat* D,
                                     const dlong offset,
diff --git a/okl/core/insPressureUpdate.okl b/okl/core/nrsPressureUpdate.okl
similarity index 96%
rename from okl/core/insPressureUpdate.okl
rename to okl/core/nrsPressureUpdate.okl
index 2f7269a1e..36f08904b 100644
--- a/okl/core/insPressureUpdate.okl
+++ b/okl/core/nrsPressureUpdate.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insPressureUpdate(const dlong Nelements,
+@kernel void nrsPressureUpdate(const dlong Nelements,
                                @restrict const int*  mapB,
                                @restrict const dfloat*  PI,
                                @restrict const dfloat*  P,
diff --git a/okl/core/insQtlHex3D.okl b/okl/core/nrsQtlHex3D.okl
similarity index 98%
rename from okl/core/insQtlHex3D.okl
rename to okl/core/nrsQtlHex3D.okl
index c68cef967..b5c4794de 100644
--- a/okl/core/insQtlHex3D.okl
+++ b/okl/core/nrsQtlHex3D.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insQtlHex3D(const dlong Nelements,
+@kernel void nrsQtlHex3D(const dlong Nelements,
                          @restrict const dfloat*  vgeo,
                          @restrict const dfloat*  D,
                          const dlong offset,
diff --git a/okl/core/insSubCycleHex3D.okl b/okl/core/nrsSubCycleHex3D.okl
similarity index 97%
rename from okl/core/insSubCycleHex3D.okl
rename to okl/core/nrsSubCycleHex3D.okl
index 965acf2df..df07b7db1 100644
--- a/okl/core/insSubCycleHex3D.okl
+++ b/okl/core/nrsSubCycleHex3D.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insSubCycleStrongCubatureVolumeHex3D(const dlong Nelements,
+@kernel void nrsSubCycleStrongCubatureVolumeHex3D(const dlong Nelements,
                                                   @restrict const dlong*   elementList,
                                                   @restrict const dfloat*  vgeo,
                                                   @restrict const dfloat*  cubvgeo,
@@ -53,13 +53,13 @@
     @shared dfloat s_Vd[p_cubNq][p_cubNq];
     @shared dfloat s_Wd[p_cubNq][p_cubNq];
 
-    @shared dfloat s_U1[p_cubNq][p_cubNq];
-    @shared dfloat s_V1[p_cubNq][p_cubNq];
-    @shared dfloat s_W1[p_cubNq][p_cubNq];
+    @shared dfloat s_U1[p_Nq][p_cubNq];
+    @shared dfloat s_V1[p_Nq][p_cubNq];
+    @shared dfloat s_W1[p_Nq][p_cubNq];
 
-    @shared dfloat s_Ud1[p_cubNq][p_cubNq];
-    @shared dfloat s_Vd1[p_cubNq][p_cubNq];
-    @shared dfloat s_Wd1[p_cubNq][p_cubNq];
+    @shared dfloat s_Ud1[p_Nq][p_cubNq];
+    @shared dfloat s_Vd1[p_Nq][p_cubNq];
+    @shared dfloat s_Wd1[p_Nq][p_cubNq];
 
     @exclusive dfloat r_U[p_cubNq], r_V[p_cubNq], r_W[p_cubNq];
     @exclusive dfloat r_Ud[p_cubNq], r_Vd[p_cubNq], r_Wd[p_cubNq];
@@ -324,7 +324,7 @@
   }
 }
 
-@kernel void insSubCycleStrongVolumeHex3D(const dlong Nelements,
+@kernel void nrsSubCycleStrongVolumeHex3D(const dlong Nelements,
                                           @restrict const dlong*  elementList,
                                           @restrict const dfloat*  vgeo,
                                           @restrict const dfloat*  D,
diff --git a/okl/core/insSubCycleRKUpdate.okl b/okl/core/nrsSubCycleRKUpdate.okl
similarity index 96%
rename from okl/core/insSubCycleRKUpdate.okl
rename to okl/core/nrsSubCycleRKUpdate.okl
index d4a4ac341..7c0ad357a 100644
--- a/okl/core/insSubCycleRKUpdate.okl
+++ b/okl/core/nrsSubCycleRKUpdate.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insSubCycleLSERKUpdate(const dlong Nelements,
+@kernel void nrsSubCycleLSERKUpdate(const dlong Nelements,
                                     const dfloat dt,
                                     const dfloat rka,
                                     const dfloat rkb,
@@ -53,7 +53,7 @@
   }
 }
 
-@kernel void insSubCycleERKUpdate(const dlong Nelements,
+@kernel void nrsSubCycleERKUpdate(const dlong Nelements,
                                   const int stage,
                                   const dfloat dt,
                                   const dlong offset,
@@ -89,7 +89,7 @@
   }
 }
 
-@kernel void insSubCycleExt(const dlong Nelements,
+@kernel void nrsSubCycleExt(const dlong Nelements,
                             const int Nstages,
                             const dlong fieldOffset,
                             @restrict const dfloat*  c,
diff --git a/okl/core/insSumMakefHex3D.okl b/okl/core/nrsSumMakefHex3D.okl
similarity index 98%
rename from okl/core/insSumMakefHex3D.okl
rename to okl/core/nrsSumMakefHex3D.okl
index 4fbc3c157..bd23f89a0 100644
--- a/okl/core/insSumMakefHex3D.okl
+++ b/okl/core/nrsSumMakefHex3D.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insSumMakefHex3D(const dlong Nelements,
+@kernel void nrsSumMakefHex3D(const dlong Nelements,
                               @restrict const dfloat*  vgeo,
                               const dfloat idt,
                               @restrict const dfloat*  extbdfA,
diff --git a/okl/core/insVelocityBCHex3D.okl b/okl/core/nrsVelocityBCHex3D.okl
similarity index 87%
rename from okl/core/insVelocityBCHex3D.okl
rename to okl/core/nrsVelocityBCHex3D.okl
index 1f30c67a5..ca0f6c05a 100644
--- a/okl/core/insVelocityBCHex3D.okl
+++ b/okl/core/nrsVelocityBCHex3D.okl
@@ -37,23 +37,20 @@
     bc.z   = z[bc.idM];                                                 \
     bc.fieldOffset = offset;                                            \
     bc.id  = EToBM[face + p_Nfaces * e];                                \
-    bc.uM  = U[bc.idM + 0 * offset];                                    \
-    bc.vM  = U[bc.idM + 1 * offset];                                    \
-    bc.wM  = U[bc.idM + 2 * offset];                                    \
-    bc.uxP = 0.f;  bc.uyP = 0.f;  bc.uzP = 0.f; bc.uP = 0.f;            \
-    bc.vxP = 0.f;  bc.vyP = 0.f;  bc.vzP = 0.f; bc.vP = 0.f;            \
-    bc.wxP = 0.f;  bc.wyP = 0.f;  bc.wzP = 0.f; bc.wP = 0.f;            \
+    bc.u  = U[bc.idM + 0 * offset];                                     \
+    bc.v  = U[bc.idM + 1 * offset];                                     \
+    bc.w  = U[bc.idM + 2 * offset];                                     \
     bc.wrk = W;                                                         \
     dfloat TRx = 0.f;                                                   \
     dfloat TRy = 0.f;                                                   \
     dfloat TRz = 0.f;                                                   \
     const dlong bcType = EToB[face + p_Nfaces * e];                     \
     if(bcType == 3) {                                                   \
-        bc.pP = 0.f;                                                    \
-        insPressureDirichletConditions3D(&bc);                          \
-        TRx = -bc.pP*bc.nx;                                             \
-        TRy = -bc.pP*bc.ny;                                             \
-        TRz = -bc.pP*bc.nz;                                             \
+        bc.p = 0.f;                                                     \
+        pressureDirichletConditions(&bc);                               \
+        TRx = -bc.p*bc.nx;                                              \
+        TRy = -bc.p*bc.ny;                                              \
+        TRz = -bc.p*bc.nz;                                              \
     }                                                                   \
     s_ndU[j][i] = WsJ*TRx;                                              \
     s_ndV[j][i] = WsJ*TRy;                                              \
@@ -61,7 +58,7 @@
   }
 
 //RHS contributions for continuous solver
-@kernel void insVelocityNeumannBCHex3D(const dlong Nelements,
+@kernel void nrsVelocityNeumannBCHex3D(const dlong Nelements,
                                        const dlong offset,
                                        @restrict const dfloat*  sgeo,
                                        @restrict const dlong*  vmapM,
@@ -256,7 +253,7 @@
   }
 }
 
-@kernel void insVelocityDirichletBCHex3D(const dlong Nelements,
+@kernel void nrsVelocityDirichletBCHex3D(const dlong Nelements,
                                          const dlong offset,
                                          const dfloat time,
                                          @restrict const dfloat*  sgeo,
@@ -293,33 +290,29 @@
           bc.ny = sgeo[sid * p_Nsgeo + p_NYID];
           bc.nz = sgeo[sid * p_Nsgeo + p_NZID];
 
-          bc.uM = U[idM + 0 * offset];
-          bc.vM = U[idM + 1 * offset];
-          bc.wM = U[idM + 2 * offset];
-
-          bc.uP = UH[idM + 0 * offset];
-          bc.vP = UH[idM + 1 * offset];
-          bc.wP = UH[idM + 2 * offset];
-          bc.pP = 0.f;
+          bc.u = U[idM + 0 * offset];
+          bc.v = U[idM + 1 * offset];
+          bc.w = U[idM + 2 * offset];
+          // NOTE(review): bc.p is no longer zeroed here (was bc.pP = 0.f) - confirm it is unused
           bc.wrk = W;
           bc.fieldOffset = offset;
 
           if(bcType == 1) {
-            bc.uP = 0.f;
-            bc.vP = 0.f;
-            bc.wP = 0.f;
+            bc.u = 0.f;
+            bc.v = 0.f;
+            bc.w = 0.f;
           }else if(bcType == 2) {
-            insVelocityDirichletConditions3D(&bc);
+            velocityDirichletConditions(&bc);
           }else if(bcType == 4) {
-            bc.uP = 0.f; // vP = vM ; wP = wM;
+            bc.u = 0.f; // v and w keep the values loaded from U
           }else if(bcType == 5) {
-            bc.vP = 0.f; // uP = vM ; wP = wM;
+            bc.v = 0.f; // u and w keep the values loaded from U
           }else if(bcType == 6) {
-            bc.wP = 0.f; // vP = vM ; uP = uM;
+            bc.w = 0.f; // u and v keep the values loaded from U
           }
-          UH[idM + 0 * offset] = bc.uP;
-          UH[idM + 1 * offset] = bc.vP;
-          UH[idM + 2 * offset] = bc.wP;
+          UH[idM + 0 * offset] = bc.u;
+          UH[idM + 1 * offset] = bc.v;
+          UH[idM + 2 * offset] = bc.w;
         }
       }
     }
diff --git a/okl/core/insVelocityRhsHex3D.okl b/okl/core/nrsVelocityRhsHex3D.okl
similarity index 97%
rename from okl/core/insVelocityRhsHex3D.okl
rename to okl/core/nrsVelocityRhsHex3D.okl
index 7a950a726..56c177818 100644
--- a/okl/core/insVelocityRhsHex3D.okl
+++ b/okl/core/nrsVelocityRhsHex3D.okl
@@ -24,7 +24,7 @@
 
  */
 
-@kernel void insVelocityRhsTOMBOHex3D(const dlong Nelements,
+@kernel void nrsVelocityRhsTOMBOHex3D(const dlong Nelements,
                                       const dlong fieldOffset,
                                       @restrict const dfloat*  BF,
                                       @restrict const dfloat*  GP,
diff --git a/src/libP/okl/put.okl b/okl/core/put.okl
similarity index 100%
rename from src/libP/okl/put.okl
rename to okl/core/put.okl
diff --git a/src/libP/okl/scaledAdd.okl b/okl/core/scaledAdd.okl
similarity index 100%
rename from src/libP/okl/scaledAdd.okl
rename to okl/core/scaledAdd.okl
diff --git a/src/libP/okl/scatter.okl b/okl/core/scatter.okl
similarity index 100%
rename from src/libP/okl/scatter.okl
rename to okl/core/scatter.okl
diff --git a/src/libP/okl/serialDotMultiply.c b/okl/core/serialDotMultiply.c
similarity index 100%
rename from src/libP/okl/serialDotMultiply.c
rename to okl/core/serialDotMultiply.c
diff --git a/src/libP/okl/serialScaledAdd.c b/okl/core/serialScaledAdd.c
similarity index 100%
rename from src/libP/okl/serialScaledAdd.c
rename to okl/core/serialScaledAdd.c
diff --git a/src/libP/okl/serialWeightedInnerProduct2.c b/okl/core/serialWeightedInnerProduct2.c
similarity index 100%
rename from src/libP/okl/serialWeightedInnerProduct2.c
rename to okl/core/serialWeightedInnerProduct2.c
diff --git a/src/libP/okl/serialWeightedNorm2.c b/okl/core/serialWeightedNorm2.c
similarity index 100%
rename from src/libP/okl/serialWeightedNorm2.c
rename to okl/core/serialWeightedNorm2.c
diff --git a/src/libP/okl/sum.okl b/okl/core/sum.okl
similarity index 100%
rename from src/libP/okl/sum.okl
rename to okl/core/sum.okl
diff --git a/src/libP/okl/wadgUpdateKernels.okl b/okl/core/wadgUpdateKernels.okl
similarity index 100%
rename from src/libP/okl/wadgUpdateKernels.okl
rename to okl/core/wadgUpdateKernels.okl
diff --git a/src/libP/okl/weightedInnerProduct1.okl b/okl/core/weightedInnerProduct1.okl
similarity index 100%
rename from src/libP/okl/weightedInnerProduct1.okl
rename to okl/core/weightedInnerProduct1.okl
diff --git a/src/libP/okl/weightedInnerProduct2.okl b/okl/core/weightedInnerProduct2.okl
similarity index 100%
rename from src/libP/okl/weightedInnerProduct2.okl
rename to okl/core/weightedInnerProduct2.okl
diff --git a/src/libP/okl/weightedNorm2.okl b/okl/core/weightedNorm2.okl
similarity index 100%
rename from src/libP/okl/weightedNorm2.okl
rename to okl/core/weightedNorm2.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxHex3D.okl b/okl/elliptic/ellipticAxHex3D.okl
similarity index 99%
rename from src/libP/solvers/elliptic/okl/ellipticAxHex3D.okl
rename to okl/elliptic/ellipticAxHex3D.okl
index 4340df868..6878eb8ba 100644
--- a/src/libP/solvers/elliptic/okl/ellipticAxHex3D.okl
+++ b/okl/elliptic/ellipticAxHex3D.okl
@@ -147,6 +147,7 @@
     }
   }
 }
+
 @kernel void ellipticAxVarHex3D(const dlong Nelements,
                                 const dlong offset,
                                 @restrict const dfloat*  ggeo,
@@ -1372,7 +1373,6 @@
         }
     }
 
-
   for(int j = 0; j < p_Nq; ++j; @outer(1))
     for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
       @shared dfloat s_D[p_Nq][p_Nq];
@@ -1431,7 +1431,6 @@
         }
     }
 
-
   for(int i = 0; i < p_Nq; ++i; @outer(1))
     for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
       @shared dfloat s_D[p_Nq][p_Nq];
@@ -1489,8 +1488,8 @@
           Aq[id] += r_Auk;
         }
     }
-
 }
+
 #endif
 
 #if 0
@@ -1564,7 +1563,6 @@
         }
     }
 
-
   for(int j = 0; j < p_Nq; ++j; @outer(1))
     for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
       @shared dfloat s_D[p_Nq][p_Nq];
@@ -1623,7 +1621,6 @@
         }
     }
 
-
   for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
     @shared dfloat s_D[p_Nq][p_Nq];
     @shared dfloat s_q[p_Nq][p_Nq][p_Nq];
@@ -1718,7 +1715,6 @@
           s_q[k][j][i] = r_q;
         }
 
-
     for(int k = 0; k < p_Nq; ++k; @inner(2))
       for(int j = 0; j < p_Nq; ++j; @inner(1))
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
@@ -1839,7 +1835,6 @@
           s_q[k][j][i] = r_q;
         }
 
-
     for(int k = 0; k < p_Nq; ++k; @inner(2))
       for(int j = 0; j < p_Nq; ++j; @inner(1))
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
diff --git a/src/libP/solvers/elliptic/okl/ellipticBlockAxHex3D.okl b/okl/elliptic/ellipticBlockAxHex3D.okl
similarity index 76%
rename from src/libP/solvers/elliptic/okl/ellipticBlockAxHex3D.okl
rename to okl/elliptic/ellipticBlockAxHex3D.okl
index 1cbfc7c6a..6bee7b0a8 100644
--- a/src/libP/solvers/elliptic/okl/ellipticBlockAxHex3D.okl
+++ b/okl/elliptic/ellipticBlockAxHex3D.okl
@@ -276,6 +276,7 @@
     }
   }
 }
+
 // Currently Implemented for
 @kernel void ellipticBlockAxHex3D_N3(const dlong Nelements,
                                      const dlong offset,
@@ -1707,6 +1708,7 @@
     }
   }
 }
+
 // Currently Implemented for
 @kernel void ellipticBlockPartialAxVarHex3D_N3(const dlong Nelements,
                                                const dlong offset,
@@ -1896,19 +1898,19 @@
     }
   }
 }
+
 @kernel void ellipticStressAxVarHex3D(const dlong Nelements,
-                                 const dlong offset,
-                                 const dlong loffset,
-                                 @restrict const dfloat *vgeo,
-                                 @restrict const dfloat *D,
-                                 @restrict const  dfloat *  S,
-                                 @restrict const dfloat *lambda,
-                                 @restrict const dfloat *q,
-                                 @restrict dfloat *Aq){
-
-
-for(dlong e=0; e<Nelements; ++e; @outer(0)){
-  // AK: heavy memory usage, optimize later
+                                      const dlong offset,
+                                      const dlong loffset,
+                                      @restrict const dfloat* vgeo,
+                                      @restrict const dfloat* D,
+                                      @restrict const dfloat*  S,
+                                      @restrict const dfloat* lambda,
+                                      @restrict const dfloat* q,
+                                      @restrict dfloat* Aq)
+{
+  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
+    // AK: heavy memory usage, optimize later
     @shared dfloat s_D[p_Nq][p_Nq];
 
     @shared dfloat s_U[p_Nq][p_Nq];
@@ -1931,177 +1933,169 @@ for(dlong e=0; e<Nelements; ++e; @outer(0)){
     @shared dfloat s_SWs[p_Nq][p_Nq];
     @exclusive dfloat s_SWt[p_Nq];
     //
-    @exclusive dfloat rx , ry, rz; 
-    @exclusive dfloat sx , sy, sz; 
-    @exclusive dfloat tx , ty, tz; 
-    
-    // Symmetric Stress Tensor
-    @exclusive dfloat s11,s12,s13; 
-    @exclusive dfloat s21,s22,s23; 
-    @exclusive dfloat s31,s32,s33; 
+    @exclusive dfloat rx, ry, rz;
+    @exclusive dfloat sx, sy, sz;
+    @exclusive dfloat tx, ty, tz;
 
+    // Symmetric Stress Tensor
+    @exclusive dfloat s11,s12,s13;
+    @exclusive dfloat s21,s22,s23;
+    @exclusive dfloat s31,s32,s33;
 
     @exclusive dfloat r_Au[p_Nq];
     @exclusive dfloat r_Av[p_Nq];
     @exclusive dfloat r_Aw[p_Nq];
 
-   // prefetch q
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-          if(k==0) s_D[j][i] = D[p_Nq*j+i];
-            const dlong id = e*p_Np+k*p_Nq*p_Nq+j*p_Nq+i;
-            s_U[j][i] = q[id + 0*offset];
-            s_V[j][i] = q[id + 1*offset];
-            s_W[j][i] = q[id + 2*offset];
-          if(k==0){
-            for(int l = 0 ; l < p_Nq; ++l){
-              const dlong other_id = e*p_Np+l*p_Nq*p_Nq+j*p_Nq+i;
-              s_Uloc[l] = q[other_id + 0*offset];
-              s_Vloc[l] = q[other_id + 1*offset];
-              s_Wloc[l] = q[other_id + 2*offset];
+    // prefetch q
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          if(k == 0) s_D[j][i] = D[p_Nq * j + i];
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          s_U[j][i] = q[id + 0 * offset];
+          s_V[j][i] = q[id + 1 * offset];
+          s_W[j][i] = q[id + 2 * offset];
+          if(k == 0) {
+            for(int l = 0; l < p_Nq; ++l) {
+              const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
+              s_Uloc[l] = q[other_id + 0 * offset];
+              s_Vloc[l] = q[other_id + 1 * offset];
+              s_Wloc[l] = q[other_id + 2 * offset];
             }
           }
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){   
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          rx = vgeo[gid + p_RXID*p_Np];
-          ry = vgeo[gid + p_RYID*p_Np];
-          rz = vgeo[gid + p_RZID*p_Np];
-          
-          sx = vgeo[gid + p_SXID*p_Np];
-          sy = vgeo[gid + p_SYID*p_Np];
-          sz = vgeo[gid + p_SZID*p_Np];
-          
-          tx = vgeo[gid + p_TXID*p_Np];
-          ty = vgeo[gid + p_TYID*p_Np];
-          tz = vgeo[gid + p_TZID*p_Np];
-          
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo;
+          rx = vgeo[gid + p_RXID * p_Np];
+          ry = vgeo[gid + p_RYID * p_Np];
+          rz = vgeo[gid + p_RZID * p_Np];
+
+          sx = vgeo[gid + p_SXID * p_Np];
+          sy = vgeo[gid + p_SYID * p_Np];
+          sz = vgeo[gid + p_SZID * p_Np];
+
+          tx = vgeo[gid + p_TXID * p_Np];
+          ty = vgeo[gid + p_TYID * p_Np];
+          tz = vgeo[gid + p_TZID * p_Np];
+
+          const dfloat JW = vgeo[gid + p_JWID * p_Np];
 
           // compute 1D derivatives
           dfloat ur = 0.f, us = 0.f, ut = 0.f;
           dfloat vr = 0.f, vs = 0.f, vt = 0.f;
           dfloat wr = 0.f, ws = 0.f, wt = 0.f;
-          for(int m=0;m<p_Nq;++m){
+          for(int m = 0; m < p_Nq; ++m) {
             const dfloat Dim = s_D[i][m]; // Dr
             const dfloat Djm = s_D[j][m]; // Ds
             const dfloat Dkm = s_D[k][m]; // Dt
 
-            ur += Dim*s_U[j][m];
-            us += Djm*s_U[m][i];
-            ut += Dkm*s_Uloc[m];
+            ur += Dim * s_U[j][m];
+            us += Djm * s_U[m][i];
+            ut += Dkm * s_Uloc[m];
             //
-            vr += Dim*s_V[j][m];
-            vs += Djm*s_V[m][i];
-            vt += Dkm*s_Vloc[m];
+            vr += Dim * s_V[j][m];
+            vs += Djm * s_V[m][i];
+            vt += Dkm * s_Vloc[m];
             //
-            wr += Dim*s_W[j][m];
-            ws += Djm*s_W[m][i];
-            wt += Dkm*s_Wloc[m];
+            wr += Dim * s_W[j][m];
+            ws += Djm * s_W[m][i];
+            wt += Dkm * s_Wloc[m];
           }
 
-          const dlong id = e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;
-          // not sure that we need anistropic diffusion!!!! 
-          // con be simplified for istropic diffusion    
-          const dfloat u_lam0 = lambda[id + 0*offset + 0*loffset]; 
-          const dfloat u_lam1 = lambda[id + 1*offset + 0*loffset];
-          const dfloat v_lam0 = lambda[id + 0*offset + 1*loffset]; 
-          const dfloat v_lam1 = lambda[id + 1*offset + 1*loffset];
-          const dfloat w_lam0 = lambda[id + 0*offset + 2*loffset]; 
-          const dfloat w_lam1 = lambda[id + 1*offset + 2*loffset];
-
-          const dfloat dudx = rx*ur + sx*us + tx*ut; 
-          const dfloat dudy = ry*ur + sy*us + ty*ut; 
-          const dfloat dudz = rz*ur + sz*us + tz*ut; 
-
-          const dfloat dvdx = rx*vr + sx*vs + tx*vt; 
-          const dfloat dvdy = ry*vr + sy*vs + ty*vt; 
-          const dfloat dvdz = rz*vr + sz*vs + tz*vt; 
-
-          const dfloat dwdx = rx*wr + sx*ws + tx*wt; 
-          const dfloat dwdy = ry*wr + sy*ws + ty*wt; 
-          const dfloat dwdz = rz*wr + sz*ws + tz*wt; 
-
-          s11 = u_lam0*JW*(dudx + dudx); 
-          s12 = u_lam0*JW*(dudy + dvdx); 
-          s13 = u_lam0*JW*(dudz + dwdx); 
-
-          s21 = v_lam0*JW*(dvdx + dudy); 
-          s22 = v_lam0*JW*(dvdy + dvdy); 
-          s23 = v_lam0*JW*(dvdz + dwdy); 
-
-          s31 = w_lam0*JW*(dwdx + dudz); 
-          s32 = w_lam0*JW*(dwdy + dvdz); 
-          s33 = w_lam0*JW*(dwdz + dwdz); 
-          // store in register
-          r_Au[k] =  u_lam1*JW*s_U[j][i]; 
-          r_Av[k] =  v_lam1*JW*s_V[j][i];
-          r_Aw[k] =  w_lam1*JW*s_W[j][i];
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          // not sure that we need anisotropic diffusion!!!!
+          // can be simplified for isotropic diffusion
+          const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset];
+          const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset];
+          const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset];
+          const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset];
+          const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset];
+          const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset];
 
-        }
-      }
-      @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dfloat dudx = rx * ur + sx * us + tx * ut;
+          const dfloat dudy = ry * ur + sy * us + ty * ut;
+          const dfloat dudz = rz * ur + sz * us + tz * ut;
+
+          const dfloat dvdx = rx * vr + sx * vs + tx * vt;
+          const dfloat dvdy = ry * vr + sy * vs + ty * vt;
+          const dfloat dvdz = rz * vr + sz * vs + tz * vt;
+
+          const dfloat dwdx = rx * wr + sx * ws + tx * wt;
+          const dfloat dwdy = ry * wr + sy * ws + ty * wt;
+          const dfloat dwdz = rz * wr + sz * ws + tz * wt;
 
-         s_SUr[j][i] =  rx*s11 + ry*s12 + rz*s13;
-         s_SUs[j][i] =  sx*s11 + sy*s12 + sz*s13;
-         s_SUtloc[k] =  tx*s11 + ty*s12 + tz*s13;
-         //
-         s_SVr[j][i] =  rx*s21 + ry*s22 + rz*s23;
-         s_SVs[j][i] =  sx*s21 + sy*s22 + sz*s23;
-         s_SVt[k] =  tx*s21 + ty*s22 + tz*s23;
-         //
-         s_SWr[j][i] =  rx*s31 + ry*s32 + rz*s33;
-         s_SWs[j][i] =  sx*s31 + sy*s32 + sz*s33;
-         s_SWt[k] =  tx*s31 + ty*s32 + tz*s33;
+          s11 = u_lam0 * JW * (dudx + dudx);
+          s12 = u_lam0 * JW * (dudy + dvdx);
+          s13 = u_lam0 * JW * (dudz + dwdx);
 
+          s21 = v_lam0 * JW * (dvdx + dudy);
+          s22 = v_lam0 * JW * (dvdy + dvdy);
+          s23 = v_lam0 * JW * (dvdz + dwdy);
+
+          s31 = w_lam0 * JW * (dwdx + dudz);
+          s32 = w_lam0 * JW * (dwdy + dvdz);
+          s33 = w_lam0 * JW * (dwdz + dwdz);
+          // store in register
+          r_Au[k] =  u_lam1 * JW * s_U[j][i];
+          r_Av[k] =  v_lam1 * JW * s_V[j][i];
+          r_Aw[k] =  w_lam1 * JW * s_W[j][i];
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-           #pragma unroll p_Nq
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          s_SUr[j][i] =  rx * s11 + ry * s12 + rz * s13;
+          s_SUs[j][i] =  sx * s11 + sy * s12 + sz * s13;
+          s_SUtloc[k] =  tx * s11 + ty * s12 + tz * s13;
+          //
+          s_SVr[j][i] =  rx * s21 + ry * s22 + rz * s23;
+          s_SVs[j][i] =  sx * s21 + sy * s22 + sz * s23;
+          s_SVt[k] =  tx * s21 + ty * s22 + tz * s23;
+          //
+          s_SWr[j][i] =  rx * s31 + ry * s32 + rz * s33;
+          s_SWs[j][i] =  sx * s31 + sy * s32 + sz * s33;
+          s_SWt[k] =  tx * s31 + ty * s32 + tz * s33;
+        }
+      @barrier("local");
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dim = s_D[m][i]; // Dr'
             const dfloat Djm = s_D[m][j]; // Ds'
 
-            r_Au[k] += Dim*s_SUr[j][m];
-            r_Au[k] += Djm*s_SUs[m][i];
+            r_Au[k] += Dim * s_SUr[j][m];
+            r_Au[k] += Djm * s_SUs[m][i];
 
-            r_Av[k] += Dim*s_SVr[j][m];
-            r_Av[k] += Djm*s_SVs[m][i];
+            r_Av[k] += Dim * s_SVr[j][m];
+            r_Av[k] += Djm * s_SVs[m][i];
 
-            r_Aw[k] += Dim*s_SWr[j][m];
-            r_Aw[k] += Djm*s_SWs[m][i];
+            r_Aw[k] += Dim * s_SWr[j][m];
+            r_Aw[k] += Djm * s_SWs[m][i];
           }
         }
       }
       @barrier("local");
     }
 
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-           #pragma unroll p_Nq
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dkm = s_D[m][k]; // Dt'
 
-            r_Au[k] += Dkm*s_SUtloc[m];
+            r_Au[k] += Dkm * s_SUtloc[m];
 
-            r_Av[k] += Dkm*s_SVt[m];
+            r_Av[k] += Dkm * s_SVt[m];
 
-            r_Aw[k] += Dkm*s_SWt[m];
+            r_Aw[k] += Dkm * s_SWt[m];
           }
-          const dlong id = e*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
-          Aq[id+0*offset] = r_Au[k];
-          Aq[id+1*offset] = r_Av[k];
-          Aq[id+2*offset] = r_Aw[k];
-        
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          Aq[id + 0 * offset] = r_Au[k];
+          Aq[id + 1 * offset] = r_Av[k];
+          Aq[id + 2 * offset] = r_Aw[k];
         }
       }
     }
@@ -2110,19 +2104,17 @@ for(dlong e=0; e<Nelements; ++e; @outer(0)){
 
 //
 @kernel void ellipticStressPartialAxVarHex3D(const dlong Nelements,
-                                 const dlong offset,
-                                 const dlong loffset,
-                                 @restrict const  dlong *elementList,
-                                 @restrict const dfloat *vgeo,
-                                 @restrict const dfloat *D,
-                                 @restrict const  dfloat *  S,
-                                 @restrict const dfloat *lambda,
-                                 @restrict const dfloat *q,
-                                 @restrict dfloat *Aq){
-
-
-for(dlong e=0; e<Nelements; ++e; @outer(0)){
-
+                                             const dlong offset,
+                                             const dlong loffset,
+                                             @restrict const dlong* elementList,
+                                             @restrict const dfloat* vgeo,
+                                             @restrict const dfloat* D,
+                                             @restrict const dfloat*  S,
+                                             @restrict const dfloat* lambda,
+                                             @restrict const dfloat* q,
+                                             @restrict dfloat* Aq)
+{
+  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
     @shared dfloat s_D[p_Nq][p_Nq];
 
     @shared dfloat s_U[p_Nq][p_Nq];
@@ -2144,199 +2136,188 @@ for(dlong e=0; e<Nelements; ++e; @outer(0)){
     @exclusive dfloat s_SWt[p_Nq];
 
     //
-    @exclusive dfloat rx , ry, rz; 
-    @exclusive dfloat sx , sy, sz; 
-    @exclusive dfloat tx , ty, tz; 
+    @exclusive dfloat rx, ry, rz;
+    @exclusive dfloat sx, sy, sz;
+    @exclusive dfloat tx, ty, tz;
     // Symmetric Stress Tensor
-    @exclusive dfloat s11,s12,s13; 
-    @exclusive dfloat s21,s22,s23; 
-    @exclusive dfloat s31,s32,s33; 
-
+    @exclusive dfloat s11,s12,s13;
+    @exclusive dfloat s21,s22,s23;
+    @exclusive dfloat s31,s32,s33;
 
     @exclusive dfloat r_Au[p_Nq];
     @exclusive dfloat r_Av[p_Nq];
     @exclusive dfloat r_Aw[p_Nq];
     @exclusive dlong element;
 
-   // prefetch q
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
+    // prefetch q
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           element = elementList[e];
-          if(k==0) s_D[j][i] = D[p_Nq*j+i];
-            const dlong id = element*p_Np+k*p_Nq*p_Nq+j*p_Nq+i;
-            s_U[j][i] =q[id + 0*offset];
-            s_V[j][i] =q[id + 1*offset];
-            s_W[j][i] =q[id + 2*offset];
-          if(k==0){
-            for(int l = 0 ; l < p_Nq; ++l){
-              const dlong other_id = element*p_Np+l*p_Nq*p_Nq+j*p_Nq+i;
-              s_Uloc[l] =q[other_id + 0*offset];
-              s_Vloc[l] =q[other_id + 1*offset];
-              s_Wloc[l] =q[other_id + 2*offset];
+          if(k == 0) s_D[j][i] = D[p_Nq * j + i];
+          const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          s_U[j][i] = q[id + 0 * offset];
+          s_V[j][i] = q[id + 1 * offset];
+          s_W[j][i] = q[id + 2 * offset];
+          if(k == 0) {
+            for(int l = 0; l < p_Nq; ++l) {
+              const dlong other_id = element * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
+              s_Uloc[l] = q[other_id + 0 * offset];
+              s_Vloc[l] = q[other_id + 1 * offset];
+              s_Wloc[l] = q[other_id + 2 * offset];
             }
           }
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){   
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + element*p_Np*p_Nvgeo;
-          rx = vgeo[gid + p_RXID*p_Np];
-          ry = vgeo[gid + p_RYID*p_Np];
-          rz = vgeo[gid + p_RZID*p_Np];
-          
-          sx = vgeo[gid + p_SXID*p_Np];
-          sy = vgeo[gid + p_SYID*p_Np];
-          sz = vgeo[gid + p_SZID*p_Np];
-          
-          tx = vgeo[gid + p_TXID*p_Np];
-          ty = vgeo[gid + p_TYID*p_Np];
-          tz = vgeo[gid + p_TZID*p_Np];
-          
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np * p_Nvgeo;
+          rx = vgeo[gid + p_RXID * p_Np];
+          ry = vgeo[gid + p_RYID * p_Np];
+          rz = vgeo[gid + p_RZID * p_Np];
+
+          sx = vgeo[gid + p_SXID * p_Np];
+          sy = vgeo[gid + p_SYID * p_Np];
+          sz = vgeo[gid + p_SZID * p_Np];
+
+          tx = vgeo[gid + p_TXID * p_Np];
+          ty = vgeo[gid + p_TYID * p_Np];
+          tz = vgeo[gid + p_TZID * p_Np];
+
+          const dfloat JW = vgeo[gid + p_JWID * p_Np];
 
           // compute 1D derivatives
           dfloat ur = 0.f, us = 0.f, ut = 0.f;
           dfloat vr = 0.f, vs = 0.f, vt = 0.f;
           dfloat wr = 0.f, ws = 0.f, wt = 0.f;
-          for(int m=0;m<p_Nq;++m){
+          for(int m = 0; m < p_Nq; ++m) {
             const dfloat Dim = s_D[i][m]; // Dr
             const dfloat Djm = s_D[j][m]; // Ds
             const dfloat Dkm = s_D[k][m]; // Dt
 
-            ur += Dim*s_U[j][m];
-            us += Djm*s_U[m][i];
-            ut += Dkm*s_Uloc[m];
+            ur += Dim * s_U[j][m];
+            us += Djm * s_U[m][i];
+            ut += Dkm * s_Uloc[m];
             //
-            vr += Dim*s_V[j][m];
-            vs += Djm*s_V[m][i];
-            vt += Dkm*s_Vloc[m];
+            vr += Dim * s_V[j][m];
+            vs += Djm * s_V[m][i];
+            vt += Dkm * s_Vloc[m];
             //
-            wr += Dim*s_W[j][m];
-            ws += Djm*s_W[m][i];
-            wt += Dkm*s_Wloc[m];
+            wr += Dim * s_W[j][m];
+            ws += Djm * s_W[m][i];
+            wt += Dkm * s_Wloc[m];
           }
 
-          const dlong id = element*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;
-          
-          const dfloat u_lam0 = lambda[id + 0*offset + 0*loffset]; 
-          const dfloat u_lam1 = lambda[id + 1*offset + 0*loffset];
-          const dfloat v_lam0 = lambda[id + 0*offset + 1*loffset]; 
-          const dfloat v_lam1 = lambda[id + 1*offset + 1*loffset];
-          const dfloat w_lam0 = lambda[id + 0*offset + 2*loffset]; 
-          const dfloat w_lam1 = lambda[id + 1*offset + 2*loffset];
-
-          const dfloat dudx = rx*ur + sx*us + tx*ut; 
-          const dfloat dudy = ry*ur + sy*us + ty*ut; 
-          const dfloat dudz = rz*ur + sz*us + tz*ut; 
-
-          const dfloat dvdx = rx*vr + sx*vs + tx*vt; 
-          const dfloat dvdy = ry*vr + sy*vs + ty*vt; 
-          const dfloat dvdz = rz*vr + sz*vs + tz*vt; 
-
-          const dfloat dwdx = rx*wr + sx*ws + tx*wt; 
-          const dfloat dwdy = ry*wr + sy*ws + ty*wt; 
-          const dfloat dwdz = rz*wr + sz*ws + tz*wt; 
-
-          s11 = u_lam0*JW*(dudx + dudx); 
-          s12 = u_lam0*JW*(dudy + dvdx); 
-          s13 = u_lam0*JW*(dudz + dwdx); 
-
-          s21 = v_lam0*JW*(dvdx + dudy); 
-          s22 = v_lam0*JW*(dvdy + dvdy); 
-          s23 = v_lam0*JW*(dvdz + dwdy); 
-
-          s31 = w_lam0*JW*(dwdx + dudz); 
-          s32 = w_lam0*JW*(dwdy + dvdz); 
-          s33 = w_lam0*JW*(dwdz + dwdz); 
+          const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+
+          const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset];
+          const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset];
+          const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset];
+          const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset];
+          const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset];
+          const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset];
+
+          const dfloat dudx = rx * ur + sx * us + tx * ut;
+          const dfloat dudy = ry * ur + sy * us + ty * ut;
+          const dfloat dudz = rz * ur + sz * us + tz * ut;
+
+          const dfloat dvdx = rx * vr + sx * vs + tx * vt;
+          const dfloat dvdy = ry * vr + sy * vs + ty * vt;
+          const dfloat dvdz = rz * vr + sz * vs + tz * vt;
+
+          const dfloat dwdx = rx * wr + sx * ws + tx * wt;
+          const dfloat dwdy = ry * wr + sy * ws + ty * wt;
+          const dfloat dwdz = rz * wr + sz * ws + tz * wt;
+
+          s11 = u_lam0 * JW * (dudx + dudx);
+          s12 = u_lam0 * JW * (dudy + dvdx);
+          s13 = u_lam0 * JW * (dudz + dwdx);
+
+          s21 = v_lam0 * JW * (dvdx + dudy);
+          s22 = v_lam0 * JW * (dvdy + dvdy);
+          s23 = v_lam0 * JW * (dvdz + dwdy);
+
+          s31 = w_lam0 * JW * (dwdx + dudz);
+          s32 = w_lam0 * JW * (dwdy + dvdz);
+          s33 = w_lam0 * JW * (dwdz + dwdz);
           // store in register
-          r_Au[k] =  u_lam1*JW*s_U[j][i]; 
-          r_Av[k] =  v_lam1*JW*s_V[j][i];
-          r_Aw[k] =  w_lam1*JW*s_W[j][i];
+          r_Au[k] =  u_lam1 * JW * s_U[j][i];
+          r_Av[k] =  v_lam1 * JW * s_V[j][i];
+          r_Aw[k] =  w_lam1 * JW * s_W[j][i];
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-         s_SUr[j][i] =  rx*s11 + ry*s12 + rz*s13;
-         s_SUs[j][i] =  sx*s11 + sy*s12 + sz*s13;
-         s_SUtloc[k] =  tx*s11 + ty*s12 + tz*s13;
-         //
-         s_SVr[j][i] =  rx*s21 + ry*s22 + rz*s23;
-         s_SVs[j][i] =  sx*s21 + sy*s22 + sz*s23;
-         s_SVt[k] =  tx*s21 + ty*s22 + tz*s23;
-         //
-         s_SWr[j][i] =  rx*s31 + ry*s32 + rz*s33;
-         s_SWs[j][i] =  sx*s31 + sy*s32 + sz*s33;
-         s_SWt[k] =  tx*s31 + ty*s32 + tz*s33;
-
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          s_SUr[j][i] =  rx * s11 + ry * s12 + rz * s13;
+          s_SUs[j][i] =  sx * s11 + sy * s12 + sz * s13;
+          s_SUtloc[k] =  tx * s11 + ty * s12 + tz * s13;
+          //
+          s_SVr[j][i] =  rx * s21 + ry * s22 + rz * s23;
+          s_SVs[j][i] =  sx * s21 + sy * s22 + sz * s23;
+          s_SVt[k] =  tx * s21 + ty * s22 + tz * s23;
+          //
+          s_SWr[j][i] =  rx * s31 + ry * s32 + rz * s33;
+          s_SWs[j][i] =  sx * s31 + sy * s32 + sz * s33;
+          s_SWt[k] =  tx * s31 + ty * s32 + tz * s33;
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-          #pragma unroll p_Nq
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dim = s_D[m][i]; // Dr'
             const dfloat Djm = s_D[m][j]; // Ds'
 
-            r_Au[k] += Dim*s_SUr[j][m];
-            r_Au[k] += Djm*s_SUs[m][i];
+            r_Au[k] += Dim * s_SUr[j][m];
+            r_Au[k] += Djm * s_SUs[m][i];
 
-            r_Av[k] += Dim*s_SVr[j][m];
-            r_Av[k] += Djm*s_SVs[m][i];
+            r_Av[k] += Dim * s_SVr[j][m];
+            r_Av[k] += Djm * s_SVs[m][i];
 
-            r_Aw[k] += Dim*s_SWr[j][m];
-            r_Aw[k] += Djm*s_SWs[m][i];
+            r_Aw[k] += Dim * s_SWr[j][m];
+            r_Aw[k] += Djm * s_SWs[m][i];
           }
         }
       }
     }
 
 // loop over slabs
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-          #pragma unroll p_Nq
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dkm = s_D[m][k]; // Dt'
 
-            r_Au[k] += Dkm*s_SUtloc[m];
+            r_Au[k] += Dkm * s_SUtloc[m];
 
-            r_Av[k] += Dkm*s_SVt[m];
+            r_Av[k] += Dkm * s_SVt[m];
 
-            r_Aw[k] += Dkm*s_SWt[m];
+            r_Aw[k] += Dkm * s_SWt[m];
           }
-          const dlong id = element*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
-          Aq[id+0*offset] = r_Au[k];
-          Aq[id+1*offset] = r_Av[k];
-          Aq[id+2*offset] = r_Aw[k];
-        
+          const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          Aq[id + 0 * offset] = r_Au[k];
+          Aq[id + 1 * offset] = r_Av[k];
+          Aq[id + 2 * offset] = r_Aw[k];
         }
       }
     }
   }
 }
 
-
-
-
-
 //
 @kernel void ellipticStressAxHex3D(const dlong Nelements,
-                                 const dlong offset,
-                                 const dlong loffset,
-                                 @restrict const dfloat *vgeo,
-                                 @restrict const dfloat *D,
-                                 @restrict const  dfloat *  S,
-                                 @restrict const dfloat *lambda,
-                                 @restrict const dfloat *q,
-                                 @restrict dfloat *Aq){
-
-
-for(dlong e=0; e<Nelements; ++e; @outer(0)){
-  // AK: heavy memory usage, optimize later
+                                   const dlong offset,
+                                   const dlong loffset,
+                                   @restrict const dfloat* vgeo,
+                                   @restrict const dfloat* D,
+                                   @restrict const dfloat*  S,
+                                   @restrict const dfloat* lambda,
+                                   @restrict const dfloat* q,
+                                   @restrict dfloat* Aq)
+{
+  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
+    // AK: heavy memory usage, optimize later
     @shared dfloat s_D[p_Nq][p_Nq];
 
     @shared dfloat s_U[p_Nq][p_Nq];
@@ -2357,169 +2338,160 @@ for(dlong e=0; e<Nelements; ++e; @outer(0)){
     @shared dfloat s_SWs[p_Nq][p_Nq];
     @exclusive dfloat s_SWt[p_Nq];
     //
-    @exclusive dfloat rx , ry, rz; 
-    @exclusive dfloat sx , sy, sz; 
-    @exclusive dfloat tx , ty, tz; 
-    
-    // Symmetric Stress Tensor
-    @exclusive dfloat s11,s12,s13; 
-    @exclusive dfloat s21,s22,s23; 
-    @exclusive dfloat s31,s32,s33; 
+    @exclusive dfloat rx, ry, rz;
+    @exclusive dfloat sx, sy, sz;
+    @exclusive dfloat tx, ty, tz;
 
+    // Symmetric Stress Tensor
+    @exclusive dfloat s11,s12,s13;
+    @exclusive dfloat s21,s22,s23;
+    @exclusive dfloat s31,s32,s33;
 
     @exclusive dfloat r_Au[p_Nq];
     @exclusive dfloat r_Av[p_Nq];
     @exclusive dfloat r_Aw[p_Nq];
 
-   // prefetch q
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-          if(k==0) s_D[j][i] = D[p_Nq*j+i];
-            const dlong id = e*p_Np+k*p_Nq*p_Nq+j*p_Nq+i;
-            s_U[j][i] = q[id + 0*offset];
-            s_V[j][i] = q[id + 1*offset];
-            s_W[j][i] = q[id + 2*offset];
-          if(k==0){
-            for(int l = 0 ; l < p_Nq; ++l){
-              const dlong other_id = e*p_Np+l*p_Nq*p_Nq+j*p_Nq+i;
-              s_Uloc[l] = q[other_id + 0*offset];
-              s_Vloc[l] = q[other_id + 1*offset];
-              s_Wloc[l] = q[other_id + 2*offset];
+    // prefetch q
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          if(k == 0) s_D[j][i] = D[p_Nq * j + i];
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          s_U[j][i] = q[id + 0 * offset];
+          s_V[j][i] = q[id + 1 * offset];
+          s_W[j][i] = q[id + 2 * offset];
+          if(k == 0) {
+            for(int l = 0; l < p_Nq; ++l) {
+              const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
+              s_Uloc[l] = q[other_id + 0 * offset];
+              s_Vloc[l] = q[other_id + 1 * offset];
+              s_Wloc[l] = q[other_id + 2 * offset];
             }
           }
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){   
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          rx = vgeo[gid + p_RXID*p_Np];
-          ry = vgeo[gid + p_RYID*p_Np];
-          rz = vgeo[gid + p_RZID*p_Np];
-          
-          sx = vgeo[gid + p_SXID*p_Np];
-          sy = vgeo[gid + p_SYID*p_Np];
-          sz = vgeo[gid + p_SZID*p_Np];
-          
-          tx = vgeo[gid + p_TXID*p_Np];
-          ty = vgeo[gid + p_TYID*p_Np];
-          tz = vgeo[gid + p_TZID*p_Np];
-          
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo;
+          rx = vgeo[gid + p_RXID * p_Np];
+          ry = vgeo[gid + p_RYID * p_Np];
+          rz = vgeo[gid + p_RZID * p_Np];
+
+          sx = vgeo[gid + p_SXID * p_Np];
+          sy = vgeo[gid + p_SYID * p_Np];
+          sz = vgeo[gid + p_SZID * p_Np];
+
+          tx = vgeo[gid + p_TXID * p_Np];
+          ty = vgeo[gid + p_TYID * p_Np];
+          tz = vgeo[gid + p_TZID * p_Np];
+
+          const dfloat JW = vgeo[gid + p_JWID * p_Np];
 
           // compute 1D derivatives
           dfloat ur = 0.f, us = 0.f, ut = 0.f;
           dfloat vr = 0.f, vs = 0.f, vt = 0.f;
           dfloat wr = 0.f, ws = 0.f, wt = 0.f;
-          for(int m=0;m<p_Nq;++m){
+          for(int m = 0; m < p_Nq; ++m) {
             const dfloat Dim = s_D[i][m]; // Dr
             const dfloat Djm = s_D[j][m]; // Ds
             const dfloat Dkm = s_D[k][m]; // Dt
 
-            ur += Dim*s_U[j][m];
-            us += Djm*s_U[m][i];
-            ut += Dkm*s_Uloc[m];
+            ur += Dim * s_U[j][m];
+            us += Djm * s_U[m][i];
+            ut += Dkm * s_Uloc[m];
             //
-            vr += Dim*s_V[j][m];
-            vs += Djm*s_V[m][i];
-            vt += Dkm*s_Vloc[m];
+            vr += Dim * s_V[j][m];
+            vs += Djm * s_V[m][i];
+            vt += Dkm * s_Vloc[m];
             //
-            wr += Dim*s_W[j][m];
-            ws += Djm*s_W[m][i];
-            wt += Dkm*s_Wloc[m];
+            wr += Dim * s_W[j][m];
+            ws += Djm * s_W[m][i];
+            wt += Dkm * s_Wloc[m];
           }
 
-          const dlong id = e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;
-         
-          const dfloat dudx = rx*ur + sx*us + tx*ut; 
-          const dfloat dudy = ry*ur + sy*us + ty*ut; 
-          const dfloat dudz = rz*ur + sz*us + tz*ut; 
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
 
-          const dfloat dvdx = rx*vr + sx*vs + tx*vt; 
-          const dfloat dvdy = ry*vr + sy*vs + ty*vt; 
-          const dfloat dvdz = rz*vr + sz*vs + tz*vt; 
+          const dfloat dudx = rx * ur + sx * us + tx * ut;
+          const dfloat dudy = ry * ur + sy * us + ty * ut;
+          const dfloat dudz = rz * ur + sz * us + tz * ut;
 
-          const dfloat dwdx = rx*wr + sx*ws + tx*wt; 
-          const dfloat dwdy = ry*wr + sy*ws + ty*wt; 
-          const dfloat dwdz = rz*wr + sz*ws + tz*wt; 
+          const dfloat dvdx = rx * vr + sx * vs + tx * vt;
+          const dfloat dvdy = ry * vr + sy * vs + ty * vt;
+          const dfloat dvdz = rz * vr + sz * vs + tz * vt;
 
-          s11 = JW*(dudx + dudx); 
-          s12 = JW*(dudy + dvdx); 
-          s13 = JW*(dudz + dwdx); 
+          const dfloat dwdx = rx * wr + sx * ws + tx * wt;
+          const dfloat dwdy = ry * wr + sy * ws + ty * wt;
+          const dfloat dwdz = rz * wr + sz * ws + tz * wt;
 
-          s21 = JW*(dvdx + dudy); 
-          s22 = JW*(dvdy + dvdy); 
-          s23 = JW*(dvdz + dwdy); 
+          s11 = JW * (dudx + dudx);
+          s12 = JW * (dudy + dvdx);
+          s13 = JW * (dudz + dwdx);
 
-          s31 = JW*(dwdx + dudz); 
-          s32 = JW*(dwdy + dvdz); 
-          s33 = JW*(dwdz + dwdz); 
-          // store in register
-          r_Au[k] =  lambda[id+ 1 * offset + 0*loffset]*JW*s_U[j][i]; 
-          r_Av[k] =  lambda[id+ 1 * offset + 1*loffset]*JW*s_V[j][i];
-          r_Aw[k] =  lambda[id+ 1 * offset + 2*loffset]*JW*s_W[j][i];
+          s21 = JW * (dvdx + dudy);
+          s22 = JW * (dvdy + dvdy);
+          s23 = JW * (dvdz + dwdy);
 
+          s31 = JW * (dwdx + dudz);
+          s32 = JW * (dwdy + dvdz);
+          s33 = JW * (dwdz + dwdz);
+          // store in register
+          r_Au[k] =  lambda[id + 1 * offset + 0 * loffset] * JW * s_U[j][i];
+          r_Av[k] =  lambda[id + 1 * offset + 1 * loffset] * JW * s_V[j][i];
+          r_Aw[k] =  lambda[id + 1 * offset + 2 * loffset] * JW * s_W[j][i];
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-
-         s_SUr[j][i] =  rx*s11 + ry*s12 + rz*s13;
-         s_SUs[j][i] =  sx*s11 + sy*s12 + sz*s13;
-         s_SUtloc[k] =  tx*s11 + ty*s12 + tz*s13;
-         //
-         s_SVr[j][i] =  rx*s21 + ry*s22 + rz*s23;
-         s_SVs[j][i] =  sx*s21 + sy*s22 + sz*s23;
-         s_SVt[k] =  tx*s21 + ty*s22 + tz*s23;
-         //
-         s_SWr[j][i] =  rx*s31 + ry*s32 + rz*s33;
-         s_SWs[j][i] =  sx*s31 + sy*s32 + sz*s33;
-         s_SWt[k] =  tx*s31 + ty*s32 + tz*s33;
-
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          s_SUr[j][i] =  rx * s11 + ry * s12 + rz * s13;
+          s_SUs[j][i] =  sx * s11 + sy * s12 + sz * s13;
+          s_SUtloc[k] =  tx * s11 + ty * s12 + tz * s13;
+          //
+          s_SVr[j][i] =  rx * s21 + ry * s22 + rz * s23;
+          s_SVs[j][i] =  sx * s21 + sy * s22 + sz * s23;
+          s_SVt[k] =  tx * s21 + ty * s22 + tz * s23;
+          //
+          s_SWr[j][i] =  rx * s31 + ry * s32 + rz * s33;
+          s_SWs[j][i] =  sx * s31 + sy * s32 + sz * s33;
+          s_SWt[k] =  tx * s31 + ty * s32 + tz * s33;
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-           #pragma unroll p_Nq
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dim = s_D[m][i]; // Dr'
             const dfloat Djm = s_D[m][j]; // Ds'
 
-            r_Au[k] += Dim*s_SUr[j][m];
-            r_Au[k] += Djm*s_SUs[m][i];
+            r_Au[k] += Dim * s_SUr[j][m];
+            r_Au[k] += Djm * s_SUs[m][i];
 
-            r_Av[k] += Dim*s_SVr[j][m];
-            r_Av[k] += Djm*s_SVs[m][i];
+            r_Av[k] += Dim * s_SVr[j][m];
+            r_Av[k] += Djm * s_SVs[m][i];
 
-            r_Aw[k] += Dim*s_SWr[j][m];
-            r_Aw[k] += Djm*s_SWs[m][i];
+            r_Aw[k] += Dim * s_SWr[j][m];
+            r_Aw[k] += Djm * s_SWs[m][i];
           }
         }
       }
     }
-   
 
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-           #pragma unroll p_Nq
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dkm = s_D[m][k]; // Dt'
 
-            r_Au[k] += Dkm*s_SUtloc[m];
+            r_Au[k] += Dkm * s_SUtloc[m];
 
-            r_Av[k] += Dkm*s_SVt[m];
+            r_Av[k] += Dkm * s_SVt[m];
 
-            r_Aw[k] += Dkm*s_SWt[m];
+            r_Aw[k] += Dkm * s_SWt[m];
           }
-          const dlong id = e*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
-          Aq[id+0*offset] = r_Au[k];
-          Aq[id+1*offset] = r_Av[k];
-          Aq[id+2*offset] = r_Aw[k];
-        
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          Aq[id + 0 * offset] = r_Au[k];
+          Aq[id + 1 * offset] = r_Av[k];
+          Aq[id + 2 * offset] = r_Aw[k];
         }
       }
     }
@@ -2528,19 +2500,17 @@ for(dlong e=0; e<Nelements; ++e; @outer(0)){
 
 //
 @kernel void ellipticStressPartialAxHex3D(const dlong Nelements,
-                                 const dlong offset,
-                                 const dlong loffset,
-                                 @restrict const  dlong *elementList,
-                                 @restrict const dfloat *vgeo,
-                                 @restrict const dfloat *D,
-                                 @restrict const  dfloat *  S,
-                                 @restrict const dfloat *lambda,
-                                 @restrict const dfloat *q,
-                                 @restrict dfloat *Aq){
-
-
-for(dlong e=0; e<Nelements; ++e; @outer(0)){
-
+                                          const dlong offset,
+                                          const dlong loffset,
+                                          @restrict const dlong* elementList,
+                                          @restrict const dfloat* vgeo,
+                                          @restrict const dfloat* D,
+                                          @restrict const dfloat*  S,
+                                          @restrict const dfloat* lambda,
+                                          @restrict const dfloat* q,
+                                          @restrict dfloat* Aq)
+{
+  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
     @shared dfloat s_D[p_Nq][p_Nq];
 
     @shared dfloat s_U[p_Nq][p_Nq];
@@ -2562,165 +2532,159 @@ for(dlong e=0; e<Nelements; ++e; @outer(0)){
     @exclusive dfloat s_SWt[p_Nq];
 
     //
-    @exclusive dfloat rx , ry, rz; 
-    @exclusive dfloat sx , sy, sz; 
-    @exclusive dfloat tx , ty, tz; 
+    @exclusive dfloat rx, ry, rz;
+    @exclusive dfloat sx, sy, sz;
+    @exclusive dfloat tx, ty, tz;
     // Symmetric Stress Tensor
-    @exclusive dfloat s11,s12,s13; 
-    @exclusive dfloat s21,s22,s23; 
-    @exclusive dfloat s31,s32,s33; 
-
+    @exclusive dfloat s11,s12,s13;
+    @exclusive dfloat s21,s22,s23;
+    @exclusive dfloat s31,s32,s33;
 
     @exclusive dfloat r_Au[p_Nq];
     @exclusive dfloat r_Av[p_Nq];
     @exclusive dfloat r_Aw[p_Nq];
     @exclusive dlong element;
 
-   // prefetch q
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
+    // prefetch q
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           element = elementList[e];
-          if(k==0) s_D[j][i] = D[p_Nq*j+i];
-            const dlong id = element*p_Np+k*p_Nq*p_Nq+j*p_Nq+i;
-            s_U[j][i] =q[id + 0*offset];
-            s_V[j][i] =q[id + 1*offset];
-            s_W[j][i] =q[id + 2*offset];
-          if(k==0){
-            for(int l = 0; l < p_Nq; ++l){
-              const dlong other_id = element*p_Np+l*p_Nq*p_Nq+j*p_Nq+i;
-              s_Uloc[l] =q[other_id + 0*offset];
-              s_Vloc[l] =q[other_id + 1*offset];
-              s_Wloc[l] =q[other_id + 2*offset];
+          if(k == 0) s_D[j][i] = D[p_Nq * j + i];
+          const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          s_U[j][i] = q[id + 0 * offset];
+          s_V[j][i] = q[id + 1 * offset];
+          s_W[j][i] = q[id + 2 * offset];
+          if(k == 0) {
+            for(int l = 0; l < p_Nq; ++l) {
+              const dlong other_id = element * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
+              s_Uloc[l] = q[other_id + 0 * offset];
+              s_Vloc[l] = q[other_id + 1 * offset];
+              s_Wloc[l] = q[other_id + 2 * offset];
             }
           }
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){   
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + element*p_Np*p_Nvgeo;
-          rx = vgeo[gid + p_RXID*p_Np];
-          ry = vgeo[gid + p_RYID*p_Np];
-          rz = vgeo[gid + p_RZID*p_Np];
-          
-          sx = vgeo[gid + p_SXID*p_Np];
-          sy = vgeo[gid + p_SYID*p_Np];
-          sz = vgeo[gid + p_SZID*p_Np];
-          
-          tx = vgeo[gid + p_TXID*p_Np];
-          ty = vgeo[gid + p_TYID*p_Np];
-          tz = vgeo[gid + p_TZID*p_Np];
-          
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np * p_Nvgeo;
+          rx = vgeo[gid + p_RXID * p_Np];
+          ry = vgeo[gid + p_RYID * p_Np];
+          rz = vgeo[gid + p_RZID * p_Np];
+
+          sx = vgeo[gid + p_SXID * p_Np];
+          sy = vgeo[gid + p_SYID * p_Np];
+          sz = vgeo[gid + p_SZID * p_Np];
+
+          tx = vgeo[gid + p_TXID * p_Np];
+          ty = vgeo[gid + p_TYID * p_Np];
+          tz = vgeo[gid + p_TZID * p_Np];
+
+          const dfloat JW = vgeo[gid + p_JWID * p_Np];
 
           // compute 1D derivatives
           dfloat ur = 0.f, us = 0.f, ut = 0.f;
           dfloat vr = 0.f, vs = 0.f, vt = 0.f;
           dfloat wr = 0.f, ws = 0.f, wt = 0.f;
-          for(int m=0;m<p_Nq;++m){
+          for(int m = 0; m < p_Nq; ++m) {
             const dfloat Dim = s_D[i][m]; // Dr
             const dfloat Djm = s_D[j][m]; // Ds
             const dfloat Dkm = s_D[k][m]; // Dt
 
-            ur += Dim*s_U[j][m];
-            us += Djm*s_U[m][i];
-            ut += Dkm*s_Uloc[m];
+            ur += Dim * s_U[j][m];
+            us += Djm * s_U[m][i];
+            ut += Dkm * s_Uloc[m];
             //
-            vr += Dim*s_V[j][m];
-            vs += Djm*s_V[m][i];
-            vt += Dkm*s_Vloc[m];
+            vr += Dim * s_V[j][m];
+            vs += Djm * s_V[m][i];
+            vt += Dkm * s_Vloc[m];
             //
-            wr += Dim*s_W[j][m];
-            ws += Djm*s_W[m][i];
-            wt += Dkm*s_Wloc[m];
+            wr += Dim * s_W[j][m];
+            ws += Djm * s_W[m][i];
+            wt += Dkm * s_Wloc[m];
           }
 
-          const dfloat dudx = rx*ur + sx*us + tx*ut; 
-          const dfloat dudy = ry*ur + sy*us + ty*ut; 
-          const dfloat dudz = rz*ur + sz*us + tz*ut; 
+          const dfloat dudx = rx * ur + sx * us + tx * ut;
+          const dfloat dudy = ry * ur + sy * us + ty * ut;
+          const dfloat dudz = rz * ur + sz * us + tz * ut;
 
-          const dfloat dvdx = rx*vr + sx*vs + tx*vt; 
-          const dfloat dvdy = ry*vr + sy*vs + ty*vt; 
-          const dfloat dvdz = rz*vr + sz*vs + tz*vt; 
+          const dfloat dvdx = rx * vr + sx * vs + tx * vt;
+          const dfloat dvdy = ry * vr + sy * vs + ty * vt;
+          const dfloat dvdz = rz * vr + sz * vs + tz * vt;
 
-          const dfloat dwdx = rx*wr + sx*ws + tx*wt; 
-          const dfloat dwdy = ry*wr + sy*ws + ty*wt; 
-          const dfloat dwdz = rz*wr + sz*ws + tz*wt; 
+          const dfloat dwdx = rx * wr + sx * ws + tx * wt;
+          const dfloat dwdy = ry * wr + sy * ws + ty * wt;
+          const dfloat dwdz = rz * wr + sz * ws + tz * wt;
 
-          s11 = JW*(dudx + dudx); 
-          s12 = JW*(dudy + dvdx); 
-          s13 = JW*(dudz + dwdx); 
+          s11 = JW * (dudx + dudx);
+          s12 = JW * (dudy + dvdx);
+          s13 = JW * (dudz + dwdx);
 
-          s21 = JW*(dvdx + dudy); 
-          s22 = JW*(dvdy + dvdy); 
-          s23 = JW*(dvdz + dwdy); 
+          s21 = JW * (dvdx + dudy);
+          s22 = JW * (dvdy + dvdy);
+          s23 = JW * (dvdz + dwdy);
 
-          s31 = JW*(dwdx + dudz); 
-          s32 = JW*(dwdy + dvdz); 
-          s33 = JW*(dwdz + dwdz); 
-          const dlong id = e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;
+          s31 = JW * (dwdx + dudz);
+          s32 = JW * (dwdy + dvdz);
+          s33 = JW * (dwdz + dwdz);
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
 
-          r_Au[k] =  lambda[id + 1*offset + 0*loffset]*JW*s_U[j][i]; 
-          r_Av[k] =  lambda[id + 1*offset + 1*loffset]*JW*s_V[j][i];
-          r_Aw[k] =  lambda[id + 1*offset + 2*loffset]*JW*s_W[j][i];
+          r_Au[k] =  lambda[id + 1 * offset + 0 * loffset] * JW * s_U[j][i];
+          r_Av[k] =  lambda[id + 1 * offset + 1 * loffset] * JW * s_V[j][i];
+          r_Aw[k] =  lambda[id + 1 * offset + 2 * loffset] * JW * s_W[j][i];
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-         s_SUr[j][i] =  rx*s11 + ry*s12 + rz*s13;
-         s_SUs[j][i] =  sx*s11 + sy*s12 + sz*s13;
-         s_SUtloc[k] =  tx*s11 + ty*s12 + tz*s13;
-         //
-         s_SVr[j][i] =  rx*s21 + ry*s22 + rz*s23;
-         s_SVs[j][i] =  sx*s21 + sy*s22 + sz*s23;
-         s_SVt[k] =  tx*s21 + ty*s22 + tz*s23;
-         //
-         s_SWr[j][i] =  rx*s31 + ry*s32 + rz*s33;
-         s_SWs[j][i] =  sx*s31 + sy*s32 + sz*s33;
-         s_SWt[k] =  tx*s31 + ty*s32 + tz*s33;
-
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+          s_SUr[j][i] =  rx * s11 + ry * s12 + rz * s13;
+          s_SUs[j][i] =  sx * s11 + sy * s12 + sz * s13;
+          s_SUtloc[k] =  tx * s11 + ty * s12 + tz * s13;
+          //
+          s_SVr[j][i] =  rx * s21 + ry * s22 + rz * s23;
+          s_SVs[j][i] =  sx * s21 + sy * s22 + sz * s23;
+          s_SVt[k] =  tx * s21 + ty * s22 + tz * s23;
+          //
+          s_SWr[j][i] =  rx * s31 + ry * s32 + rz * s33;
+          s_SWs[j][i] =  sx * s31 + sy * s32 + sz * s33;
+          s_SWt[k] =  tx * s31 + ty * s32 + tz * s33;
         }
-      }
       @barrier("local");
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-          #pragma unroll p_Nq
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dim = s_D[m][i]; // Dr'
             const dfloat Djm = s_D[m][j]; // Ds'
 
-            r_Au[k] += Dim*s_SUr[j][m];
-            r_Au[k] += Djm*s_SUs[m][i];
+            r_Au[k] += Dim * s_SUr[j][m];
+            r_Au[k] += Djm * s_SUs[m][i];
 
-            r_Av[k] += Dim*s_SVr[j][m];
-            r_Av[k] += Djm*s_SVs[m][i];
+            r_Av[k] += Dim * s_SVr[j][m];
+            r_Av[k] += Djm * s_SVs[m][i];
 
-            r_Aw[k] += Dim*s_SWr[j][m];
-            r_Aw[k] += Djm*s_SWs[m][i];
+            r_Aw[k] += Dim * s_SWr[j][m];
+            r_Aw[k] += Djm * s_SWs[m][i];
           }
         }
       }
     }
-    for(int k=0;k<p_Nq;++k){ 
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
-          #pragma unroll p_Nq
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
+#pragma unroll p_Nq
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dkm = s_D[m][k]; // Dt'
 
-            r_Au[k] += Dkm*s_SUtloc[m];
+            r_Au[k] += Dkm * s_SUtloc[m];
 
-            r_Av[k] += Dkm*s_SVt[m];
+            r_Av[k] += Dkm * s_SVt[m];
 
-            r_Aw[k] += Dkm*s_SWt[m];
+            r_Aw[k] += Dkm * s_SWt[m];
           }
-          const dlong id = element*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
-          Aq[id+0*offset] = r_Au[k];
-          Aq[id+1*offset] = r_Av[k];
-          Aq[id+2*offset] = r_Aw[k];
-        
+          const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          Aq[id + 0 * offset] = r_Au[k];
+          Aq[id + 1 * offset] = r_Av[k];
+          Aq[id + 2 * offset] = r_Aw[k];
         }
       }
     }
diff --git a/src/libP/solvers/elliptic/okl/ellipticBlockCoefficientHex3D.okl b/okl/elliptic/ellipticBlockCoefficientHex3D.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticBlockCoefficientHex3D.okl
rename to okl/elliptic/ellipticBlockCoefficientHex3D.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticBlockJacobiPrecon.okl b/okl/elliptic/ellipticBlockJacobiPrecon.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticBlockJacobiPrecon.okl
rename to okl/elliptic/ellipticBlockJacobiPrecon.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticBuildDiagonalHex3D.okl b/okl/elliptic/ellipticBuildDiagonalHex3D.okl
similarity index 94%
rename from src/libP/solvers/elliptic/okl/ellipticBuildDiagonalHex3D.okl
rename to okl/elliptic/ellipticBuildDiagonalHex3D.okl
index 8c01e4b98..9f0784f6b 100644
--- a/src/libP/solvers/elliptic/okl/ellipticBuildDiagonalHex3D.okl
+++ b/okl/elliptic/ellipticBuildDiagonalHex3D.okl
@@ -47,9 +47,9 @@
     @exclusive dfloat s_lambdat[p_Nq];
 
     // prefetch lamda 0
-    #pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; ++k){
-      for(int j = 0; j < p_Nq; ++j; @inner(1)){
+#pragma unroll p_Nq
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           const dlong id    = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
           const dlong base = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
@@ -60,9 +60,9 @@
           s_lambda0[j][i] = lambda[id + 0 * offset];
           s_Grr[j][i]     = ggeo[base + p_G00ID * p_Np];
           s_Gss[j][i]     = ggeo[base + p_G11ID * p_Np];
-          if(k==0){
-            #pragma unroll p_Nq
-            for(int l = 0 ; l < p_Nq; ++l){
+          if(k == 0) {
+#pragma unroll p_Nq
+            for(int l = 0; l < p_Nq; ++l) {
               const dlong other_base = e * p_Nggeo * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
               const dlong other_id    = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
               s_Gtt[l]     = ggeo[other_base + p_G22ID * p_Np];
@@ -74,7 +74,7 @@
       @barrier("local");
 
       // loop over slabs
-      for(int j = 0; j < p_Nq; ++j; @inner(1)){
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           const dlong id          = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
           dfloat r_q              = 1.0f;
@@ -109,7 +109,6 @@
           }
           Aq[id] = r_q;
         }
-      }
       @barrier("local");
     }
   }
@@ -140,9 +139,9 @@
     @exclusive int r_masked;
 
     // prefetch lamda 0
-    #pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; ++k){
-      for(int j = 0; j < p_Nq; ++j; @inner(1)){
+#pragma unroll p_Nq
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           const dlong id    = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
           const dlong base = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
@@ -154,8 +153,8 @@
           s_Grr[j][i]     = ggeo[base + p_G00ID * p_Np];
           s_Gss[j][i]     = ggeo[base + p_G11ID * p_Np];
           if( k == 0 ) {
-            #pragma unroll p_Nq
-            for(int l = 0 ; l < p_Nq; ++l){
+#pragma unroll p_Nq
+            for(int l = 0; l < p_Nq; ++l) {
               const dlong other_base = e * p_Nggeo * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
               const dlong other_id    = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
               s_Gtt[l]     = ggeo[other_base + p_G22ID * p_Np];
@@ -170,7 +169,7 @@
         }
       }
       @barrier("local");
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           const dlong id   = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
           if(r_masked) {
@@ -202,7 +201,6 @@
           }
           Aq[id] = r_q;
         }
-      }
     }
   }
 }
@@ -228,9 +226,9 @@
     @exclusive dfloat s_Gtt[p_Nq];
     @exclusive dfloat s_lambdat[p_eNfields][p_Nq];
     // prefetch lamda 0
-    #pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; ++k){
-      for(int j = 0; j < p_Nq; ++j; @inner(1)){
+#pragma unroll p_Nq
+    for(int k = 0; k < p_Nq; ++k) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           const dlong id    = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
           const dlong base = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
@@ -244,8 +242,8 @@
           for(int l = 0; l < p_eNfields; l++)
             s_lambda0[l][j][i] = lambda[id + 0 * offset + l * loffset];
           if( k == 0 ) {
-            #pragma unroll p_Nq
-            for(int l = 0 ; l < p_Nq; ++l){
+#pragma unroll p_Nq
+            for(int l = 0; l < p_Nq; ++l) {
               const dlong other_base = e * p_Nggeo * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
               const dlong other_id    = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i;
               s_Gtt[l]     = ggeo[other_base + p_G22ID * p_Np];
@@ -258,7 +256,7 @@
 
       @barrier("local");
 
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
+      for(int j = 0; j < p_Nq; ++j; @inner(1))
         for(int i = 0; i < p_Nq; ++i; @inner(0)) {
           const dlong id          = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
           dfloat r_q              = 1.0f;
@@ -296,7 +294,6 @@
             Aq[id + l * offset] = r_q;
           }
         }
-      }
     }
   }
 }
diff --git a/src/libP/solvers/elliptic/okl/ellipticCoefficientHex3D.okl b/okl/elliptic/ellipticCoefficientHex3D.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticCoefficientHex3D.okl
rename to okl/elliptic/ellipticCoefficientHex3D.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticCubatureAxHex3D.okl b/okl/elliptic/ellipticCubatureAxHex3D.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticCubatureAxHex3D.okl
rename to okl/elliptic/ellipticCubatureAxHex3D.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticDivGradHex3D.okl b/okl/elliptic/ellipticDivGradHex3D.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticDivGradHex3D.okl
rename to okl/elliptic/ellipticDivGradHex3D.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticDoubleReductionPCG.okl b/okl/elliptic/ellipticDoubleReductionPCG.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticDoubleReductionPCG.okl
rename to okl/elliptic/ellipticDoubleReductionPCG.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticGradientHex3D.okl b/okl/elliptic/ellipticGradientHex3D.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticGradientHex3D.okl
rename to okl/elliptic/ellipticGradientHex3D.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticPatchSolver.okl b/okl/elliptic/ellipticPatchSolver.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticPatchSolver.okl
rename to okl/elliptic/ellipticPatchSolver.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenHex3D.okl b/okl/elliptic/ellipticPreconCoarsenHex3D.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticPreconCoarsenHex3D.okl
rename to okl/elliptic/ellipticPreconCoarsenHex3D.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconProlongateHex3D.okl b/okl/elliptic/ellipticPreconProlongateHex3D.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticPreconProlongateHex3D.okl
rename to okl/elliptic/ellipticPreconProlongateHex3D.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl b/okl/elliptic/ellipticResidualProjection.okl
similarity index 54%
rename from src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl
rename to okl/elliptic/ellipticResidualProjection.okl
index 2c3b3eb80..b54e39962 100644
--- a/src/libP/solvers/elliptic/okl/ellipticResidualProjection.okl
+++ b/okl/elliptic/ellipticResidualProjection.okl
@@ -33,22 +33,21 @@
     if(n < N)
       x[n + offset] = alpha * x[n + offset];
 }
+
 @kernel void multiScaledAddwOffset(const dlong N,
-                              const dlong m,
-                              const dlong destOffset,
-                              const dlong fieldOffset,
-                              @restrict const dfloat * alphas,
-                              const dfloat beta,
-                              @restrict dfloat*  x)
+                                   const dlong m,
+                                   const dlong destOffset,
+                                   const dlong fieldOffset,
+                                   @restrict const dfloat* alphas,
+                                   const dfloat beta,
+                                   @restrict dfloat*  x)
 {
-  for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner)){
-    if(n < N){
-      for(dlong k = 0; k < m-1; ++k){
+  for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
+    if(n < N)
+      for(dlong k = 0; k < m - 1; ++k)
         x[n + destOffset] = -alphas[k] * x[n + k * fieldOffset] + beta * x[n + destOffset];
-      }
-    }
-  }
 }
+
 @kernel void accumulate(const dlong N,
                         const dlong m,
                         const dlong fieldOffset,
@@ -67,54 +66,53 @@
 }
 
 @kernel void multiWeightedInnerProduct2(const dlong N,
-                                   const dlong fieldOffset,
-                                   const dlong Nblock,
-                                   const dlong m,
-                                   const dlong offset,
-                                   @restrict const dfloat*  w,
-                                   @restrict const dfloat*  x,
-                                   @restrict const dfloat*  y,
-                                   @restrict dfloat*  wxy)
+                                        const dlong fieldOffset,
+                                        const dlong Nblock,
+                                        const dlong m,
+                                        const dlong offset,
+                                        @restrict const dfloat*  w,
+                                        @restrict const dfloat*  x,
+                                        @restrict const dfloat*  y,
+                                        @restrict dfloat*  wxy)
 {
   for(dlong b = 0; b < (N + p_threadBlockSize - 1) / p_threadBlockSize; ++b; @outer(0)) {
     @shared volatile dfloat s_wxy[p_threadBlockSize];
     @exclusive dfloat w_loc;
     @exclusive dfloat y_loc;
 
-    for(int v = 0 ; v < m; ++v){
-
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) {
-      const dlong id = t + p_threadBlockSize * b;
-      if(v == 0 && id < N){
-        w_loc = w[id];
-        y_loc = y[id+offset];
+    for(int v = 0; v < m; ++v) {
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) {
+        const dlong id = t + p_threadBlockSize * b;
+        if(v == 0 && id < N) {
+          w_loc = w[id];
+          y_loc = y[id + offset];
+        }
+        s_wxy[t] = (id < N) ? w_loc * x[id + fieldOffset * v] * y_loc : 0.f;
       }
-      s_wxy[t] = (id < N) ? w_loc * x[id + fieldOffset * v] * y_loc : 0.f;
-    }
 
-    @barrier("local");
+      @barrier("local");
 #if p_threadBlockSize > 512
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 512) s_wxy[t] += s_wxy[t + 512];
-    @barrier("local");
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 512) s_wxy[t] += s_wxy[t + 512];
+      @barrier("local");
 #endif
 #if p_threadBlockSize > 256
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 256) s_wxy[t] += s_wxy[t + 256];
-    @barrier("local");
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 256) s_wxy[t] += s_wxy[t + 256];
+      @barrier("local");
 #endif
 
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 128) s_wxy[t] += s_wxy[t + 128];
-    @barrier("local");
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 128) s_wxy[t] += s_wxy[t + 128];
+      @barrier("local");
 
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 64) s_wxy[t] += s_wxy[t + 64];
-    @barrier("local");
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 64) s_wxy[t] += s_wxy[t + 64];
+      @barrier("local");
 
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 32) s_wxy[t] += s_wxy[t + 32];
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 16) s_wxy[t] += s_wxy[t + 16];
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  8) s_wxy[t] += s_wxy[t + 8];
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  4) s_wxy[t] += s_wxy[t + 4];
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  2) s_wxy[t] += s_wxy[t + 2];
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 32) s_wxy[t] += s_wxy[t + 32];
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t < 16) s_wxy[t] += s_wxy[t + 16];
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  8) s_wxy[t] += s_wxy[t + 8];
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  4) s_wxy[t] += s_wxy[t + 4];
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  2) s_wxy[t] += s_wxy[t + 2];
 
-    for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  1) wxy[b+v*Nblock] = s_wxy[0] + s_wxy[1];
-  }
+      for(int t = 0; t < p_threadBlockSize; ++t; @inner(0)) if(t <  1) wxy[b + v * Nblock] = s_wxy[0] + s_wxy[1];
+    }
   }
 }
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/okl/ellipticSEMFEMAnterp.okl b/okl/elliptic/ellipticSEMFEMAnterp.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticSEMFEMAnterp.okl
rename to okl/elliptic/ellipticSEMFEMAnterp.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticSEMFEMInterp.okl b/okl/elliptic/ellipticSEMFEMInterp.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticSEMFEMInterp.okl
rename to okl/elliptic/ellipticSEMFEMInterp.okl
diff --git a/src/libP/solvers/elliptic/okl/ellipticSchwarzSolverHex3D.okl b/okl/elliptic/ellipticSchwarzSolverHex3D.okl
similarity index 87%
rename from src/libP/solvers/elliptic/okl/ellipticSchwarzSolverHex3D.okl
rename to okl/elliptic/ellipticSchwarzSolverHex3D.okl
index b135f3769..d2f3f1153 100644
--- a/src/libP/solvers/elliptic/okl/ellipticSchwarzSolverHex3D.okl
+++ b/okl/elliptic/ellipticSchwarzSolverHex3D.okl
@@ -9,9 +9,9 @@
 {
   for(dlong n = 0; n < N; ++n; @tile(p_threadBlockSize,@outer,@inner))
     if(n < N)
-      result[n] = static_cast < pfloat > (w[n] * v[n]);
-
+      result[n] = w[n] * v[n];
 }
+
 @kernel void preFDM(dlong Nelements,
                     @restrict const pfloat* u,
                     @restrict pfloat* work1)
@@ -26,17 +26,14 @@
       }
     }
     @barrier("local");
-    for(int k = 0; k < p_Nq; ++k) {
-      for(int j = 0; j < p_Nq_e; ++j; @inner) {
-        for(int i = 0; i < p_Nq_e; ++i; @inner) {
+    for(int k = 0; k < p_Nq; ++k)
+      for(int j = 0; j < p_Nq_e; ++j; @inner)
+        for(int i = 0; i < p_Nq_e; ++i; @inner)
           if(i < p_Nq && j < p_Nq) {
             const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq;
             const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset;
             sWork1[k + 1][j + 1][i + 1] = u[idx];
           }
-        }
-      }
-    }
     @barrier("local");
     for(int k = 0; k < p_Nq_e; ++k; @inner)
       for(int j = 0; j < p_Nq_e; ++j; @inner)
@@ -65,36 +62,33 @@
           sWork1[i][j][p_Nq_e - l1 - 1] = sWork1[i][j][p_Nq_e - l2 - 1];
         }
     @barrier("local");
-    for(int k = 0; k < p_Nq_e; ++k) {
-      for(int j = 0; j < p_Nq_e; ++j; @inner) {
+    for(int k = 0; k < p_Nq_e; ++k)
+      for(int j = 0; j < p_Nq_e; ++j; @inner)
         for(int i = 0; i < p_Nq_e; ++i; @inner) {
           const dlong elem_offset = p_Nq_e * p_Nq_e * p_Nq_e * elem;
           const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset;
           work1[idx] = sWork1[k][j][i];
         }
-      }
-    }
   }
 }
+
 @kernel void postFDM(dlong Nelements,
                      @restrict pfloat* my_work1,
                      @restrict pfloat* my_work2,
                      @restrict pfloat* Su,
-                     @restrict const pfloat * wts)
+                     @restrict const pfloat* wts)
 {
   for (dlong elem = 0; elem < Nelements; ++elem; @outer) {
     @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e];
     @shared pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e];
-    for(int k = 0; k < p_Nq_e; ++k) {
-      for(int j = 0; j < p_Nq_e; ++j; @inner) {
+    for(int k = 0; k < p_Nq_e; ++k)
+      for(int j = 0; j < p_Nq_e; ++j; @inner)
         for(int i = 0; i < p_Nq_e; ++i; @inner) {
           const dlong elem_offset = elem * p_Nq_e * p_Nq_e * p_Nq_e;
           const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset;
           work1[k][j][i] = my_work2[idx];
           work2[k][j][i] = my_work1[idx];
         }
-      }
-    }
     @barrier("local");
     for(int k = 0; k < p_Nq_e; ++k; @inner)
       for(int j = 0; j < p_Nq_e; ++j; @inner)
@@ -156,34 +150,32 @@
                                          work1[i][j][p_Nq_e - l2 - 1];
         }
     @barrier("local");
-    for(int k = 0; k < p_Nq; ++k) {
-      for(int j = 0; j < p_Nq_e; ++j; @inner) {
-        for(int i = 0; i < p_Nq_e; ++i; @inner) {
+    for(int k = 0; k < p_Nq; ++k)
+      for(int j = 0; j < p_Nq_e; ++j; @inner)
+        for(int i = 0; i < p_Nq_e; ++i; @inner)
           if(i < p_Nq && j < p_Nq) {
             const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq;
             const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset;
             Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx];
           }
-        }
-      }
-    }
   }
 }
+
 @kernel void fusedFDM(
-                      dlong Nelements,
-                      dlong localNelements,
-                      @restrict const dlong*  elementList,
-                      @restrict pfloat* Su,
-                      @restrict const pfloat* S_x,
-                      @restrict const pfloat* S_y,
-                      @restrict const pfloat* S_z,
-                      @restrict const pfloat* inv_L,
-                      @restrict pfloat* u
-                      #if p_restrict
-                      ,
-                      @restrict const dfloat* wts
-                      #endif
-                      )
+  dlong Nelements,
+  dlong localNelements,
+  @restrict const dlong*  elementList,
+  @restrict pfloat* Su,
+  @restrict const pfloat* S_x,
+  @restrict const pfloat* S_y,
+  @restrict const pfloat* S_z,
+  @restrict const pfloat* inv_L,
+  @restrict pfloat* u
+#if p_restrict
+  ,
+  @restrict const dfloat* wts
+#endif
+  )
 {
 #if p_overlap
   for (dlong my_elem = 0; my_elem < localNelements; ++my_elem; @outer) {
@@ -199,14 +191,15 @@
     @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e];
     @shared pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e];
     @exclusive dlong element;
+
     for(int k = 0; k < p_Nq_e; ++k) {
       for(int j = 0; j < p_Nq_e; ++j; @inner) {
         for(int i = 0; i < p_Nq_e; ++i; @inner) {
-          #if p_overlap
+#if p_overlap
           element = elementList[my_elem];
-          #else
+#else
           element = my_elem;
-          #endif
+#endif
           const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e;
           const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset;
           work1[k][j][i] = u[idx];
@@ -259,7 +252,6 @@
       for (int j = 0; j < p_Nq_e; j++; @inner) {
 #pragma unroll
         for (int i = 0; i < p_Nq_e; i++) {
-          const pfloat* u_e = u + element * p_Nq_e * p_Nq_e * p_Nq_e;
           pfloat value = 0.0;
 #pragma unroll
           for (int l = 0; l < p_Nq_e; l++)
@@ -330,9 +322,9 @@
             value += S_z_e[k][l] * work2[j][i][l];
 
 #if (!p_restrict)
-          const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e;
-          pfloat* Su_e = Su + element * p_Nq_e * p_Nq_e * p_Nq_e;
-          Su_e[v] = value;
+          const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e;
+          const dlong v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; // dlong: index includes the element offset
+          Su[v] = value;
 #endif
           work1[k][j][i] = value;
         }
@@ -367,28 +359,25 @@
           work2[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l2 - 1];
         }
     @barrier("local");
-    for(int k = 0; k < p_Nq_e; ++k) {
-      for(int j = 0; j < p_Nq_e; ++j; @inner) {
+    for(int k = 0; k < p_Nq_e; ++k)
+      for(int j = 0; j < p_Nq_e; ++j; @inner)
         for(int i = 0; i < p_Nq_e; ++i; @inner) {
           const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e;
           const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset;
           u[idx] = work2[k][j][i];
         }
-      }
-    }
+
 #else  /* if (!p_restrict) */
     @barrier("local");
-    for(int k = 0; k < p_Nq; ++k) {
-      for(int j = 0; j < p_Nq_e; ++j; @inner) {
-        for(int i = 0; i < p_Nq_e; ++i; @inner) {
+    for(int k = 0; k < p_Nq; ++k)
+      for(int j = 0; j < p_Nq_e; ++j; @inner)
+        for(int i = 0; i < p_Nq_e; ++i; @inner)
           if(i < p_Nq && j < p_Nq) {
             const dlong elem_offset = element * p_Nq * p_Nq * p_Nq;
             const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset;
             Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx];
           }
-        }
-      }
-    }
+
 #endif
   }
 }
diff --git a/src/libP/solvers/elliptic/okl/ellipticSerialAxHex3D.c b/okl/elliptic/ellipticSerialAxHex3D.c
similarity index 72%
rename from src/libP/solvers/elliptic/okl/ellipticSerialAxHex3D.c
rename to okl/elliptic/ellipticSerialAxHex3D.c
index 3038bb9f9..a33e58fa8 100644
--- a/src/libP/solvers/elliptic/okl/ellipticSerialAxHex3D.c
+++ b/okl/elliptic/ellipticSerialAxHex3D.c
@@ -117,6 +117,7 @@ void ellipticAxHex3D(const dlong & Nelements,
         }
   }
 }
+
 extern "C"
 void ellipticAxVarHex3D(const dlong & Nelements,
                         const dlong & offset,
@@ -141,7 +142,6 @@ void ellipticAxVarHex3D(const dlong & Nelements,
       s_S[j][i] = S[j * p_Nq + i];
     }
 
-
   for(dlong e = 0; e < Nelements; ++e) {
     const dlong element = e;
 
@@ -363,183 +363,166 @@ void ellipticBlockAxVarHex3D_N3(const dlong & Nelements,
 //
 extern "C"
 void ellipticStressAxVarHex3D(const dlong &Nelements,
-                                 const dlong &offset,
-                                 const dlong &loffset,
-                                 const dfloat * __restrict__ vgeo,
-                                 const dfloat * __restrict__ D,
-                                 const dfloat * __restrict__ S,
-                                 const dfloat * __restrict__ lambda,
-                                 const dfloat * __restrict__ q,
-                                 dfloat * __restrict__ Aq){
-
-    dfloat s_D[p_Nq][p_Nq];
-
-    dfloat s_U[p_Nq][p_Nq][p_Nq];
-    dfloat s_V[p_Nq][p_Nq][p_Nq];
-    dfloat s_W[p_Nq][p_Nq][p_Nq];
-
-    dfloat s_SUr[p_Nq][p_Nq][p_Nq];
-    dfloat s_SUs[p_Nq][p_Nq][p_Nq];
-    dfloat s_SUt[p_Nq][p_Nq][p_Nq];
-
-    dfloat s_SVr[p_Nq][p_Nq][p_Nq];
-    dfloat s_SVs[p_Nq][p_Nq][p_Nq];
-    dfloat s_SVt[p_Nq][p_Nq][p_Nq];
-
-    dfloat s_SWr[p_Nq][p_Nq][p_Nq];
-    dfloat s_SWs[p_Nq][p_Nq][p_Nq];
-    dfloat s_SWt[p_Nq][p_Nq][p_Nq];
-
-    for(int j=0;j<p_Nq;++j){
-      for(int i=0;i<p_Nq;++i){
-      s_D[j][i] = D[j*p_Nq+i];
-    }
-  }
-    
+                              const dlong &offset,
+                              const dlong &loffset,
+                              const dfloat* __restrict__ vgeo,
+                              const dfloat* __restrict__ D,
+                              const dfloat* __restrict__ S,
+                              const dfloat* __restrict__ lambda,
+                              const dfloat* __restrict__ q,
+                              dfloat* __restrict__ Aq)
+{
+  dfloat s_D[p_Nq][p_Nq];
 
-for(dlong e=0; e<Nelements; ++e){
+  dfloat s_U[p_Nq][p_Nq][p_Nq];
+  dfloat s_V[p_Nq][p_Nq][p_Nq];
+  dfloat s_W[p_Nq][p_Nq][p_Nq];
 
-    for(int k=0;k<p_Nq;++k){ 
-      for(int j=0;j<p_Nq;++j){
-        for(int i=0;i<p_Nq;++i){
-            const dlong id = e*p_Np+k*p_Nq*p_Nq+j*p_Nq+i;
-            s_U[k][j][i] = q[id + 0*offset];
-            s_V[k][j][i] = q[id + 1*offset];
-            s_W[k][j][i] = q[id + 2*offset];
-        }
-      }
-    }
-    
+  dfloat s_SUr[p_Nq][p_Nq][p_Nq];
+  dfloat s_SUs[p_Nq][p_Nq][p_Nq];
+  dfloat s_SUt[p_Nq][p_Nq][p_Nq];
 
+  dfloat s_SVr[p_Nq][p_Nq][p_Nq];
+  dfloat s_SVs[p_Nq][p_Nq][p_Nq];
+  dfloat s_SVt[p_Nq][p_Nq][p_Nq];
+
+  dfloat s_SWr[p_Nq][p_Nq][p_Nq];
+  dfloat s_SWs[p_Nq][p_Nq][p_Nq];
+  dfloat s_SWt[p_Nq][p_Nq][p_Nq];
+
+  for(int j = 0; j < p_Nq; ++j)
+    for(int i = 0; i < p_Nq; ++i)
+      s_D[j][i] = D[j * p_Nq + i];
+
+  for(dlong e = 0; e < Nelements; ++e) {
+    for(int k = 0; k < p_Nq; ++k)
+      for(int j = 0; j < p_Nq; ++j)
+        for(int i = 0; i < p_Nq; ++i) {
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          s_U[k][j][i] = q[id + 0 * offset];
+          s_V[k][j][i] = q[id + 1 * offset];
+          s_W[k][j][i] = q[id + 2 * offset];
+        }
 
     // loop over slabs
-     for(int k=0;k<p_Nq;++k){ 
-      for(int j=0;j<p_Nq;++j){
-        for(int i=0;i<p_Nq;++i){   
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          const dfloat rx = vgeo[gid + p_RXID*p_Np];
-          const dfloat ry = vgeo[gid + p_RYID*p_Np];
-          const dfloat rz = vgeo[gid + p_RZID*p_Np];
-          
-          const dfloat sx = vgeo[gid + p_SXID*p_Np];
-          const dfloat sy = vgeo[gid + p_SYID*p_Np];
-          const dfloat sz = vgeo[gid + p_SZID*p_Np];
-          
-          const dfloat tx = vgeo[gid + p_TXID*p_Np];
-          const dfloat ty = vgeo[gid + p_TYID*p_Np];
-          const dfloat tz = vgeo[gid + p_TZID*p_Np];
-          
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+    for(int k = 0; k < p_Nq; ++k)
+      for(int j = 0; j < p_Nq; ++j)
+        for(int i = 0; i < p_Nq; ++i) {
+          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo;
+          const dfloat rx = vgeo[gid + p_RXID * p_Np];
+          const dfloat ry = vgeo[gid + p_RYID * p_Np];
+          const dfloat rz = vgeo[gid + p_RZID * p_Np];
+
+          const dfloat sx = vgeo[gid + p_SXID * p_Np];
+          const dfloat sy = vgeo[gid + p_SYID * p_Np];
+          const dfloat sz = vgeo[gid + p_SZID * p_Np];
+
+          const dfloat tx = vgeo[gid + p_TXID * p_Np];
+          const dfloat ty = vgeo[gid + p_TYID * p_Np];
+          const dfloat tz = vgeo[gid + p_TZID * p_Np];
+
+          const dfloat JW = vgeo[gid + p_JWID * p_Np];
 
           // compute 1D derivatives
           dfloat ur = 0.f, us = 0.f, ut = 0.f;
           dfloat vr = 0.f, vs = 0.f, vt = 0.f;
           dfloat wr = 0.f, ws = 0.f, wt = 0.f;
-          for(int m=0;m<p_Nq;++m){
+          for(int m = 0; m < p_Nq; ++m) {
             const dfloat Dim = s_D[i][m]; // Dr
             const dfloat Djm = s_D[j][m]; // Ds
             const dfloat Dkm = s_D[k][m]; // Dt
 
-            ur += Dim*s_U[k][j][m];
-            us += Djm*s_U[k][m][i];
-            ut += Dkm*s_U[m][j][i];
+            ur += Dim * s_U[k][j][m];
+            us += Djm * s_U[k][m][i];
+            ut += Dkm * s_U[m][j][i];
             //
-            vr += Dim*s_V[k][j][m];
-            vs += Djm*s_V[k][m][i];
-            vt += Dkm*s_V[m][j][i];
+            vr += Dim * s_V[k][j][m];
+            vs += Djm * s_V[k][m][i];
+            vt += Dkm * s_V[m][j][i];
             //
-            wr += Dim*s_W[k][j][m];
-            ws += Djm*s_W[k][m][i];
-            wt += Dkm*s_W[m][j][i];
+            wr += Dim * s_W[k][j][m];
+            ws += Djm * s_W[k][m][i];
+            wt += Dkm * s_W[m][j][i];
           }
 
-          const dlong id = e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;  
-          const dfloat u_lam0 = lambda[id + 0*offset + 0*loffset]; 
+          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset];
           // const dfloat u_lam1 = lambda[id + 1*offset + 0*loffset];
-          const dfloat v_lam0 = lambda[id + 0*offset + 1*loffset]; 
+          const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset];
           // const dfloat v_lam1 = lambda[id + 1*offset + 1*loffset];
-          const dfloat w_lam0 = lambda[id + 0*offset + 2*loffset]; 
+          const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset];
           // const dfloat w_lam1 = lambda[id + 1*offset + 2*loffset];
 
-         
-          const dfloat dudx = rx*ur + sx*us + tx*ut; 
-          const dfloat dudy = ry*ur + sy*us + ty*ut; 
-          const dfloat dudz = rz*ur + sz*us + tz*ut; 
+          const dfloat dudx = rx * ur + sx * us + tx * ut;
+          const dfloat dudy = ry * ur + sy * us + ty * ut;
+          const dfloat dudz = rz * ur + sz * us + tz * ut;
 
-          const dfloat dvdx = rx*vr + sx*vs + tx*vt; 
-          const dfloat dvdy = ry*vr + sy*vs + ty*vt; 
-          const dfloat dvdz = rz*vr + sz*vs + tz*vt; 
+          const dfloat dvdx = rx * vr + sx * vs + tx * vt;
+          const dfloat dvdy = ry * vr + sy * vs + ty * vt;
+          const dfloat dvdz = rz * vr + sz * vs + tz * vt;
 
-          const dfloat dwdx = rx*wr + sx*ws + tx*wt; 
-          const dfloat dwdy = ry*wr + sy*ws + ty*wt; 
-          const dfloat dwdz = rz*wr + sz*ws + tz*wt; 
+          const dfloat dwdx = rx * wr + sx * ws + tx * wt;
+          const dfloat dwdy = ry * wr + sy * ws + ty * wt;
+          const dfloat dwdz = rz * wr + sz * ws + tz * wt;
 
-          const dfloat s11 = u_lam0*JW*(dudx + dudx); 
-          const dfloat s12 = u_lam0*JW*(dudy + dvdx); 
-          const dfloat s13 = u_lam0*JW*(dudz + dwdx); 
+          const dfloat s11 = u_lam0 * JW * (dudx + dudx);
+          const dfloat s12 = u_lam0 * JW * (dudy + dvdx);
+          const dfloat s13 = u_lam0 * JW * (dudz + dwdx);
 
-          const dfloat s21 = v_lam0*JW*(dvdx + dudy); 
-          const dfloat s22 = v_lam0*JW*(dvdy + dvdy); 
-          const dfloat s23 = v_lam0*JW*(dvdz + dwdy); 
+          const dfloat s21 = v_lam0 * JW * (dvdx + dudy);
+          const dfloat s22 = v_lam0 * JW * (dvdy + dvdy);
+          const dfloat s23 = v_lam0 * JW * (dvdz + dwdy);
 
-          const dfloat s31 = w_lam0*JW*(dwdx + dudz); 
-          const dfloat s32 = w_lam0*JW*(dwdy + dvdz); 
-          const dfloat s33 = w_lam0*JW*(dwdz + dwdz); 
+          const dfloat s31 = w_lam0 * JW * (dwdx + dudz);
+          const dfloat s32 = w_lam0 * JW * (dwdy + dvdz);
+          const dfloat s33 = w_lam0 * JW * (dwdz + dwdz);
 
-          s_SUr[k][j][i] =  rx*s11 + ry*s12 + rz*s13;
-          s_SUs[k][j][i] =  sx*s11 + sy*s12 + sz*s13;
-          s_SUt[k][j][i] =  tx*s11 + ty*s12 + tz*s13;
+          s_SUr[k][j][i] =  rx * s11 + ry * s12 + rz * s13;
+          s_SUs[k][j][i] =  sx * s11 + sy * s12 + sz * s13;
+          s_SUt[k][j][i] =  tx * s11 + ty * s12 + tz * s13;
           //
-          s_SVr[k][j][i] =  rx*s21 + ry*s22 + rz*s23;
-          s_SVs[k][j][i] =  sx*s21 + sy*s22 + sz*s23;
-          s_SVt[k][j][i] =  tx*s21 + ty*s22 + tz*s23;
+          s_SVr[k][j][i] =  rx * s21 + ry * s22 + rz * s23;
+          s_SVs[k][j][i] =  sx * s21 + sy * s22 + sz * s23;
+          s_SVt[k][j][i] =  tx * s21 + ty * s22 + tz * s23;
           //
-          s_SWr[k][j][i] =  rx*s31 + ry*s32 + rz*s33;
-          s_SWs[k][j][i] =  sx*s31 + sy*s32 + sz*s33;
-          s_SWt[k][j][i] =  tx*s31 + ty*s32 + tz*s33;
-
-         
+          s_SWr[k][j][i] =  rx * s31 + ry * s32 + rz * s33;
+          s_SWs[k][j][i] =  sx * s31 + sy * s32 + sz * s33;
+          s_SWt[k][j][i] =  tx * s31 + ty * s32 + tz * s33;
         }
-      }
-    }
-
 
 // loop over slabs
-    for(int k=0;k<p_Nq;++k){ 
-      for(int j=0;j<p_Nq;++j){
-        for(int i=0;i<p_Nq;++i){
+    for(int k = 0; k < p_Nq; ++k)
+      for(int j = 0; j < p_Nq; ++j)
+        for(int i = 0; i < p_Nq; ++i) {
           dfloat r_Au = 0.f, r_Av = 0.f, r_Aw = 0.f;
           for(int m = 0; m < p_Nq; m++) {
             const dfloat Dim = s_D[m][i]; // Dr'
             const dfloat Djm = s_D[m][j]; // Ds'
             const dfloat Dkm = s_D[m][k]; // Dt'
 
-            r_Au += Dim*s_SUr[k][j][m];
-            r_Au += Djm*s_SUs[k][m][i];
-            r_Au += Dkm*s_SUt[m][j][i];
+            r_Au += Dim * s_SUr[k][j][m];
+            r_Au += Djm * s_SUs[k][m][i];
+            r_Au += Dkm * s_SUt[m][j][i];
 
-            r_Av += Dim*s_SVr[k][j][m];
-            r_Av += Djm*s_SVs[k][m][i];
-            r_Av += Dkm*s_SVt[m][j][i];
+            r_Av += Dim * s_SVr[k][j][m];
+            r_Av += Djm * s_SVs[k][m][i];
+            r_Av += Dkm * s_SVt[m][j][i];
 
-            r_Aw += Dim*s_SWr[k][j][m];
-            r_Aw += Djm*s_SWs[k][m][i];
-            r_Aw += Dkm*s_SWt[m][j][i];
+            r_Aw += Dim * s_SWr[k][j][m];
+            r_Aw += Djm * s_SWs[k][m][i];
+            r_Aw += Dkm * s_SWt[m][j][i];
           }
-          const dlong id      = e*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
-          const dfloat u_lam1 = lambda[id + 1*offset + 0*loffset];
-          const dfloat v_lam1 = lambda[id + 1*offset + 1*loffset];
-          const dfloat w_lam1 = lambda[id + 1*offset + 2*loffset];
-         
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
-           // store in register
-          Aq[id+0*offset] =  r_Au + u_lam1*JW*s_U[k][j][i]; 
-          Aq[id+1*offset] =  r_Av + v_lam1*JW*s_V[k][j][i];
-          Aq[id+2*offset] =  r_Aw + w_lam1*JW*s_W[k][j][i];
+          const dlong id      = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
+          const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset];
+          const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset];
+          const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset];
+
+          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo;
+          const dfloat JW = vgeo[gid + p_JWID * p_Np];
+          // write the result to the output array Aq
+          Aq[id + 0 * offset] =  r_Au + u_lam1 * JW * s_U[k][j][i];
+          Aq[id + 1 * offset] =  r_Av + v_lam1 * JW * s_V[k][j][i];
+          Aq[id + 2 * offset] =  r_Aw + w_lam1 * JW * s_W[k][j][i];
         }
-      }
-    }
   }
 }
-
diff --git a/src/libP/solvers/elliptic/okl/ellipticSerialUpdatePCG.c b/okl/elliptic/ellipticSerialUpdatePCG.c
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticSerialUpdatePCG.c
rename to okl/elliptic/ellipticSerialUpdatePCG.c
diff --git a/src/libP/solvers/elliptic/okl/ellipticUpdateNBFPCG.okl b/okl/elliptic/ellipticUpdateNBFPCG.okl
similarity index 99%
rename from src/libP/solvers/elliptic/okl/ellipticUpdateNBFPCG.okl
rename to okl/elliptic/ellipticUpdateNBFPCG.okl
index 541ae601b..9aecbbc41 100644
--- a/src/libP/solvers/elliptic/okl/ellipticUpdateNBFPCG.okl
+++ b/okl/elliptic/ellipticUpdateNBFPCG.okl
@@ -510,7 +510,6 @@
           w[id] = wn;
         }
 
-
       s_sum[0][t] = sumudotr;
       s_sum[1][t] = sumudots;
       s_sum[2][t] = sumudotw;
diff --git a/src/libP/solvers/elliptic/okl/ellipticUpdateNBPCG.okl b/okl/elliptic/ellipticUpdateNBPCG.okl
similarity index 99%
rename from src/libP/solvers/elliptic/okl/ellipticUpdateNBPCG.okl
rename to okl/elliptic/ellipticUpdateNBPCG.okl
index f0d9b7e8c..bb5f8b724 100644
--- a/src/libP/solvers/elliptic/okl/ellipticUpdateNBPCG.okl
+++ b/okl/elliptic/ellipticUpdateNBPCG.okl
@@ -337,6 +337,7 @@
     }
   }
 }
+
 @kernel void ellipticBlockUpdate2NBPCG(const dlong N,
                                        const dlong offset,
                                        const dlong Nblocks,
diff --git a/src/libP/solvers/elliptic/okl/ellipticUpdatePCG.okl b/okl/elliptic/ellipticUpdatePCG.okl
similarity index 100%
rename from src/libP/solvers/elliptic/okl/ellipticUpdatePCG.okl
rename to okl/elliptic/ellipticUpdatePCG.okl
diff --git a/src/libP/parAlmond/okl/SpMVcsr.okl b/okl/parAlmond/SpMVcsr.okl
similarity index 100%
rename from src/libP/parAlmond/okl/SpMVcsr.okl
rename to okl/parAlmond/SpMVcsr.okl
diff --git a/src/libP/parAlmond/okl/SpMVell.okl b/okl/parAlmond/SpMVell.okl
similarity index 100%
rename from src/libP/parAlmond/okl/SpMVell.okl
rename to okl/parAlmond/SpMVell.okl
diff --git a/src/libP/parAlmond/okl/SpMVmcsr.okl b/okl/parAlmond/SpMVmcsr.okl
similarity index 100%
rename from src/libP/parAlmond/okl/SpMVmcsr.okl
rename to okl/parAlmond/SpMVmcsr.okl
diff --git a/src/libP/parAlmond/okl/haloExtract.okl b/okl/parAlmond/haloExtract.okl
similarity index 100%
rename from src/libP/parAlmond/okl/haloExtract.okl
rename to okl/parAlmond/haloExtract.okl
diff --git a/src/libP/parAlmond/okl/kcycleCombinedOp.okl b/okl/parAlmond/kcycleCombinedOp.okl
similarity index 100%
rename from src/libP/parAlmond/okl/kcycleCombinedOp.okl
rename to okl/parAlmond/kcycleCombinedOp.okl
diff --git a/src/libP/parAlmond/okl/vectorAdd.okl b/okl/parAlmond/vectorAdd.okl
similarity index 100%
rename from src/libP/parAlmond/okl/vectorAdd.okl
rename to okl/parAlmond/vectorAdd.okl
diff --git a/src/libP/parAlmond/okl/vectorAddInnerProd.okl b/okl/parAlmond/vectorAddInnerProd.okl
similarity index 100%
rename from src/libP/parAlmond/okl/vectorAddInnerProd.okl
rename to okl/parAlmond/vectorAddInnerProd.okl
diff --git a/src/libP/parAlmond/okl/vectorAddScalar.okl b/okl/parAlmond/vectorAddScalar.okl
similarity index 100%
rename from src/libP/parAlmond/okl/vectorAddScalar.okl
rename to okl/parAlmond/vectorAddScalar.okl
diff --git a/src/libP/parAlmond/okl/vectorDotStar.okl b/okl/parAlmond/vectorDotStar.okl
similarity index 100%
rename from src/libP/parAlmond/okl/vectorDotStar.okl
rename to okl/parAlmond/vectorDotStar.okl
diff --git a/src/libP/parAlmond/okl/vectorInnerProd.okl b/okl/parAlmond/vectorInnerProd.okl
similarity index 100%
rename from src/libP/parAlmond/okl/vectorInnerProd.okl
rename to okl/parAlmond/vectorInnerProd.okl
diff --git a/src/libP/parAlmond/okl/vectorScale.okl b/okl/parAlmond/vectorScale.okl
similarity index 100%
rename from src/libP/parAlmond/okl/vectorScale.okl
rename to okl/parAlmond/vectorScale.okl
diff --git a/src/libP/parAlmond/okl/vectorSet.okl b/okl/parAlmond/vectorSet.okl
similarity index 100%
rename from src/libP/parAlmond/okl/vectorSet.okl
rename to okl/parAlmond/vectorSet.okl
diff --git a/scripts/nrsLaunchHelper.sh b/scripts/nrsLaunchHelper.sh
index b446cfb49..cef17196e 100755
--- a/scripts/nrsLaunchHelper.sh
+++ b/scripts/nrsLaunchHelper.sh
@@ -33,8 +33,8 @@ export CUDA_VISIBLE_DEVICES=$GPU
 export UCX_NET_DEVICES=$NIC:1
 export UCX_TLS=rc,sm,cuda
 #export UCX_TLS=rc,sm,rocm
-#export UCX_RNDV_SCHEME=put_zcopy
-#export UCX_RNDV_THRESH=1024
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=1024
 #export UCX_MEMTYPE_CACHE=n
 
 export OMPI_MCA_pml=ucx
diff --git a/scripts/uncrustify.cfg b/scripts/uncrustify.cfg
index 51511a48e..0340403d2 100644
--- a/scripts/uncrustify.cfg
+++ b/scripts/uncrustify.cfg
@@ -27,7 +27,7 @@ sp_balance_nested_parens=false
 align_keep_tabs=false
 align_with_tabs=false
 align_on_tabstop=false
-align_number_left=false
+#align_number_left=false
 align_func_params=false
 align_same_func_call_params=false
 align_var_def_colon=false
@@ -40,6 +40,7 @@ align_single_line_func=false
 align_single_line_brace=false
 align_nl_cont=false
 align_left_shift=true
+nl_after_func_body=2
 nl_collapse_empty_body=false
 nl_assign_leave_one_liners=false
 nl_class_leave_one_liners=true
diff --git a/src/core/cds.cpp b/src/cds/cds.cpp
similarity index 92%
rename from src/core/cds.cpp
rename to src/cds/cds.cpp
index 6e1a23a94..5ec43255e 100644
--- a/src/core/cds.cpp
+++ b/src/cds/cds.cpp
@@ -53,18 +53,21 @@ occa::memory cdsSolve(const int is, cds_t* cds, dfloat time)
                             mesh->o_y,
                             mesh->o_z,
                             cds->o_wrk0,
+                            cds->o_EToB[is],
                             cds->o_mapB[is],
                             *(cds->o_usrwrk),
                             cds->o_wrk1);
   oogs::startFinish(cds->o_wrk1, 1, cds->fieldOffset, ogsDfloat, ogsAdd, gsh);
   if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, cds->o_wrk1);
 
-  if(cds->options.compareArgs("SCALAR INITIAL GUESS DEFAULT", "EXTRAPOLATION")) {
+  if(cds->options[is].compareArgs("SCALAR INITIAL GUESS DEFAULT", "EXTRAPOLATION")) {
     cds->o_wrk0.copyFrom(cds->o_Se, cds->Ntotal * sizeof(dfloat), 0, is * cds->fieldOffset * sizeof(dfloat));
     if (solver->Nmasked) cds->maskCopyKernel(solver->Nmasked, 0, solver->o_maskIds, cds->o_wrk2, cds->o_wrk0);
   }
 
-  cds->Niter[is] = ellipticSolve(solver, cds->TOL, cds->o_wrk1, cds->o_wrk0);
+  cds->Niter[is] = ellipticSolve(solver, cds->o_wrk1, cds->o_wrk0);
 
   return cds->o_wrk0;
 }
+
+
diff --git a/src/core/cds.h b/src/cds/cds.hpp
similarity index 92%
rename from src/core/cds.h
rename to src/cds/cds.hpp
index fcb35a467..96919b0b4 100644
--- a/src/core/cds.h
+++ b/src/cds/cds.hpp
@@ -5,8 +5,8 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include "mpi.h"
-#include "mesh2D.h"
+
+#include "nrssys.hpp"
 #include "mesh3D.h"
 #include "elliptic.h"
 
@@ -23,7 +23,7 @@ typedef struct
   int NVfields;            // Number of velocity fields
   int NSfields;            // Number of scalar fields
 
-  setupAide options;
+  setupAide options[NSCALAR_MAX];
 
   oogs_t *gsh, *gshT;
 
@@ -31,27 +31,19 @@ typedef struct
   dlong fieldOffset;
   dlong Nlocal, Ntotal;
   int Nblock;
-  dfloat dt, idt, cfl, dti;          // time step
-  dfloat time;
+  dfloat idt;
+  dfloat *dt;
   int tstep;
   dfloat g0, ig0;
-  dfloat startTime;
-  dfloat finalTime;
 
   int temporalOrder;
   int ExplicitOrder;
-  int NtimeSteps;  // number of time steps
   int Nstages;
-  int outputStep;
-  int outputForceStep;
   int dtAdaptStep;
 
   int compute[NSCALAR_MAX];
   int Niter[NSCALAR_MAX];
 
-  //solver tolerances
-  dfloat TOL;
-
   dfloat* U, * S;
   dfloat* rkNS;
   //  dfloat *rhsS;
diff --git a/src/insBcData.h b/src/core/bcData.h
similarity index 55%
rename from src/insBcData.h
rename to src/core/bcData.h
index 720127d0f..772b0a3dd 100644
--- a/src/insBcData.h
+++ b/src/core/bcData.h
@@ -2,25 +2,19 @@
 struct bcData
 {
   int idM;
+
   int fieldOffset;
   int id;
 
-  int scalarId;
-
   dfloat time;
   dfloat x, y, z;
   dfloat nx, ny, nz;
 
-  dfloat uM, vM, wM;
-  dfloat uP, vP, wP;
-  dfloat uxP, uyP, uzP;
-  dfloat vxP, vyP, vzP;
-  dfloat wxP, wyP, wzP;
+  dfloat u, v, w;
+  dfloat p;
 
-  dfloat pM;
-  dfloat pP;
+  int scalarId;
+  dfloat s, flux;
 
   @globalPtr const dfloat* wrk;
-
-  dfloat sM, sP, sF;
 };
diff --git a/src/core/bcMap.cpp b/src/core/bcMap.cpp
index ce3c75100..f2a759de9 100644
--- a/src/core/bcMap.cpp
+++ b/src/core/bcMap.cpp
@@ -216,10 +216,10 @@ int size(int isTmesh)
   return isTmesh ? nbid[1] : nbid[0];
 }
 
-void check(mesh_t* mesh, int isTmesh)
+void check(mesh_t* mesh)
 {
   int nid = nbid[0];
-  if(isTmesh) nid = nbid[1];
+  if(mesh->cht) nid = nbid[1];
 
   int retval = 0;
 
diff --git a/src/core/bcMap.hpp b/src/core/bcMap.hpp
index a65ec3d37..9788a9145 100644
--- a/src/core/bcMap.hpp
+++ b/src/core/bcMap.hpp
@@ -3,7 +3,6 @@
 
 #include <string>
 #include <vector>
-#include "setupAide.hpp"
 #include "nekInterfaceAdapter.hpp"
 
 namespace bcMap
@@ -13,7 +12,7 @@ int id(int bid, string field);
 int type(int bid, string field);
 string text(int bid, string field);
 int size(int isTmesh);
-void check(mesh_t* mesh, int isTmesh);
+void check(mesh_t* mesh);
 void setBcMap(string field, int* map, int nbid);
 }
 
diff --git a/src/core/cfl.cpp b/src/core/cfl.cpp
index 84f51e430..7fe746ff7 100644
--- a/src/core/cfl.cpp
+++ b/src/core/cfl.cpp
@@ -4,12 +4,12 @@ static int firstTime = 1;
 static dfloat* tmp;
 static occa::memory o_tmp;
 
-void setup(ins_t* ins)
+void setup(nrs_t* nrs)
 {
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
 
   dfloat* dH;
-  if(ins->elementType == QUADRILATERALS || ins->elementType == HEXAHEDRA) {
+  if(nrs->elementType == QUADRILATERALS || nrs->elementType == HEXAHEDRA) {
     dH = (dfloat*) calloc((mesh->N + 1),sizeof(dfloat));
 
     for(int n = 0; n < (mesh->N + 1); n++) {
@@ -23,37 +23,37 @@ void setup(ins_t* ins)
     for(int n = 0; n < (mesh->N + 1); n++)
       dH[n] = 1.0 / dH[n];
 
-    ins->o_idH = mesh->device.malloc((mesh->N + 1) * sizeof(dfloat), dH);
+    nrs->o_idH = mesh->device.malloc((mesh->N + 1) * sizeof(dfloat), dH);
     free(dH);
   }
 
-  tmp = (dfloat*) calloc(ins->Nblock, sizeof(dfloat));
-  o_tmp = mesh->device.malloc(ins->Nblock * sizeof(dfloat), tmp);
+  tmp = (dfloat*) calloc(nrs->Nblock, sizeof(dfloat));
+  o_tmp = mesh->device.malloc(nrs->Nblock * sizeof(dfloat), tmp);
 
   firstTime = 0;
 }
 
-dfloat computeCFL(ins_t* ins, dfloat time, int tstep)
+dfloat computeCFL(nrs_t* nrs)
 {
-  mesh_t* mesh = ins->mesh;
-  if(firstTime) setup(ins);
+  mesh_t* mesh = nrs->mesh;
+  if(firstTime) setup(nrs);
 
   // Compute cfl factors i.e. dt* U / h
-  ins->cflKernel(mesh->Nelements,
-                 ins->dt,
+  nrs->cflKernel(mesh->Nelements,
+                 nrs->dt[0],
                  mesh->o_vgeo,
-                 ins->o_idH,
-                 ins->fieldOffset,
-                 ins->o_U,
-                 ins->o_wrk0);
+                 nrs->o_idH,
+                 nrs->fieldOffset,
+                 nrs->o_U,
+                 nrs->o_wrk0);
 
   // find the local maximum of CFL number
-  ins->maxKernel(ins->Nlocal, ins->o_wrk0, o_tmp);
+  nrs->maxKernel(nrs->Nlocal, nrs->o_wrk0, o_tmp);
   o_tmp.copyTo(tmp);
 
   // finish reduction
   dfloat cfl = 0.f;
-  for(dlong n = 0; n < ins->Nblock; ++n)
+  for(dlong n = 0; n < nrs->Nblock; ++n)
     cfl  = mymax(cfl, tmp[n]);
 
   dfloat gcfl = 0.f;
diff --git a/src/core/cfl.hpp b/src/core/cfl.hpp
index 18d762d4e..d8d50df8c 100644
--- a/src/core/cfl.hpp
+++ b/src/core/cfl.hpp
@@ -1,7 +1,7 @@
 #if !defined(nekrs_cfl_hpp_)
 #define nekrs_cfl_hpp_
 
-#include "nekrs.hpp"
-dfloat computeCFL(ins_t* ins, dfloat time, int tstep);
+#include "nrs.hpp"
+dfloat computeCFL(nrs_t* nrs);
 
 #endif
diff --git a/src/core/configReader.hpp b/src/core/configReader.hpp
index 6e6b8e257..69db02142 100644
--- a/src/core/configReader.hpp
+++ b/src/core/configReader.hpp
@@ -1,7 +1,7 @@
 #if !defined(nekrs_cfgreader_hpp_)
 #define nekrs_cfgreader_hpp_
 
-#include "nekrs.hpp"
+#include "nrs.hpp"
 void configRead(MPI_Comm comm);
 
 #endif
diff --git a/src/core/filter.cpp b/src/core/filter.cpp
index 08caf97ee..3d2ecefff 100644
--- a/src/core/filter.cpp
+++ b/src/core/filter.cpp
@@ -1,5 +1,4 @@
 #include "nrs.hpp"
-#include "ins.h"
 
 void filterFunctionRelaxation1D(int Nmodes, int Nc, dfloat* A);
 
@@ -10,15 +9,15 @@ dfloat filterJacobiP(dfloat a, dfloat alpha, dfloat beta, int N);
 
 dfloat filterFactorial(int n);
 
-void filterSetup(ins_t* ins)
+void filterSetup(nrs_t* nrs)
 {
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
 
   // First construct filter function
-  ins->filterS = 10.0; // filter Weight...
-  ins->options.getArgs("HPFRT STRENGTH", ins->filterS);
-  ins->options.getArgs("HPFRT MODES", ins->filterNc);
-  ins->filterS = -1.0 * fabs(ins->filterS);
+  nrs->filterS = 10.0; // filter Weight...
+  nrs->options.getArgs("HPFRT STRENGTH", nrs->filterS);
+  nrs->options.getArgs("HPFRT MODES", nrs->filterNc);
+  nrs->filterS = -1.0 * fabs(nrs->filterS);
 
   // Construct Filter Function
   int Nmodes = mesh->N + 1; // N+1, 1D GLL points
@@ -29,7 +28,7 @@ void filterSetup(ins_t* ins)
   dfloat* A = (dfloat*) calloc(Nmodes * Nmodes, sizeof(dfloat));
 
   // Construct Filter Function
-  filterFunctionRelaxation1D(Nmodes, ins->filterNc, A);
+  filterFunctionRelaxation1D(Nmodes, nrs->filterNc, A);
 
   // Construct Vandermonde Matrix
   filterVandermonde1D(mesh->N, Nmodes, mesh->r, V);
@@ -74,16 +73,16 @@ void filterSetup(ins_t* ins)
   dgemm_(&TRANSA, &TRANSB, &MD, &ND, &KD, &ALPHA, V, &LDA, C, &LDB, &BETA, A, &LDC);
 
   // store filter matrix (row major)
-  ins->filterM = (dfloat*) calloc(Nmodes * Nmodes, sizeof(dfloat));
+  nrs->filterM = (dfloat*) calloc(Nmodes * Nmodes, sizeof(dfloat));
   for(int c = 0; c < Nmodes; c++)
     for(int r = 0; r < Nmodes; r++)
-      ins->filterM[c + r * Nmodes] = A[r + c * Nmodes];
+      nrs->filterM[c + r * Nmodes] = A[r + c * Nmodes];
 
-  ins->o_filterMT =  mesh->device.malloc(Nmodes * Nmodes * sizeof(dfloat), A); // copy Tranpose
+  nrs->o_filterMT =  mesh->device.malloc(Nmodes * Nmodes * sizeof(dfloat), A); // copy Transpose
 
   if(mesh->rank == 0)
     printf("High pass filter relaxation: chi = %.4f using %d mode(s)\n",
-           fabs(ins->filterS), ins->filterNc);
+           fabs(nrs->filterS), nrs->filterNc);
 
   free(A);
   free(C);
diff --git a/src/core/filter.hpp b/src/core/filter.hpp
index 85e71277c..62339db65 100644
--- a/src/core/filter.hpp
+++ b/src/core/filter.hpp
@@ -1,7 +1,7 @@
 #if !defined(nekrs_filtersetup_hpp_)
 #define nekrs_filtersetup_hpp_
 
-#include "nekrs.hpp"
-void filterSetup(ins_t* ins);
+#include "nrs.hpp"
+void filterSetup(nrs_t* nrs);
 
 #endif
diff --git a/src/core/insSetup.cpp b/src/core/insSetup.cpp
deleted file mode 100644
index e42978067..000000000
--- a/src/core/insSetup.cpp
+++ /dev/null
@@ -1,1164 +0,0 @@
-#include "nrs.hpp"
-#include "meshSetup.hpp"
-#include "nekInterfaceAdapter.hpp"
-#include "udf.hpp"
-#include "filter.hpp"
-#include "bcMap.hpp"
-#include <vector>
-#include <map>
-
-static dfloat* scratch;
-static occa::memory o_scratch;
-
-cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &kernelInfoH);
-ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buildOnly)
-{
-  ins_t* ins = new ins_t();
-  ins->options = options;
-  ins->kernelInfo = new occa::properties();
-  occa::properties& kernelInfo = *ins->kernelInfo;
-  kernelInfo["defines"].asObject();
-  kernelInfo["includes"].asArray();
-  kernelInfo["header"].asArray();
-  kernelInfo["flags"].asObject();
-  kernelInfo["include_paths"].asArray();
-
-  int N, cubN;
-  string install_dir;
-  options.getArgs("POLYNOMIAL DEGREE", N);
-  options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN);
-  options.getArgs("NUMBER OF SCALARS", ins->Nscalar);
-  install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
-  options.getArgs("MESH DIMENSION", ins->dim);
-  options.getArgs("ELEMENT TYPE", ins->elementType);
-
-  ins->flow = 1;
-  if(options.compareArgs("VELOCITY", "FALSE")) ins->flow = 0;
-  if(options.compareArgs("VELOCITY SOLVER", "NONE")) ins->flow = 0;
-
-  ins->cht = 0;
-  if (nekData.nelv != nekData.nelt && ins->Nscalar) ins->cht = 1;
-
-  if (buildOnly) {
-    ins->meshT = createMeshDummy(comm, N, cubN, options, device, kernelInfo);
-    ins->mesh = ins->meshT;
-  } else {
-    ins->meshT = createMeshT(comm, N, cubN, ins->cht, options, device, kernelInfo);
-    ins->mesh = ins->meshT;
-    if (ins->cht) ins->mesh = createMeshV(comm, N, cubN, ins->meshT, options, kernelInfo);
-  }
-  mesh_t* mesh = ins->mesh;
-
-
-  { 
-    dlong retVal; 
-    MPI_Allreduce(&mesh->NinternalElements,&retVal,1,MPI_DLONG,MPI_MIN,mesh->comm);
-    if(mesh->rank == 0) printf("min NinternalElements: %d (ratio: %4.2f)\n", retVal, (double)retVal/mesh->Nelements);
-  }
-
-  occa::properties kernelInfoV  = kernelInfo;
-  occa::properties kernelInfoP  = kernelInfo;
-  occa::properties kernelInfoS  = kernelInfo;
-
-  ins->NVfields = (ins->dim == 3) ? 3:2; // Total Number of Velocity Fields
-  ins->NTfields = ins->NVfields + 1;   // Total Velocity + Pressure
-
-  ins->SNrk = 0;
-  options.getArgs("SUBCYCLING TIME STAGE NUMBER", ins->SNrk);
-
-  mesh->Nfields = 1;
-
-  ins->extbdfA = (dfloat*) calloc(3, sizeof(dfloat));
-  ins->extbdfB = (dfloat*) calloc(3, sizeof(dfloat));
-  ins->extbdfC = (dfloat*) calloc(3, sizeof(dfloat));
-
-  ins->extC = (dfloat*) calloc(3, sizeof(dfloat));
-
-  if (options.compareArgs("TIME INTEGRATOR", "TOMBO1")) {
-    ins->Nstages = 1;
-    ins->temporalOrder = 1;
-  } else if (options.compareArgs("TIME INTEGRATOR", "TOMBO2")) {
-    ins->Nstages = 2;
-    ins->temporalOrder = 2;
-  } else if (options.compareArgs("TIME INTEGRATOR", "TOMBO3")) {
-    ins->Nstages = 3;
-    ins->temporalOrder = 3;
-  }
-
-  ins->readRestartFile = 0;
-  options.getArgs("RESTART FROM FILE", ins->readRestartFile);
-
-  ins->writeRestartFile = 0;
-  options.getArgs("WRITE RESTART FILE", ins->writeRestartFile);
-
-  dfloat mue = 1;
-  dfloat rho = 1;
-  options.getArgs("VISCOSITY", mue);
-  options.getArgs("DENSITY", rho);
-
-  options.getArgs("SUBCYCLING STEPS",ins->Nsubsteps);
-  options.getArgs("DT", ins->dt);
-  options.getArgs("START TIME", ins->startTime);
-  options.getArgs("FINAL TIME", ins->finalTime);
-  if(ins->startTime > 0.0) ins->finalTime += ins->startTime; 
-  options.setArgs("FINAL TIME", to_string_f(ins->finalTime));
-
-  ins->NtimeSteps = (ins->finalTime - ins->startTime) / ins->dt;
-  if(ins->startTime + ins->NtimeSteps*ins->dt < ins->finalTime) ins->NtimeSteps++;
-
-  options.setArgs("NUMBER TIMESTEPS", std::to_string(ins->NtimeSteps));
-  if(ins->Nsubsteps) ins->sdt = ins->dt / ins->Nsubsteps;
-
-  // Hold some inverses for kernels
-  ins->idt = 1.0 / ins->dt;
-  options.getArgs("TSTEPS FOR SOLUTION OUTPUT", ins->outputStep);
-
-  const dlong Nlocal = mesh->Np * mesh->Nelements;
-  const dlong Ntotal = mesh->Np * (mesh->Nelements + mesh->totalHaloPairs);
-
-  ins->Nlocal = Nlocal;
-  ins->Ntotal = Ntotal;
-
-  // ensure that offset is large enough for v and t mesh and is properly aligned
-  {
-    const dlong NtotalT = ins->meshT->Np * (ins->meshT->Nelements + ins->meshT->totalHaloPairs);
-    ins->fieldOffset = mymax(Ntotal, NtotalT);
-
-    int PAGESIZE = 4096; // default is 4kB
-    char* tmp;
-    tmp = getenv("NEKRS_PAGE_SIZE");
-    if (tmp != NULL) PAGESIZE = std::stoi(tmp);
-    const int pageW = PAGESIZE / sizeof(dfloat);
-    if (ins->fieldOffset % pageW) ins->fieldOffset = (ins->fieldOffset / pageW + 1) * pageW;
-  }
-
-  ins->Nblock = (Nlocal + blockSize - 1) / blockSize;
-
-  ins->U  = (dfloat*) calloc(ins->NVfields * ins->Nstages * ins->fieldOffset,sizeof(dfloat));
-  ins->Ue = (dfloat*) calloc(ins->NVfields * ins->fieldOffset,sizeof(dfloat));
-
-  ins->P  = (dfloat*) calloc(ins->fieldOffset,sizeof(dfloat));
-
-  ins->BF = (dfloat*) calloc(ins->NVfields * ins->fieldOffset,sizeof(dfloat));
-  ins->FU = (dfloat*) calloc(ins->NVfields * ins->Nstages * ins->fieldOffset,sizeof(dfloat));
-
-  if(ins->Nsubsteps) {
-    int Sorder;
-    options.getArgs("SUBCYCLING TIME ORDER", Sorder);
-    if(Sorder == 4 && ins->SNrk == 4) { // ERK(4,4)
-      dfloat rka[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0};
-      dfloat rkb[4] = {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0};
-      dfloat rkc[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0};
-      ins->Srka = (dfloat*) calloc(ins->SNrk, sizeof(dfloat));
-      ins->Srkb = (dfloat*) calloc(ins->SNrk, sizeof(dfloat));
-      ins->Srkc = (dfloat*) calloc(ins->SNrk, sizeof(dfloat));
-      memcpy(ins->Srka, rka, ins->SNrk * sizeof(dfloat));
-      memcpy(ins->Srkb, rkb, ins->SNrk * sizeof(dfloat));
-      memcpy(ins->Srkc, rkc, ins->SNrk * sizeof(dfloat));
-    }else{
-      if(mesh->rank == 0) cout << "Unsupported subcycling scheme!\n";
-      ABORT(1);
-    }
-    ins->o_Srka = mesh->device.malloc(ins->SNrk * sizeof(dfloat), ins->Srka);
-    ins->o_Srkb = mesh->device.malloc(ins->SNrk * sizeof(dfloat), ins->Srkb);
-  }
-
-  // setup scratch space
-  const int wrkNflds = 6;
-  const int ellipticWrkNflds = 15;
-  ins->ellipticWrkOffset = wrkNflds * ins->fieldOffset;
-
-  const int scratchNflds = wrkNflds + ellipticWrkNflds;
-  scratch   = (dfloat*) calloc(scratchNflds * ins->fieldOffset,sizeof(dfloat));
-  o_scratch = mesh->device.malloc(scratchNflds * ins->fieldOffset * sizeof(dfloat), scratch);
-
-  ins->o_wrk0  = o_scratch.slice( 0 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk1  = o_scratch.slice( 1 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk2  = o_scratch.slice( 2 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk3  = o_scratch.slice( 3 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk4  = o_scratch.slice( 4 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk5  = o_scratch.slice( 5 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk6  = o_scratch.slice( 6 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk7  = o_scratch.slice( 7 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk9  = o_scratch.slice( 9 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk12 = o_scratch.slice(12 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_wrk15 = o_scratch.slice(15 * ins->fieldOffset * sizeof(dfloat));
-
-  ins->o_U  = mesh->device.malloc(ins->NVfields * ins->Nstages * ins->fieldOffset * sizeof(dfloat),
-                                  ins->U);
-  ins->o_Ue = mesh->device.malloc(ins->NVfields * ins->fieldOffset * sizeof(dfloat), ins->Ue);
-  ins->o_P  = mesh->device.malloc(ins->fieldOffset * sizeof(dfloat), ins->P);
-
-  ins->o_FU =
-    mesh->device.malloc(ins->NVfields * ins->Nstages * ins->fieldOffset * sizeof(dfloat),
-                        ins->FU);
-  ins->o_BF = mesh->device.malloc(ins->NVfields * ins->fieldOffset * sizeof(dfloat), ins->BF);
-
-  ins->var_coeff = 1; // use always var coeff elliptic
-  ins->ellipticCoeff = (dfloat*) calloc(2 * ins->fieldOffset,sizeof(dfloat));
-  ins->o_ellipticCoeff = mesh->device.malloc(2 * ins->fieldOffset * sizeof(dfloat),
-                                             ins->ellipticCoeff);
-
-  ins->prop =  (dfloat*) calloc(2 * ins->fieldOffset,sizeof(dfloat));
-  for (int e = 0; e < mesh->Nelements; e++)
-    for (int n = 0; n < mesh->Np; n++) {
-      ins->prop[0 * ins->fieldOffset + e * mesh->Np + n] = mue;
-      ins->prop[1 * ins->fieldOffset + e * mesh->Np + n] = rho;
-    }
-  ins->o_prop = mesh->device.malloc(2 * ins->fieldOffset * sizeof(dfloat), ins->prop);
-  ins->o_mue = ins->o_prop.slice(0 * ins->fieldOffset * sizeof(dfloat));
-  ins->o_rho = ins->o_prop.slice(1 * ins->fieldOffset * sizeof(dfloat));
-
-  ins->div   = (dfloat*) calloc(ins->fieldOffset,sizeof(dfloat));
-  ins->o_div = mesh->device.malloc(ins->fieldOffset * sizeof(dfloat), ins->div);
-
-  ins->elementInfo = (dlong*) calloc(ins->meshT->Nelements,sizeof(dlong));
-  for (int e = 0; e < ins->meshT->Nelements; e++) ins->elementInfo[e] = mesh->elementInfo[e];
-  ins->o_elementInfo = mesh->device.malloc(ins->meshT->Nelements * sizeof(dlong), ins->elementInfo);
-  dfloat rkC[4]  = {1.0, 0.0, -1.0, -2.0};
-  ins->o_rkC     = mesh->device.malloc(4 * sizeof(dfloat),rkC);
-  ins->o_extbdfA = mesh->device.malloc(3 * sizeof(dfloat));
-  ins->o_extbdfB = mesh->device.malloc(3 * sizeof(dfloat));
-  ins->o_extbdfC = mesh->device.malloc(3 * sizeof(dfloat));
-  ins->o_extC    = mesh->device.malloc(3 * sizeof(dfloat));
-
-  // define aux kernel constants
-  kernelInfo["defines/" "p_eNfields"] = ins->NVfields;
-  kernelInfo["defines/" "p_NTfields"] = ins->NTfields;
-  kernelInfo["defines/" "p_NVfields"] = ins->NVfields;
-  kernelInfo["defines/" "p_NfacesNfp"] =  mesh->Nfaces * mesh->Nfp;
-  kernelInfo["defines/" "p_Nstages"] =  ins->Nstages;
-  if(ins->Nsubsteps)
-    kernelInfo["defines/" "p_SUBCYCLING"] =  1;
-  else
-    kernelInfo["defines/" "p_SUBCYCLING"] =  0;
-
-  kernelInfo["defines/" "p_blockSize"] = blockSize;
-  //kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-  int maxNodes = mymax(mesh->Np, (mesh->Nfp * mesh->Nfaces));
-  kernelInfo["defines/" "p_maxNodes"] = maxNodes;
-
-  int NblockV = mymax(1,256 / mesh->Np);
-  kernelInfo["defines/" "p_NblockV"] = NblockV;
-
-  int NblockS = mymax(1,256 / maxNodes);
-  kernelInfo["defines/" "p_NblockS"] = NblockS;
-
-  int maxNodesVolumeCub = mymax(mesh->cubNp,mesh->Np);
-  kernelInfo["defines/" "p_maxNodesVolumeCub"] = maxNodesVolumeCub;
-  int cubNblockV = mymax(1,256 / maxNodesVolumeCub);
-
-  int maxNodesSurfaceCub = mymax(mesh->Np, mymax(mesh->Nfaces * mesh->Nfp,
-                                                 mesh->Nfaces * mesh->intNfp));
-  kernelInfo["defines/" "p_maxNodesSurfaceCub"] = maxNodesSurfaceCub;
-  int cubNblockS = mymax(256 / maxNodesSurfaceCub,1);
-
-  kernelInfo["defines/" "p_cubNblockV"] = cubNblockV;
-  kernelInfo["defines/" "p_cubNblockS"] = cubNblockS;
-
-  // jit compile udf kernels
-  if (udf.loadKernels) {
-    if (mesh->rank == 0) cout << "loading udf kernels ... ";
-    udf.loadKernels(ins);
-    if (mesh->rank == 0) cout << "done" << endl;
-  }
-
-  ins->linAlg = new linAlg_t(mesh->device, ins->kernelInfo, mesh->comm);
-
-  occa::properties kernelInfoBC = kernelInfo;
-  const string bcDataFile = install_dir + "/include/insBcData.h";
-  kernelInfoBC["includes"] += bcDataFile.c_str();
-  string boundaryHeaderFileName;
-  options.getArgs("DATA FILE", boundaryHeaderFileName);
-  kernelInfoBC["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL);
-
-  const int nbrBIDs = bcMap::size(0);
-  int NBCType = nbrBIDs + 1;
-
-  meshParallelGatherScatterSetup(mesh, ins->Nlocal, mesh->globalIds, mesh->comm, 0);
-  oogs_mode oogsMode = OOGS_AUTO; 
-  if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT;
-  ins->gsh = oogs::setup(mesh->ogs, ins->NVfields, ins->fieldOffset, ogsDfloat, NULL, oogsMode);
-
-  if(!buildOnly) {
-    int err = 0;
-    dlong gNelements = mesh->Nelements;
-    MPI_Allreduce(MPI_IN_PLACE, &gNelements, 1, MPI_DLONG, MPI_SUM, mesh->comm);
-    const dfloat sum2 = (dfloat)gNelements * mesh->Np;
-    ins->linAlg->fillKernel(ins->fieldOffset, 1.0, ins->o_wrk0);
-    ogsGatherScatter(ins->o_wrk0, ogsDfloat, ogsAdd, mesh->ogs);
-    ins->linAlg->axmyKernel(Nlocal, 1.0, mesh->ogs->o_invDegree, ins->o_wrk0); 
-    dfloat* tmp = (dfloat*) calloc(Nlocal, sizeof(dfloat));
-    ins->o_wrk0.copyTo(tmp, Nlocal * sizeof(dfloat));
-    dfloat sum1 = 0;
-    for(int i = 0; i < Nlocal; i++) sum1 += tmp[i];
-    MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, mesh->comm);
-    sum1 = abs(sum1 - sum2) / sum2;
-    if(sum1 > 1e-15) {
-      if(mesh->rank == 0) printf("ogsGatherScatter test err=%g!\n", sum1);
-      fflush(stdout);
-      err++;
-    }
-
-    mesh->ogs->o_invDegree.copyTo(tmp, Nlocal * sizeof(dfloat));
-    double* vmult = (double*) nek_ptr("vmult");
-    sum1 = 0;
-    for(int i = 0; i < Nlocal; i++) sum1 += abs(tmp[i] - vmult[i]);
-    MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, mesh->comm);
-    if(sum1 > 1e-15) {
-      if(mesh->rank == 0) printf("multiplicity test err=%g!\n", sum1);
-      fflush(stdout);
-      err++;
-    }
-
-    if(err) ABORT(1);
-    free(tmp);
-  }
-
-  if (ins->flow) {
-    if (mesh->rank == 0) printf("==================VELOCITY SETUP=========================\n");
-
-    ins->velTOL  = 1E-6;
-    ins->uvwSolver = NULL;
-
-    //if(options.compareArgs("VARIABLEPROPERTIES", "TRUE"))
-    //   options.setArgs("STRESSFORMULATION", "TRUE");
-
-    if(options.compareArgs("STRESSFORMULATION", "TRUE"))
-       options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
-
-    if(options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE"))
-      ins->uvwSolver = new elliptic_t();
-
-    int* uvwBCType = (int*) calloc(3 * NBCType, sizeof(int));
-    int* uBCType = uvwBCType + 0 * NBCType;
-    int* vBCType = uvwBCType + 1 * NBCType;
-    int* wBCType = uvwBCType + 2 * NBCType;
-    for (int bID = 1; bID <= nbrBIDs; bID++) {
-      string bcTypeText(bcMap::text(bID, "velocity"));
-      if(mesh->rank == 0) printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str());
-
-      uBCType[bID] = bcMap::type(bID, "x-velocity");
-      vBCType[bID] = bcMap::type(bID, "y-velocity");
-      wBCType[bID] = bcMap::type(bID, "z-velocity");
-    }
-
-    ins->vOptions = options;
-    ins->vOptions.setArgs("KRYLOV SOLVER",        options.getArgs("VELOCITY KRYLOV SOLVER"));
-    ins->vOptions.setArgs("SOLVER TOLERANCE",     options.getArgs("VELOCITY SOLVER TOLERANCE"));
-    ins->vOptions.setArgs("DISCRETIZATION",       options.getArgs("VELOCITY DISCRETIZATION"));
-    ins->vOptions.setArgs("BASIS",                options.getArgs("VELOCITY BASIS"));
-    ins->vOptions.setArgs("PRECONDITIONER",       options.getArgs("VELOCITY PRECONDITIONER"));
-    ins->vOptions.setArgs("RESIDUAL PROJECTION",       options.getArgs("VELOCITY RESIDUAL PROJECTION"));
-    ins->vOptions.setArgs("RESIDUAL PROJECTION VECTORS",       options.getArgs("VELOCITY RESIDUAL PROJECTION VECTORS"));
-    ins->vOptions.setArgs("RESIDUAL PROJECTION START",       options.getArgs("VELOCITY RESIDUAL PROJECTION START"));
-    ins->vOptions.setArgs("MULTIGRID COARSENING", options.getArgs("VELOCITY MULTIGRID COARSENING"));
-    ins->vOptions.setArgs("MULTIGRID SMOOTHER",   options.getArgs("VELOCITY MULTIGRID SMOOTHER"));
-    ins->vOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE",
-                          options.getArgs("VELOCITY MULTIGRID CHEBYSHEV DEGREE"));
-    ins->vOptions.setArgs("PARALMOND CYCLE",      options.getArgs("VELOCITY PARALMOND CYCLE"));
-    ins->vOptions.setArgs("PARALMOND SMOOTHER",   options.getArgs("VELOCITY PARALMOND SMOOTHER"));
-    ins->vOptions.setArgs("PARALMOND PARTITION",  options.getArgs("VELOCITY PARALMOND PARTITION"));
-    ins->vOptions.setArgs("PARALMOND CHEBYSHEV DEGREE",
-                          options.getArgs("VELOCITY PARALMOND CHEBYSHEV DEGREE"));
-    ins->vOptions.setArgs("PARALMOND AGGREGATION STRATEGY",
-                          options.getArgs("VELOCITY PARALMOND AGGREGATION STRATEGY"));
-    ins->vOptions.setArgs("DEBUG ENABLE OGS", "1");
-    ins->vOptions.setArgs("DEBUG ENABLE REDUCTIONS", "1");
-
-    // coeff used by ellipticSetup to detect allNeumann
-    for (int i = 0; i < 2 * ins->fieldOffset; i++) ins->ellipticCoeff[i] = 1;
-
-    if(ins->uvwSolver) {
-      ins->uvwSolver->blockSolver = 1;
-      ins->uvwSolver->stressForm = 0;
-      if(options.compareArgs("STRESSFORMULATION", "TRUE"))
-        ins->uvwSolver->stressForm = 1;
-      ins->uvwSolver->Nfields = ins->NVfields;
-      ins->uvwSolver->Ntotal = ins->fieldOffset;
-      ins->uvwSolver->wrk = scratch + ins->ellipticWrkOffset;
-      ins->uvwSolver->o_wrk = o_scratch.slice(ins->ellipticWrkOffset * sizeof(dfloat));
-      ins->uvwSolver->mesh = mesh;
-      ins->uvwSolver->options = ins->vOptions;
-      ins->uvwSolver->dim = ins->dim;
-      ins->uvwSolver->elementType = ins->elementType;
-      ins->uvwSolver->NBCType = NBCType;
-      ins->uvwSolver->BCType = (int*) calloc(ins->NVfields * NBCType,sizeof(int));
-      memcpy(ins->uvwSolver->BCType,uvwBCType,ins->NVfields * NBCType * sizeof(int));
-      ins->uvwSolver->var_coeff = ins->var_coeff;
-      ins->uvwSolver->lambda = ins->ellipticCoeff;
-      ins->uvwSolver->o_lambda = ins->o_ellipticCoeff;
-      ins->uvwSolver->loffset = 0; // use same ellipticCoeff for u,v and w
-
-      ellipticSolveSetup(ins->uvwSolver, kernelInfoV);
-    } else {
-      ins->uSolver = new elliptic_t();
-      ins->uSolver->blockSolver = 0;
-      ins->uSolver->Nfields = 1;
-      ins->uSolver->Ntotal = ins->fieldOffset;
-      ins->uSolver->wrk = scratch + ins->ellipticWrkOffset;
-      ins->uSolver->o_wrk = o_scratch.slice(ins->ellipticWrkOffset * sizeof(dfloat));
-      ins->uSolver->mesh = mesh;
-      ins->uSolver->options = ins->vOptions;
-      ins->uSolver->dim = ins->dim;
-      ins->uSolver->elementType = ins->elementType;
-      ins->uSolver->NBCType = NBCType;
-      ins->uSolver->BCType = (int*) calloc(NBCType,sizeof(int));
-      memcpy(ins->uSolver->BCType,uBCType,NBCType * sizeof(int));
-      ins->uSolver->var_coeff = ins->var_coeff;
-      ins->uSolver->lambda = ins->ellipticCoeff;
-      ins->uSolver->o_lambda = ins->o_ellipticCoeff;
-      ins->uSolver->loffset = 0;
-
-      ellipticSolveSetup(ins->uSolver, kernelInfoV);
-
-      ins->vSolver = new elliptic_t();
-      ins->vSolver->blockSolver = 0;
-      ins->vSolver->Nfields = 1;
-      ins->vSolver->Ntotal = ins->fieldOffset;
-      ins->vSolver->wrk = scratch + ins->ellipticWrkOffset;
-      ins->vSolver->o_wrk = o_scratch.slice(ins->ellipticWrkOffset * sizeof(dfloat));
-      ins->vSolver->mesh = mesh;
-      ins->vSolver->options = ins->vOptions;
-      ins->vSolver->dim = ins->dim;
-      ins->vSolver->elementType = ins->elementType;
-      ins->vSolver->NBCType = NBCType;
-      ins->vSolver->BCType = (int*) calloc(NBCType,sizeof(int));
-      memcpy(ins->vSolver->BCType,vBCType,NBCType * sizeof(int));
-      ins->vSolver->var_coeff = ins->var_coeff;
-      ins->vSolver->lambda = ins->ellipticCoeff;
-      ins->vSolver->o_lambda = ins->o_ellipticCoeff;
-      ins->vSolver->loffset = 0;
-
-      ellipticSolveSetup(ins->vSolver, kernelInfoV);
-
-      if (ins->dim == 3) {
-        ins->wSolver = new elliptic_t();
-        ins->wSolver->blockSolver = 0;
-        ins->wSolver->Nfields = 1;
-        ins->wSolver->Ntotal = ins->fieldOffset;
-        ins->wSolver->wrk = scratch + ins->ellipticWrkOffset;
-        ins->wSolver->o_wrk = o_scratch.slice(ins->ellipticWrkOffset * sizeof(dfloat));
-        ins->wSolver->mesh = mesh;
-        ins->wSolver->options = ins->vOptions;
-        ins->wSolver->dim = ins->dim;
-        ins->wSolver->elementType = ins->elementType;
-        ins->wSolver->NBCType = NBCType;
-        ins->wSolver->BCType = (int*) calloc(NBCType,sizeof(int));
-        memcpy(ins->wSolver->BCType,wBCType,NBCType * sizeof(int));
-        ins->wSolver->var_coeff = ins->var_coeff;
-        ins->wSolver->lambda = ins->ellipticCoeff;
-        ins->wSolver->o_lambda = ins->o_ellipticCoeff;
-        ins->wSolver->loffset = 0;
-
-        ellipticSolveSetup(ins->wSolver, kernelInfoV);
-      }
-    }
-  } // flow
-
-  // setup scalar solver
-  if(ins->Nscalar) {
-    mesh_t* msh;
-    (ins->cht) ? msh = ins->meshT : msh = ins->mesh;
-    ins->cds = cdsSetup(ins, msh, options, kernelInfoS);
-  }
-
-  if (ins->flow) {
-    if (mesh->rank == 0) printf("==================PRESSURE SETUP=========================\n");
-
-    ins->presTOL = 1E-4;
-
-    int* pBCType = (int*) calloc(NBCType, sizeof(int));
-    for (int bID = 1; bID <= nbrBIDs; bID++)
-      pBCType[bID] = bcMap::type(bID, "pressure");
-
-    ins->pOptions = options;
-    ins->pOptions.setArgs("KRYLOV SOLVER",        options.getArgs("PRESSURE KRYLOV SOLVER"));
-    ins->pOptions.setArgs("SOLVER TOLERANCE",     options.getArgs("PRESSURE SOLVER TOLERANCE"));
-    ins->pOptions.setArgs("DISCRETIZATION",       options.getArgs("PRESSURE DISCRETIZATION"));
-    ins->pOptions.setArgs("BASIS",                options.getArgs("PRESSURE BASIS"));
-    ins->pOptions.setArgs("PRECONDITIONER",       options.getArgs("PRESSURE PRECONDITIONER"));
-    ins->pOptions.setArgs("MULTIGRID COARSENING", options.getArgs("PRESSURE MULTIGRID COARSENING"));
-    ins->pOptions.setArgs("MULTIGRID SMOOTHER",   options.getArgs("PRESSURE MULTIGRID SMOOTHER"));
-    ins->pOptions.setArgs("MULTIGRID DOWNWARD SMOOTHER",
-                          options.getArgs("PRESSURE MULTIGRID DOWNWARD SMOOTHER"));
-    ins->pOptions.setArgs("MULTIGRID UPWARD SMOOTHER",
-                          options.getArgs("PRESSURE MULTIGRID UPWARD SMOOTHER"));
-    ins->pOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE",
-                          options.getArgs("PRESSURE MULTIGRID CHEBYSHEV DEGREE"));
-    ins->pOptions.setArgs("PARALMOND CYCLE",      options.getArgs("PRESSURE PARALMOND CYCLE"));
-    ins->pOptions.setArgs("PARALMOND SMOOTHER",   options.getArgs("PRESSURE MULTIGRID SMOOTHER"));
-    ins->pOptions.setArgs("PARALMOND PARTITION",  options.getArgs("PRESSURE PARALMOND PARTITION"));
-    ins->pOptions.setArgs("PARALMOND CHEBYSHEV DEGREE",
-                          options.getArgs("PRESSURE PARALMOND CHEBYSHEV DEGREE"));
-    ins->pOptions.setArgs("PARALMOND AGGREGATION STRATEGY",
-                          options.getArgs("PRESSURE PARALMOND AGGREGATION STRATEGY"));
-    ins->pOptions.setArgs("RESIDUAL PROJECTION", options.getArgs("PRESSURE RESIDUAL PROJECTION"));
-    ins->pOptions.setArgs("RESIDUAL PROJECTION VECTORS",
-                          options.getArgs("PRESSURE RESIDUAL PROJECTION VECTORS"));
-    ins->pOptions.setArgs("RESIDUAL PROJECTION START",
-                          options.getArgs("PRESSURE RESIDUAL PROJECTION START"));
-    ins->pOptions.setArgs("DEBUG ENABLE OGS", "1");
-    ins->pOptions.setArgs("DEBUG ENABLE REDUCTIONS", "1");
-    ins->pOptions.setArgs("MULTIGRID VARIABLE COEFFICIENT", "FALSE");
-
-    ins->pSolver = new elliptic_t();
-    ins->pSolver->blockSolver = 0;
-    ins->pSolver->Nfields = 1;
-    ins->pSolver->Ntotal = ins->fieldOffset;
-    ins->pSolver->wrk = scratch + ins->ellipticWrkOffset;
-    ins->pSolver->o_wrk = o_scratch.slice(ins->ellipticWrkOffset * sizeof(dfloat));
-    ins->pSolver->mesh = mesh;
-    ins->pSolver->dim = ins->dim;
-    ins->pSolver->elementType = ins->elementType;
-    ins->pSolver->BCType = (int*) calloc(nbrBIDs + 1,sizeof(int));
-    memcpy(ins->pSolver->BCType,pBCType,(nbrBIDs + 1) * sizeof(int));
-    ins->pSolver->var_coeff = 1;
-    //// coeff used by ellipticSetup to detect allNeumann
-    // and coeff[0] to setup MG levels
-    for (int i = 0; i < 2 * ins->fieldOffset; i++) ins->ellipticCoeff[i] = 0;
-    ins->pSolver->lambda = ins->ellipticCoeff;
-    ins->pSolver->o_lambda = ins->o_ellipticCoeff;
-    ins->pSolver->loffset = 0;
-
-    string p_mglevels;
-    if(ins->pOptions.getArgs("MULTIGRID COARSENING", p_mglevels)) {
-      std::vector<std::string> mgLevelList;
-      mgLevelList = serializeString(p_mglevels);
-      ins->pSolver->nLevels = mgLevelList.size();
-      ins->pSolver->levels = (int*) calloc(ins->pSolver->nLevels,sizeof(int));
-      for(int i = 0; i < ins->pSolver->nLevels; ++i)
-        ins->pSolver->levels[i] = std::atoi(mgLevelList.at(i).c_str());
-
-      if(ins->pSolver->levels[0] > mesh->N || 
-         ins->pSolver->levels[ins->pSolver->nLevels-1] < 1) {
-        if(mesh->rank == 0) printf("ERROR: Invalid multigrid coarsening!\n");
-        EXIT(1);
-      }
-      ins->pOptions.setArgs("MULTIGRID COARSENING","CUSTOM");
-    } else if(ins->pOptions.compareArgs("MULTIGRID DOWNWARD SMOOTHER","ASM") ||
-              ins->pOptions.compareArgs("MULTIGRID DOWNWARD SMOOTHER","RAS")) {
-      std::map<int,std::vector<int> > mg_level_lookup =
-      {
-        {1,{1}},
-        {2,{2,1}},
-        {3,{3,1}},
-        {4,{4,2,1}},
-        {5,{5,3,1}},
-        {6,{6,3,1}},
-        {7,{7,3,1}},
-        {8,{8,5,1}},
-        {9,{9,5,1}},
-        {10,{10,6,1}},
-        {11,{11,6,1}},
-        {12,{12,7,1}},
-        {13,{13,7,1}},
-        {14,{14,8,1}},
-        {15,{15,9,1}},
-      };
-
-      const std::vector<int>& levels = mg_level_lookup.at(mesh->Nq - 1);
-      ins->pSolver->nLevels = levels.size();
-      ins->pSolver->levels = (int*) calloc(ins->pSolver->nLevels,sizeof(int));
-      for(int i = 0; i < ins->pSolver->nLevels; ++i)
-        ins->pSolver->levels[i] = levels.at(i);
-      ins->pOptions.setArgs("MULTIGRID COARSENING","CUSTOM");
-    } else if(ins->pOptions.compareArgs("MULTIGRID DOWNWARD SMOOTHER","JAC")) {
-      std::map<int,std::vector<int> > mg_level_lookup =
-      {
-        {1,{1}},
-        {2,{2,1}},
-        {3,{3,1}},
-        {4,{4,2,1}},
-        {5,{5,3,1}},
-        {6,{6,4,2,1}},
-        {7,{7,5,3,1}},
-        {8,{8,6,4,1}},
-        {9,{9,7,5,1}},
-        {10,{10,8,5,1}},
-        {11,{11,9,5,1}},
-        {12,{12,10,5,1}},
-        {13,{13,11,5,1}},
-        {14,{14,12,5,1}},
-        {15,{15,13,5,1}},
-      };
-
-      const std::vector<int>& levels = mg_level_lookup.at(mesh->Nq - 1);
-      ins->pSolver->nLevels = levels.size();
-      ins->pSolver->levels = (int*) calloc(ins->pSolver->nLevels,sizeof(int));
-      for(int i = 0; i < ins->pSolver->nLevels; ++i)
-        ins->pSolver->levels[i] = levels.at(i);
-      ins->pOptions.setArgs("MULTIGRID COARSENING","CUSTOM");
-    }
-
-    ins->pSolver->options = ins->pOptions;
-    ellipticSolveSetup(ins->pSolver, kernelInfoP);
-
-    // setup boundary mapping
-    dfloat largeNumber = 1 << 20;
-    ins->VmapB = (int*) calloc(mesh->Nelements * mesh->Np,sizeof(int));
-    for (int e = 0; e < mesh->Nelements; e++)
-      for (int n = 0; n < mesh->Np; n++) ins->VmapB[n + e * mesh->Np] = largeNumber;
-
-    ins->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int));
-
-    int cnt = 0;
-    for (int e = 0; e < mesh->Nelements; e++)
-      for (int f = 0; f < mesh->Nfaces; f++) {
-        int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "velocity");
-        ins->EToB[cnt] = bc;
-        if (bc > 0) {
-          for (int n = 0; n < mesh->Nfp; n++) {
-            int fid = mesh->faceNodes[n + f * mesh->Nfp];
-            ins->VmapB[fid + e * mesh->Np] = mymin(bc,ins->VmapB[fid + e * mesh->Np]); // Dirichlet wins
-          }
-        }
-        cnt++;
-      }
-
-    ogsGatherScatter(ins->VmapB, ogsInt, ogsMin, mesh->ogs);
-    for (int n = 0; n < mesh->Nelements * mesh->Np; n++)
-      if (ins->VmapB[n] == largeNumber) ins->VmapB[n] = 0;
-
-    ins->o_EToB = mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),ins->EToB);
-    ins->o_VmapB = mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(int), ins->VmapB);
-  } // flow
-
-  // build mass + inverse mass matrix
-  dfloat* lumpedMassMatrix  = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
-  for(hlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Np; ++n)
-      lumpedMassMatrix[e * mesh->Np + n] = mesh->vgeo[e * mesh->Np * mesh->Nvgeo + JWID * mesh->Np + n];
-  mesh->o_LMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
-  mesh->o_LMM.copyTo(mesh->LMM);
-  ogsGatherScatter(lumpedMassMatrix, ogsDfloat, ogsAdd, mesh->ogs);
-  for(int n = 0; n < mesh->Np * mesh->Nelements; ++n)
-    lumpedMassMatrix[n] = 1. / lumpedMassMatrix[n];
-  mesh->o_invLMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
-  mesh->o_invLMM.copyTo(mesh->invLMM);
-  free(lumpedMassMatrix);
-
-  // build kernels
-  string fileName, kernelName;
-  const string suffix = "Hex3D";
-  const string oklpath = install_dir + "/okl/core/";
-
-  MPI_Barrier(mesh->comm);
-  double tStartLoadKernel = MPI_Wtime();
-  if(mesh->rank == 0)  printf("loading NS-solver kernels ... "); fflush(stdout);
-
-  for (int r = 0; r < 2; r++) {
-    if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
-      fileName = oklpath + "insAdvection" + suffix + ".okl";
-
-      kernelName = "insStrongAdvectionVolume" + suffix;
-      ins->advectionStrongVolumeKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-      kernelName = "insStrongAdvectionCubatureVolume" + suffix;
-      ins->advectionStrongCubatureVolumeKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insCurl" + suffix + ".okl";
-      kernelName = "insCurl" + suffix;
-      ins->curlKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insMassMatrix" + ".okl";
-      kernelName = "insMassMatrix" + suffix;
-      ins->massMatrixKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      kernelName = "insInvMassMatrix" + suffix;
-      ins->invMassMatrixKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insGradient" + suffix + ".okl";
-      kernelName = "insGradientVolume" + suffix;
-      ins->gradientVolumeKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      kernelName = "inswGradientVolume" + suffix;
-      ins->wgradientVolumeKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insSumMakef" + suffix + ".okl";
-      kernelName = "insSumMakef" + suffix;
-      ins->sumMakefKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insDivergence" + suffix + ".okl";
-      kernelName = "insDivergenceVolume" + suffix;
-      ins->divergenceVolumeKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
-
-      kernelName = "insDivergenceSurfaceTOMBO" + suffix;
-      ins->divergenceSurfaceKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
-
-      fileName = oklpath + "insPressureRhs" + suffix + ".okl";
-      kernelName = "insPressureRhsTOMBO" + suffix;
-      ins->pressureRhsKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insPressureStress" + suffix + ".okl";
-      kernelName = "insPressureStress" + suffix;
-      ins->pressureStressKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insPressureBC" + suffix + ".okl";
-      kernelName = "insPressureDirichletBC" + suffix;
-      ins->pressureDirichletBCKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
-
-      fileName = oklpath + "insPressureUpdate" + ".okl";
-      kernelName = "insPressureUpdate";
-      ins->pressureUpdateKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      fileName = oklpath + "insVelocityRhs" + suffix + ".okl";
-      kernelName = "insVelocityRhsTOMBO" + suffix;
-      ins->velocityRhsKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insVelocityBC" + suffix + ".okl";
-      kernelName = "insVelocityDirichletBC" + suffix;
-      ins->velocityDirichletBCKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
-
-      kernelName = "insVelocityNeumannBC" + suffix;
-      ins->velocityNeumannBCKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
-
-      fileName = oklpath + "insSubCycle" + suffix + ".okl";
-      kernelName = "insSubCycleStrongCubatureVolume" + suffix;
-      ins->subCycleStrongCubatureVolumeKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      kernelName = "insSubCycleStrongVolume" + suffix;
-      ins->subCycleStrongVolumeKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insSubCycleRKUpdate" + ".okl";
-      kernelName = "insSubCycleLSERKUpdate";
-      if(ins->SNrk == 4) kernelName = "insSubCycleERKUpdate";
-      ins->subCycleRKUpdateKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insExtrapolate" + ".okl";
-      kernelName = "insMultiExtrapolate";
-      ins->extrapolateKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      // ===========================================================================
-
-      fileName = install_dir + "/libparanumal/okl/scaledAdd.okl";
-      kernelName = "scaledAddwOffset";
-      ins->scaledAddKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = install_dir + "/libparanumal/okl/dotMultiply.okl";
-      kernelName = "dotMultiply";
-      ins->dotMultiplyKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "math" + ".okl";
-      kernelName = "fill";
-      ins->fillKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      kernelName = "max";
-      ins->maxKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      kernelName = "scalarScaledAdd";
-      ins->scalarScaledAddKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      kernelName = "maskCopy";
-      ins->maskCopyKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      // ===========================================================================
-
-      fileName = oklpath + "insFilterRT" + suffix + ".okl";
-      kernelName = "insFilterRT" + suffix;
-      ins->filterRTKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insCfl" + suffix + ".okl";
-      kernelName = "insCfl" + suffix;
-      ins->cflKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insQtl" + suffix + ".okl";
-      kernelName = "insQtl" + suffix;
-      ins->qtlKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "insPressureAddQtl" + ".okl";
-      kernelName = "insPressureAddQtl";
-      ins->pressureAddQtlKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = oklpath + "setEllipticCoeff.okl";
-      kernelName = "setEllipticCoeff";
-      ins->setEllipticCoeffKernel =
-        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      kernelName = "setEllipticCoeffPressure";
-      ins->setEllipticCoeffPressureKernel =
-        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      fileName = oklpath + "insPQ.okl";
-      kernelName = "insPQ";
-      ins->PQKernel =
-        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      fileName = oklpath + "insMueDiv.okl";
-      kernelName = "insMueDiv";
-      ins->mueDivKernel =
-        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-    }
-    MPI_Barrier(mesh->comm);
-  }
-
-  MPI_Barrier(mesh->comm);
-  if(mesh->rank == 0)  printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); fflush(stdout);
-
-  if(options.compareArgs("FILTER STABILIZATION", "RELAXATION"))
-    filterSetup(ins);
-
-  return ins;
-}
-
-cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &kernelInfoH)
-{
-  cds_t* cds = new cds_t();
-  cds->mesh = mesh;
-
-  if (mesh->rank == 0)
-    cout << "==================SCALAR SETUP===========================\n";
-
-  string install_dir;
-  install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
-
-  // set mesh, options
-  cds->meshV       = ins->mesh;
-  cds->elementType = ins->elementType;
-  cds->dim         = ins->dim;
-  cds->NVfields    = ins->NVfields;
-  cds->NSfields    = ins->Nscalar;
-
-  cds->extbdfA = ins->extbdfA;
-  cds->extbdfB = ins->extbdfB;
-  cds->extbdfC = ins->extbdfC;
-  cds->extC    = ins->extC;
-
-  cds->Nstages       = ins->Nstages;
-  cds->temporalOrder = ins->temporalOrder;
-
-  cds->o_usrwrk = &(ins->o_usrwrk);
-
-  dlong Nlocal = mesh->Np * mesh->Nelements;
-  dlong Ntotal = mesh->Np * (mesh->Nelements + mesh->totalHaloPairs);
-  cds->Nlocal  = Nlocal;
-  cds->Ntotal  = Ntotal;
-
-  cds->vFieldOffset = ins->fieldOffset;
-  cds->fieldOffset  = ins->fieldOffset;
-  cds->Nblock       = (Nlocal + blockSize - 1) / blockSize;
-
-  cds->o_wrk0 = ins->o_wrk0;
-  cds->o_wrk1 = ins->o_wrk1;
-  cds->o_wrk2 = ins->o_wrk2;
-  cds->o_wrk3 = ins->o_wrk3;
-  cds->o_wrk4 = ins->o_wrk4;
-  cds->o_wrk5 = ins->o_wrk5;
-  cds->o_wrk6 = ins->o_wrk6;
-
-  cds->gsh = ins->gsh;
-  
-  if(ins->cht) {
-    meshParallelGatherScatterSetup(mesh, cds->Nlocal, mesh->globalIds, mesh->comm, 0);
-    oogs_mode oogsMode = OOGS_AUTO; 
-    if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT;
-    cds->gshT = oogs::setup(mesh->ogs, 1, cds->fieldOffset, ogsDfloat, NULL, oogsMode);
-  } else {
-    cds->gshT = cds->gsh;
-  }
-
-  // Solution storage at interpolation nodes
-  cds->U     = ins->U; // Point to INS side Velocity
-  cds->S     =
-    (dfloat*) calloc(cds->NSfields * cds->Nstages * cds->fieldOffset,sizeof(dfloat));
-  cds->BF    = (dfloat*) calloc(cds->NSfields * cds->fieldOffset,sizeof(dfloat));
-  cds->FS    =
-    (dfloat*) calloc(cds->NSfields * cds->Nstages * cds->fieldOffset,sizeof(dfloat));
-
-  cds->Nsubsteps = ins->Nsubsteps;
-  if(cds->Nsubsteps) {
-    cds->SNrk   = ins->SNrk;
-    cds->Srka   = ins->Srka;
-    cds->Srkb   = ins->Srkb;
-    cds->Srkc   = ins->Srkc;
-    cds->o_Srka = ins->o_Srka;
-    cds->o_Srkb = ins->o_Srkb;
-  }
-
-  cds->startTime = ins->startTime;
-  cds->dt  = ins->dt;
-  cds->idt = 1.0 / cds->dt;
-  cds->sdt = ins->sdt;
-  cds->NtimeSteps = ins->NtimeSteps;
-
-  cds->prop = (dfloat*) calloc(cds->NSfields * 2 * cds->fieldOffset,sizeof(dfloat));
-  for(int is = 0; is < cds->NSfields; is++) {
-    std::stringstream ss;
-    ss << std::setfill('0') << std::setw(2) << is;
-    string sid = ss.str();
-
-    if(options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) continue;
-
-    dfloat diff = 1;
-    dfloat rho = 1;
-    options.getArgs("SCALAR" + sid + " DIFFUSIVITY", diff);
-    options.getArgs("SCALAR" + sid + " DENSITY", rho);
-
-    const dlong off = cds->NSfields * cds->fieldOffset;
-    for (int e = 0; e < mesh->Nelements; e++)
-      for (int n = 0; n < mesh->Np; n++) {
-        cds->prop[0 * off + is * cds->fieldOffset + e * mesh->Np + n] = diff;
-        cds->prop[1 * off + is * cds->fieldOffset + e * mesh->Np + n] = rho;
-      }
-  }
-  cds->o_prop =
-    mesh->device.malloc(cds->NSfields * 2 * cds->fieldOffset * sizeof(dfloat), cds->prop);
-  cds->o_diff = cds->o_prop.slice(0 * cds->NSfields * cds->fieldOffset * sizeof(dfloat));
-  cds->o_rho  = cds->o_prop.slice(1 * cds->NSfields * cds->fieldOffset * sizeof(dfloat));
-
-  cds->var_coeff = 1; // use always var coeff elliptic
-  cds->ellipticCoeff   = ins->ellipticCoeff;
-  cds->o_ellipticCoeff = ins->o_ellipticCoeff;
-
-  cds->o_U  = ins->o_U;
-  cds->o_Ue = ins->o_Ue;
-  cds->o_S  =
-    mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat), cds->S);
-  cds->o_Se =
-    mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat));
-  cds->o_BF = mesh->device.malloc(cds->NSfields * cds->fieldOffset * sizeof(dfloat), cds->BF);
-  cds->o_FS =
-    mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat),
-                        cds->FS);
-
-  cds->options = options;
-  cds->options.setArgs("KRYLOV SOLVER",        options.getArgs("SCALAR SOLVER"));
-  cds->options.setArgs("DISCRETIZATION",       options.getArgs("SCALAR DISCRETIZATION"));
-  cds->options.setArgs("BASIS",                options.getArgs("SCALAR BASIS"));
-
-  cds->options.setArgs("DEBUG ENABLE OGS", "1");
-  cds->options.setArgs("DEBUG ENABLE REDUCTIONS", "1");
-
-  cds->TOL = 1e-6;
-
-  for (int is = 0; is < cds->NSfields; is++) {
-    mesh_t* mesh;
-    (is) ? mesh = cds->meshV : mesh = cds->mesh; // only first scalar can be a CHT mesh
-
-    int nbrBIDs = bcMap::size(0);
-    if(ins->cht && is == 0) nbrBIDs = bcMap::size(1);
-    int* sBCType = (int*) calloc(nbrBIDs + 1, sizeof(int));
-
-    std::stringstream ss;
-    ss << std::setfill('0') << std::setw(2) << is;
-    string sid = ss.str();
-
-    cds->compute[is] = 1;
-    if (options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) {
-      cds->compute[is] = 0;
-      continue;
-    }
-
-    for (int bID = 1; bID <= nbrBIDs; bID++) {
-      string bcTypeText(bcMap::text(bID, "scalar" + sid));
-      if(mesh->rank == 0) printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str());
-      sBCType[bID] = bcMap::type(bID, "scalar" + sid);
-    }
-
-    cds->options.setArgs("PRECONDITIONER", options.getArgs("SCALAR" + sid + " PRECONDITIONER"));
-    cds->options.setArgs("SOLVER TOLERANCE",
-                         options.getArgs("SCALAR" + sid +  " SOLVER TOLERANCE"));
-    cds->options.setArgs("RESIDUAL PROJECTION",  options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION"));
-    cds->options.setArgs("RESIDUAL PROJECTION VECTORS",  options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION VECTORS"));
-    cds->options.setArgs("RESIDUAL PROJECTION START",  options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION START"));
-
-    cds->solver[is] = new elliptic_t();
-    cds->solver[is]->blockSolver = 0;
-    cds->solver[is]->Nfields = 1;
-    cds->solver[is]->Ntotal = ins->fieldOffset;
-    cds->solver[is]->wrk = scratch + ins->ellipticWrkOffset;
-    cds->solver[is]->o_wrk = o_scratch.slice(ins->ellipticWrkOffset * sizeof(dfloat));
-    cds->solver[is]->mesh = mesh;
-    cds->solver[is]->options = cds->options;
-    cds->solver[is]->dim = cds->dim;
-    cds->solver[is]->elementType = cds->elementType;
-    cds->solver[is]->BCType = (int*) calloc(nbrBIDs + 1,sizeof(int));
-    memcpy(cds->solver[is]->BCType,sBCType,(nbrBIDs + 1) * sizeof(int));
-
-    cds->solver[is]->var_coeff = cds->var_coeff;
-    for (int i = 0; i < 2 * ins->fieldOffset; i++) ins->ellipticCoeff[i] = 1;
-    cds->solver[is]->lambda = cds->ellipticCoeff;
-    cds->solver[is]->o_lambda = cds->o_ellipticCoeff;
-    cds->solver[is]->loffset = 0;
-    ellipticSolveSetup(cds->solver[is], kernelInfoH);
-
-    // setup boundary mapping
-    dfloat largeNumber = 1 << 20;
-    cds->mapB[is] = (int*) calloc(mesh->Nelements * mesh->Np,sizeof(int));
-    int* mapB = cds->mapB[is];
-    for (int e = 0; e < mesh->Nelements; e++)
-      for (int n = 0; n < mesh->Np; n++) mapB[n + e * mesh->Np] = largeNumber;
-
-    cds->EToB[is] = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int));
-    int* EToB = cds->EToB[is];
-
-    int cnt = 0;
-    for (int e = 0; e < mesh->Nelements; e++)
-      for (int f = 0; f < mesh->Nfaces; f++) {
-        int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "scalar" + sid);
-        EToB[cnt] = bc;
-        if (bc > 0) {
-          for (int n = 0; n < mesh->Nfp; n++) {
-            int fid = mesh->faceNodes[n + f * mesh->Nfp];
-            mapB[fid + e * mesh->Np] = mymin(bc,mapB[fid + e * mesh->Np]); // Dirichlet wins
-          }
-        }
-        cnt++;
-      }
-
-    ogsGatherScatter(mapB, ogsInt, ogsMin, mesh->ogs);
-
-    for (int n = 0; n < mesh->Nelements * mesh->Np; n++)
-      if (mapB[n] == largeNumber) mapB[n] = 0;
-
-    cds->o_EToB[is] = mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int), EToB);
-    cds->o_mapB[is] = mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(int), mapB);
-
-    free(sBCType);
-  }
-
-  // build mass + inverse mass matrix
-  dfloat* lumpedMassMatrix = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
-  for(hlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Np; ++n)
-      lumpedMassMatrix[e * mesh->Np +
-                       n] = mesh->vgeo[e * mesh->Np * mesh->Nvgeo + JWID * mesh->Np + n];
-  ogsGatherScatter(lumpedMassMatrix, ogsDfloat, ogsAdd, mesh->ogs);
-  mesh->o_LMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
-  mesh->o_LMM.copyTo(mesh->LMM);
-  for(int n = 0; n < mesh->Np * mesh->Nelements; ++n)
-    lumpedMassMatrix[n] = 1. / lumpedMassMatrix[n];
-  mesh->o_invLMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
-  mesh->o_invLMM.copyTo(mesh->invLMM);
-  free(lumpedMassMatrix);
-
-  // time stepper
-  dfloat rkC[4]  = {1.0, 0.0, -1.0, -2.0};
-  cds->o_rkC     = ins->o_rkC;
-  cds->o_extbdfA = ins->o_extbdfA;
-  cds->o_extbdfB = ins->o_extbdfB;
-  cds->o_extbdfC = ins->o_extbdfC;
-  cds->o_extC    = ins->o_extC;
-
-  // build kernels
-  occa::properties kernelInfo = *ins->kernelInfo;
-  occa::properties kernelInfoBC = kernelInfo;
-  //kernelInfo["defines/" "p_NSfields"]  = cds->NSfields;
-
-  const string bcDataFile = install_dir + "/include/insBcData.h";
-  kernelInfoBC["includes"] += bcDataFile.c_str();
-  string boundaryHeaderFileName;
-  options.getArgs("DATA FILE", boundaryHeaderFileName);
-  kernelInfoBC["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL);
-
-  string fileName, kernelName;
-  const string suffix = "Hex3D";
-  const string oklpath = install_dir + "/okl/core/";
-
-  for (int r = 0; r < 2; r++) {
-    if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
-      fileName = oklpath + "cdsAdvection" + suffix + ".okl";
-
-      kernelName = "cdsStrongAdvectionVolume" + suffix;
-      cds->advectionStrongVolumeKernel =
-        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      kernelName = "cdsStrongAdvectionCubatureVolume" + suffix;
-      cds->advectionStrongCubatureVolumeKernel =  mesh->device.buildKernel(fileName,
-                                                                           kernelName,
-                                                                           kernelInfo);
-
-      // ===========================================================================
-
-      fileName = oklpath + "math.okl";
-      kernelName = "fill";
-      cds->fillKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      kernelName = "maskCopy";
-      cds->maskCopyKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName   = oklpath + "cdsSumMakef" + suffix + ".okl";
-      kernelName = "cdsSumMakef" + suffix;
-      cds->sumMakefKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      fileName = oklpath + "cdsHelmholtzBC" + suffix + ".okl";
-      kernelName = "cdsHelmholtzBC" + suffix;
-      cds->helmholtzRhsBCKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfoBC);
-
-      kernelName = "cdsDirichletBC";
-      cds->dirichletBCKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfoBC);
-
-      fileName = oklpath + "setEllipticCoeff.okl";
-      kernelName = "setEllipticCoeff";
-      cds->setEllipticCoeffKernel =
-        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      fileName = oklpath + "cdsMassMatrix.okl";
-      kernelName = "cdsMassMatrix" + suffix;
-      cds->massMatrixKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      kernelName = "cdsInvMassMatrix" + suffix;
-      cds->invMassMatrixKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-      fileName = oklpath + "cdsFilterRT" + suffix + ".okl";
-      kernelName = "cdsFilterRT" + suffix;
-      cds->filterRTKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      fileName = install_dir + "/libparanumal/okl/scaledAdd.okl";
-      kernelName = "scaledAddwOffset";
-      cds->scaledAddKernel =
-        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
-
-      if(cds->Nsubsteps) {
-        fileName = oklpath + "cdsSubCycle" + suffix + ".okl";
-        kernelName = "cdsSubCycleStrongCubatureVolume" + suffix;
-        cds->subCycleStrongCubatureVolumeKernel =  mesh->device.buildKernel(fileName,
-                                                                            kernelName,
-                                                                            kernelInfo);
-
-        kernelName = "cdsSubCycleStrongVolume" + suffix;
-        cds->subCycleStrongVolumeKernel =
-          mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-
-        fileName = oklpath + "cdsSubCycleRKUpdate.okl";
-        kernelName = "cdsSubCycleLSERKUpdate";
-        if(cds->SNrk == 4) kernelName = "cdsSubCycleERKUpdate";
-        cds->subCycleRKUpdateKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
-      }
-    }
-    MPI_Barrier(mesh->comm);
-  }
-
-  return cds;
-}
diff --git a/src/core/libParanumal.hpp b/src/core/libParanumal.hpp
deleted file mode 100644
index b837bfb93..000000000
--- a/src/core/libParanumal.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-#if !defined(nekrs_libparanumal_hpp_)
-#define nekrs_libparanumal_hpp_
-
-#include "mesh.h"
-#include "mesh3D.h"
-
-namespace libParanumal
-{
-// Data structures
-using ::mesh_t;
-using ::mesh3D;
-using ::setupAide;
-
-// mesh
-using ::meshParallelConnect;
-using ::meshHaloSetup;
-using ::meshConnectBoundary;
-using ::meshLoadReferenceNodesHex3D;
-using ::meshConnectFaceNodes3D;
-using ::meshParallelConnectNodes;
-using ::meshSurfaceGeometricFactorsHex3D;
-using ::meshGeometricFactorsHex3D;
-}
-
-#endif
diff --git a/src/core/ins.h b/src/core/nrs.hpp
similarity index 81%
rename from src/core/ins.h
rename to src/core/nrs.hpp
index 32c8b84ba..b50ea1ee3 100644
--- a/src/core/ins.h
+++ b/src/core/nrs.hpp
@@ -1,11 +1,20 @@
-#if !defined(nekrs_ins_hpp_)
-#define nekrs_ins_hpp_
-
-#include "mesh2D.h"
+#if !defined(nekrs_nekrs_hpp_)
+#define nekrs_nekrs_hpp_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "nrssys.hpp"
 #include "mesh3D.h"
 #include "elliptic.h"
-#include "cds.h"
+#include "cds.hpp"
 #include "linAlg.hpp"
+#include "timer.hpp"
 
 typedef struct
 {
@@ -41,25 +50,20 @@ typedef struct
 
   int Nblock;
 
-  dfloat dt, idt;
-  dfloat time;
+  dfloat dt[3], idt;
   int tstep;
+  int lastStep;
   dfloat g0, ig0;
-  dfloat startTime;
-  dfloat finalTime;
 
   int cht;
 
   int temporalOrder;
   int ExplicitOrder;
-  int NtimeSteps;    // number of time steps
   int Nstages;
-  int outputStep;
   int isOutputStep;
   int outputForceStep;
 
   int NiterU, NiterV, NiterW, NiterP;
-  dfloat presTOL, velTOL;
 
   dfloat* U, * P;
   dfloat* BF, * FU;
@@ -101,8 +105,6 @@ typedef struct
 
   occa::memory o_idH; // i.e. inverse of 1D Gll Spacing for quad and Hex
 
-  int readRestartFile,writeRestartFile, restartedFromFile;
-
   int filterNc; // filter cut modes i.e. below is not touched
   dfloat* filterM, filterS;
   occa::memory o_filterMT; // transpose of filter matrix
@@ -198,6 +200,34 @@ typedef struct
   occa::memory o_EToB;
 
   occa::properties* kernelInfo;
-}ins_t;
+} nrs_t;
+
+
+#include "io.hpp"
+
+occa::device occaDeviceConfig(setupAide &options, MPI_Comm comm);
+
+// std::to_string might be not accurate enough
+static string to_string_f(double a)
+{
+  stringstream s;
+  s << std::scientific << a;
+  return s.str();
+}
+
+static std::vector<std::string> serializeString(const std::string sin)
+{
+  std::vector<std::string> slist;
+  string s(sin);
+  s.erase(std::remove_if(s.begin(), s.end(), ::isspace), s.end());
+  std::stringstream ss;
+  ss.str(s);
+  while( ss.good() ) {
+    std::string substr;
+    std::getline(ss, substr, ',');
+    slist.push_back(substr);
+  }
+  return slist;
+}
 
 #endif
diff --git a/src/core/nrssys.hpp b/src/core/nrssys.hpp
new file mode 100644
index 000000000..121e11ea8
--- /dev/null
+++ b/src/core/nrssys.hpp
@@ -0,0 +1,71 @@
+#if !defined(nekrs_nrssys_hpp_)
+#define nekrs_nrssys_hpp_
+
+#define BLOCKSIZE 256
+
+//float data type
+#if 0
+#define DFLOAT_SINGLE
+#define dfloat float
+#define MPI_DFLOAT MPI_FLOAT
+#define dfloatFormat "%f"
+#define dfloatString "float"
+#else
+#define DFLOAT_DOUBLE
+#define dfloat double
+#define MPI_DFLOAT MPI_DOUBLE
+#define dfloatFormat "%lf"
+#define dfloatString "double"
+#endif
+
+//smoother float data type
+#if 1
+#define pfloat float
+#define MPI_PFLOAT MPI_FLOAT
+#define pfloatFormat "%f"
+#define pfloatString "float"
+#else
+#define pfloat double
+#define MPI_PFLOAT MPI_DOUBLE
+#define pfloatFormat "%lf"
+#define pfloatString "double"
+#endif
+
+//host index data type
+#if 0
+#define hlong int
+#define MPI_HLONG MPI_INT
+#define hlongFormat "%d"
+#define hlongString "int"
+#else
+#define hlong long long int
+#define MPI_HLONG MPI_LONG_LONG_INT
+#define hlongFormat "%lld"
+#define hlongString "long long int"
+#endif
+
+//device index data type
+#if 1
+#define dlong int
+#define MPI_DLONG MPI_INT
+#define dlongFormat "%d"
+#define dlongString "int"
+#else
+#define dlong long long int
+#define MPI_DLONG MPI_LONG_LONG_INT
+#define dlongFormat "%lld"
+#define dlongString "long long int"
+#endif
+
+#include <mpi.h>
+#include "occa.hpp"
+#include "ogs.hpp"
+#include "setupAide.hpp"
+
+#define NEKRS_VERSION "20"
+#define NEKRS_SUBVERSION "1"
+
+#define EXIT(a)  { fflush(stdout); MPI_Finalize(); exit(a); }
+#define ABORT(a) { fflush(stdout); MPI_Abort(MPI_COMM_WORLD,a); }
+
+#endif
diff --git a/src/core/parReader.cpp b/src/core/parReader.cpp
index b03e73773..dad1ac845 100644
--- a/src/core/parReader.cpp
+++ b/src/core/parReader.cpp
@@ -18,7 +18,7 @@
 #define LOWER(a)  { transform(a.begin(), a.end(), a.begin(), std::ptr_fun<int, int>(std::tolower)); \
 }
 
-void setDefaultSettings(libParanumal::setupAide &options, string casename, int rank)
+void setDefaultSettings(setupAide &options, string casename, int rank)
 {
   options.setArgs("FORMAT", string("1.0"));
 
@@ -41,21 +41,22 @@ void setDefaultSettings(libParanumal::setupAide &options, string casename, int r
   options.setArgs("DEVICE NUMBER", "LOCAL-RANK");
   options.setArgs("PLATFORM NUMBER", "0");
   options.setArgs("VERBOSE", "FALSE");
+
+  options.setArgs("ADVECTION", "TRUE");
   options.setArgs("ADVECTION TYPE", "CUBATURE+CONVECTIVE");
+
   options.setArgs("RESTART FROM FILE", "0");
-  options.setArgs("TSTEPS FOR SOLUTION OUTPUT", "0");
+  options.setArgs("SOLUTION OUTPUT INTERVAL", "0");
+  options.setArgs("SOLUTION OUTPUT CONTROL", "STEPS");
   options.setArgs("FILTER STABILIZATION", "NONE");
 
   options.setArgs("START TIME", "0.0");
 
-  options.setArgs("VELOCITY BLOCK SOLVER", "FALSE");
+  options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
   options.setArgs("VELOCITY KRYLOV SOLVER", "PCG");
   options.setArgs("VELOCITY BASIS", "NODAL");
   options.setArgs("VELOCITY PRECONDITIONER", "JACOBI");
   options.setArgs("VELOCITY DISCRETIZATION", "CONTINUOUS");
-  options.setArgs("VELOCITY RESIDUAL PROJECTION", "FALSE");
-  options.setArgs("VELOCITY RESIDUAL PROJECTION VECTORS", "8");
-  options.setArgs("VELOCITY RESIDUAL PROJECTION START", "5");
 
   options.setArgs("STRESSFORMULATION", "FALSE");
 
@@ -83,20 +84,24 @@ void setDefaultSettings(libParanumal::setupAide &options, string casename, int r
   options.setArgs("BOOMERAMG ITERATIONS", "2");
   options.setArgs("PRESSURE MULTIGRID CHEBYSHEV DEGREE", "2");
 #endif
+
   options.setArgs("PRESSURE RESIDUAL PROJECTION", "TRUE");
   options.setArgs("PRESSURE RESIDUAL PROJECTION VECTORS", "8");
   options.setArgs("PRESSURE RESIDUAL PROJECTION START", "5");
 
-  options.setArgs("PRESSURE PARALMOND CHEBYSHEV DEGREE", "2");
-  options.setArgs("PRESSURE PARALMOND SMOOTHER", "CHEBYSHEV");
-  options.setArgs("PRESSURE PARALMOND PARTITION", "STRONGNODES");
-  options.setArgs("PRESSURE PARALMOND AGGREGATION STRATEGY", "DEFAULT");
-  options.setArgs("PRESSURE PARALMOND LPSCN ORDERING", "MAX");
+  options.setArgs("SCALAR INITIAL GUESS DEFAULT","EXTRAPOLATION");
+  options.setArgs("VELOCITY INITIAL GUESS DEFAULT","EXTRAPOLATION");
+
+  //options.setArgs("PRESSURE PARALMOND CHEBYSHEV DEGREE", "2");
+  //options.setArgs("PRESSURE PARALMOND SMOOTHER", "CHEBYSHEV");
+  //options.setArgs("PRESSURE PARALMOND PARTITION", "STRONGNODES");
+  //options.setArgs("PRESSURE PARALMOND AGGREGATION STRATEGY", "DEFAULT");
+  //options.setArgs("PRESSURE PARALMOND LPSCN ORDERING", "MAX");
   options.setArgs("PARALMOND SMOOTH COARSEST", "FALSE");
   options.setArgs("ENABLE FLOATCOMMHALF GS SUPPORT", "FALSE");
 }
 
-libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
+setupAide parRead(std::string &setupFile, MPI_Comm comm)
 {
   int rank;
   MPI_Comm_rank(comm, &rank);
@@ -107,7 +112,7 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
     ABORT(1);
   }
 
-  libParanumal::setupAide options;
+  setupAide options;
 
   string casename = setupFile.substr(0, setupFile.find(".par"));
   setDefaultSettings(options, casename, rank);
@@ -167,10 +172,12 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
   }
 
   int N;
-  if(ini.extract("general", "polynomialorder", N))
+  if(ini.extract("general", "polynomialorder", N)) {
     options.setArgs("POLYNOMIAL DEGREE", std::to_string(N));
-  else
+    if(N>9) exit("polynomialOrder > 9 is currently not supported!", EXIT_FAILURE);
+  } else {
     exit("Cannot find mandatory parameter GENERAL::polynomialOrder!", EXIT_FAILURE);
+  }
 
   int cubN = round(3./2 * (N+1) - 1) - 1;
   ini.extract("general", "cubaturepolynomialorder", cubN);
@@ -186,7 +193,6 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
   ini.extract("general", "timestepper", timeStepper);
   if(timeStepper == "bdf3" || timeStepper == "tombo3") {
     options.setArgs("TIME INTEGRATOR", "TOMBO3");
-    //exit("No support for bdf3!", EXIT_FAILURE);
   }
   if(timeStepper == "bdf2" || timeStepper == "tombo2")
     options.setArgs("TIME INTEGRATOR", "TOMBO2");
@@ -198,19 +204,26 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
   if(variableDt) exit("GENERAL::variableDt = Yes not supported!", EXIT_FAILURE);
 
   double endTime;
-  string stopAt;
+  string stopAt = "numsteps"; // default stop criterion (presumably kept when GENERAL::stopAt is absent)
   ini.extract("general", "stopat", stopAt);
-  if(stopAt != "endtime") {
+  if(stopAt == "numsteps") { // stop after a fixed number of time steps
     int numSteps;
     if(ini.extract("general", "numsteps", numSteps)) {
       options.setArgs("NUMBER TIMESTEPS", std::to_string(numSteps));
-      endTime = numSteps * dt;
+      endTime = -1; // step-count mode: no "END TIME" option is written
     } else {
       exit("Cannot find mandatory parameter GENERAL::numSteps!", EXIT_FAILURE);
     }
-  } else if(!ini.extract("general", "endtime", endTime))
-    exit("Cannot find mandatory parameter GENERAL::endTime!", EXIT_FAILURE);
-  options.setArgs("FINAL TIME", to_string_f(endTime));
+  } else if(stopAt == "endtime") { // stop at GENERAL::endTime
+    if(!ini.extract("general", "endtime", endTime))
+      exit("Cannot find mandatory parameter GENERAL::endTime!", EXIT_FAILURE);
+    options.setArgs("END TIME", to_string_f(endTime));
+  } else if(stopAt == "elapsedtime") { // stop at GENERAL::elapsedTime
+    double elapsedTime;
+    if(!ini.extract("general", "elapsedtime", elapsedTime))
+      exit("Cannot find mandatory parameter GENERAL::elapsedTime!", EXIT_FAILURE);
+    options.setArgs("STOP AT ELAPSED TIME", to_string_f(elapsedTime));
+  } else exit("Invalid value for GENERAL::stopAt (expected numSteps, endTime or elapsedTime)!", EXIT_FAILURE); // previously fell through with no stop condition set
 
   string extrapolation;
   ini.extract("general", "extrapolation", extrapolation);
@@ -231,15 +244,14 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
 
   double writeInterval = 0;
   ini.extract("general", "writeinterval", writeInterval);
+  options.setArgs("SOLUTION OUTPUT INTERVAL", std::to_string(writeInterval)); // NOTE(review): std::to_string(double) prints fixed "%f" (6 decimals), so e.g. 1e-7 is stored as "0.000000"
 
-  int writeSteps = writeInterval;
   string writeControl;
-  if(ini.extract("general", "writecontrol", writeControl))
-    if(writeControl == "runtime") {
-      writeSteps = writeInterval / dt;
-      if((writeInterval - writeSteps*dt) / writeInterval > 1e-6*dt) writeSteps++;
-    }
-  options.setArgs("TSTEPS FOR SOLUTION OUTPUT", std::to_string(writeSteps));
+  if(ini.extract("general", "writecontrol", writeControl)) {
+    options.setArgs("SOLUTION OUTPUT CONTROL", "STEPS"); // any value other than "runtime" selects step-based output
+    if(writeControl == "runtime") 
+      options.setArgs("SOLUTION OUTPUT CONTROL", "RUNTIME"); // the interval is stored unconverted; it is no longer turned into a step count here
+  }
 
   bool dealiasing;
   if(ini.extract("general", "dealiasing", dealiasing))
@@ -284,6 +296,12 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
   if(ini.extract("problemtype", "stressformulation", stressFormulation))
     if(stressFormulation) options.setArgs("STRESSFORMULATION", "TRUE");
 
+  bool stokesFlow;
+  if(ini.extract("problemtype", "stokes", stokesFlow)) {
+    options.setArgs("ADVECTION", "TRUE");
+    if(stokesFlow) options.setArgs("ADVECTION", "FALSE");
+  }
+
   int bcInPar = 1;
   if(ini.sections.count("velocity")) {
     // PRESSURE
@@ -463,10 +481,12 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
     bool v_rproj;
     if(ini.extract("velocity", "residualproj", v_rproj) ||
        ini.extract("velocity", "residualprojection", v_rproj)) {
-      if(v_rproj)
+      if(v_rproj) {
         options.setArgs("VELOCITY RESIDUAL PROJECTION", "TRUE");
-      else
+        options.setArgs("VELOCITY INITIAL GUESS DEFAULT","PREVIOUS STEP");
+      } else {
         options.setArgs("VELOCITY RESIDUAL PROJECTION", "FALSE");
+      }
 
       int v_nProjVec;
       if(ini.extract("velocity", "residualprojectionvectors", v_nProjVec))
@@ -479,10 +499,13 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
     if(vsolver == "none") {
       options.setArgs("VELOCITY SOLVER", "NONE");
       flow = 0;
-    } else if(std::strstr(vsolver.c_str(), "block")) {
-      options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
-      if(options.compareArgs("VELOCITY RESIDUAL PROJECTION","TRUE"))
-        exit("Residual projection is not enabled for the velocity block solver!", EXIT_FAILURE);
+    } else {
+      // An explicit solver name overrides the block-solver default; an empty name keeps the default.
+      if(!vsolver.empty())
+        options.setArgs("VELOCITY BLOCK SOLVER", std::strstr(vsolver.c_str(), "block") ? "TRUE" : "FALSE");
+      // Validate the effective setting so the (now TRUE) default is checked too, not only an explicit "block" name.
+      if(options.compareArgs("VELOCITY BLOCK SOLVER","TRUE") && options.compareArgs("VELOCITY RESIDUAL PROJECTION","TRUE"))
+        exit("Residual projection is not enabled for the velocity block solver!", EXIT_FAILURE);
     }
 
     double v_residualTol;
@@ -529,17 +552,19 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
     if(solver == "none") {
       options.setArgs("SCALAR00 SOLVER", "NONE");
     } else {
+      options.setArgs("TEMPERATURE", "TRUE"); // reached only when the temperature solver is not "none"
+      options.setArgs("SCALAR INITIAL GUESS DEFAULT","PREVIOUS STEP"); // NOTE(review): unconditional here, whereas the generic scalar path sets this only when residual projection is enabled -- confirm intended
       options.setArgs("SCALAR00 PRECONDITIONER", "JACOBI");
-      options.setArgs("SCALAR00 RESIDUAL PROJECTION", "FALSE");
-      options.setArgs("SCALAR00 RESIDUAL PROJECTION VECTORS", "8");
-      options.setArgs("SCALAR00 RESIDUAL PROJECTION START", "5");
       bool t_rproj;
       if(ini.extract("temperature", "residualproj", t_rproj) || 
          ini.extract("temperature", "residualprojection", t_rproj)) {
-        if(t_rproj)
+        if(t_rproj) {
           options.setArgs("SCALAR00 RESIDUAL PROJECTION", "TRUE");
-        else
+          options.setArgs("SCALAR00 RESIDUAL PROJECTION VECTORS", "8");
+          options.setArgs("SCALAR00 RESIDUAL PROJECTION START", "5");
+        } else {
           options.setArgs("SCALAR00 RESIDUAL PROJECTION", "FALSE");
+        }
 
         int t_nProjVec;
         if(ini.extract("temperature", "residualprojectionvectors", t_nProjVec))
@@ -605,16 +630,17 @@ libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm)
       options.setArgs("SCALAR" + sid + " SOLVER", "NONE");
       continue;
     }
-    options.setArgs("SCALAR" + sid + " RESIDUAL PROJECTION", "FALSE");
-    options.setArgs("SCALAR" + sid + " RESIDUAL PROJECTION VECTORS", "8");
-    options.setArgs("SCALAR" + sid + " RESIDUAL PROJECTION START", "5");
     bool t_rproj;
     if(ini.extract("scalar" + sidPar, "residualproj", t_rproj) || 
        ini.extract("scalar" + sidPar, "residualprojection", t_rproj)) {
-      if(t_rproj)
+      if(t_rproj) {
         options.setArgs("SCALAR" + sid + " RESIDUAL PROJECTION", "TRUE");
-      else
+        options.setArgs("SCALAR INITIAL GUESS DEFAULT","PREVIOUS STEP");
+        options.setArgs("SCALAR" + sid + " RESIDUAL PROJECTION VECTORS", "8");
+        options.setArgs("SCALAR" + sid + " RESIDUAL PROJECTION START", "5");
+      } else {
         options.setArgs("SCALAR" + sid + " RESIDUAL PROJECTION", "FALSE");
+      }
 
       int t_nProjVec;
       if(ini.extract("scalar" + sidPar, "residualprojectionvectors", t_nProjVec))
diff --git a/src/core/parReader.hpp b/src/core/parReader.hpp
index a52219dd7..4c5422664 100644
--- a/src/core/parReader.hpp
+++ b/src/core/parReader.hpp
@@ -1,7 +1,7 @@
 #if !defined(nekrs_parreader_hpp_)
 #define nekrs_parreader_hpp_
 
-#include "nekrs.hpp"
-libParanumal::setupAide parRead(std::string &setupFile, MPI_Comm comm);
+#include "nrs.hpp"
+setupAide parRead(std::string &setupFile, MPI_Comm comm);
 
 #endif
diff --git a/src/core/runTime.cpp b/src/core/runTime.cpp
deleted file mode 100644
index d0f0b5504..000000000
--- a/src/core/runTime.cpp
+++ /dev/null
@@ -1,722 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "nrs.hpp"
-#include "nekInterfaceAdapter.hpp"
-#include "udf.hpp"
-#include "tombo.hpp"
-#include "cfl.hpp"
-
-void extbdfCoefficents(ins_t* ins, int order);
-
-void makef(ins_t* ins, dfloat time, occa::memory o_FU, occa::memory o_BF);
-occa::memory velocityStrongSubCycle(ins_t* ins, dfloat time,
-                                    occa::memory o_U);
-void fluidSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_U);
-
-void makeq(ins_t* ins, dfloat time, occa::memory o_FS, occa::memory o_BF);
-occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is,
-                                  occa::memory o_U, occa::memory o_S);
-void scalarSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_S);
-
-void qthermal(ins_t* ins, dfloat time, occa::memory o_div);
-
-double tElapsed = 0;
-
-void runStep(ins_t* ins, dfloat time, dfloat dt, int tstep)
-{
-  mesh_t* mesh = ins->mesh;
-  cds_t* cds = ins->cds;
-
-  mesh->device.finish();
-  MPI_Barrier(mesh->comm);
-  double tStart = MPI_Wtime();
-
-  ins->dt = dt;
-  if(tstep <= 1)
-    extbdfCoefficents(ins,tstep);
-  else if(tstep <= 2 && ins->temporalOrder >= 2)
-    extbdfCoefficents(ins,tstep);
-  else if(tstep <= 3 && ins->temporalOrder >= 3)
-    extbdfCoefficents(ins,tstep);
-
-  // extrapolate
-  if(ins->flow) 
-    ins->extrapolateKernel(mesh->Nelements,
-                           ins->NVfields,
-                           ins->ExplicitOrder,
-                           ins->fieldOffset,
-                           ins->o_extbdfA,
-                           ins->o_U,
-                           ins->o_Ue);
-  if(ins->Nscalar) 
-    ins->extrapolateKernel(mesh->Nelements,
-                           cds->NSfields,
-                           cds->ExplicitOrder,
-                           cds->fieldOffset,
-                           cds->o_extbdfA,
-                           cds->o_S,
-                           cds->o_Se);
-
-  if(ins->Nscalar)
-    scalarSolve(ins, time, dt, cds->o_S);
-
-  if(udf.properties) {
-    timer::tic("udfProperties", 1);
-    occa::memory o_S = ins->o_wrk0;
-    occa::memory o_SProp = ins->o_wrk0;
-    if(ins->Nscalar) {
-      o_S = cds->o_S;
-      o_SProp = cds->o_prop;
-    }
-    udf.properties(ins, time + dt, ins->o_U, o_S, ins->o_prop, o_SProp);
-    timer::toc("udfProperties");
-  }
-
-  if(udf.div) udf.div(ins, time + dt, ins->o_div);
-  //ins->fillKernel(ins->fieldOffset, 0.0, ins->o_div);
-
-  if(ins->flow) fluidSolve(ins, time, dt, ins->o_U); 
-
-  const dfloat cfl = computeCFL(ins, time + dt, tstep);
-
-  mesh->device.finish();
-  MPI_Barrier(mesh->comm);
-  const double tElapsedStep = MPI_Wtime() - tStart;
-  tElapsed += tElapsedStep;
-  timer::set("solve", tElapsed);
-  if(mesh->rank == 0) {
-    printf("step= %d  t= %.8e  dt=%.1e  C= %.2f",
-           tstep, time + dt, dt, cfl);
-
-    if(ins->flow) {
-      if(ins->uvwSolver)
-        printf("  UVW: %d  P: %d", ins->NiterU, ins->NiterP);
-      else
-        printf("  U: %d  V: %d  W: %d  P: %d", ins->NiterU, ins->NiterV, ins->NiterW, ins->NiterP);
-    }
-
-    for(int is = 0; is < ins->Nscalar; is++)
-      if(cds->compute[is]) printf("  S: %d", cds->Niter[is]);
-
-    printf("  eTime= %.2e, %.5e s\n", tElapsedStep, tElapsed);
-  }
-
-  if(cfl > 30 || std::isnan(cfl)) {
-    if(mesh->rank == 0) cout << "Unreasonable CFL! Dying ...\n" << endl;
-    ABORT(1);
-  }
-
-  if(tstep % 10 == 0) fflush(stdout);
-}
-
-void extbdfCoefficents(ins_t* ins, int order)
-{
-  if(order == 1) {
-    ins->g0 =  1.0;
-    dfloat extbdfB[] = {1.0, 0.0, 0.0};
-    dfloat extbdfA[] = {1.0, 0.0, 0.0};
-    memcpy(ins->extbdfB, extbdfB, 3 * sizeof(dfloat));
-    memcpy(ins->extbdfA, extbdfA, 3 * sizeof(dfloat));
-    ins->ExplicitOrder = 1;
-  } else if(order == 2) {
-    ins->g0 =  1.5;
-    dfloat extbdfB[] = {2.0,-0.5, 0.0};
-    dfloat extbdfA[] = {2.0,-1.0, 0.0};
-    memcpy(ins->extbdfB, extbdfB, 3 * sizeof(dfloat));
-    memcpy(ins->extbdfA, extbdfA, 3 * sizeof(dfloat));
-    ins->ExplicitOrder = 2;
-  } else if(order == 3) {
-    ins->g0 =  11./6.;
-    dfloat extbdfB[] = {3.0,-1.5, 1.0/3.0};
-    dfloat extbdfA[] = {3.0,-3.0, 1.0};
-    memcpy(ins->extbdfB, extbdfB, 3 * sizeof(dfloat));
-    memcpy(ins->extbdfA, extbdfA, 3 * sizeof(dfloat));
-    ins->ExplicitOrder = 3;
-  }
-
-  ins->o_extbdfB.copyFrom(ins->extbdfB); // bdf
-  ins->o_extbdfA.copyFrom(ins->extbdfA); // ext
-
-  ins->ig0 = 1.0 / ins->g0;
-
-  if (ins->Nscalar) {
-    ins->cds->ExplicitOrder = ins->ExplicitOrder;
-    ins->cds->g0 = ins->g0;
-    ins->cds->ig0 = ins->ig0;
-  }
-}
-
-void makeq(ins_t* ins, dfloat time, occa::memory o_FS, occa::memory o_BF)
-{
-  cds_t* cds   = ins->cds;
-  mesh_t* mesh = cds->mesh;
-
-  if(udf.sEqnSource) {
-    timer::tic("udfSEqnSource", 1);
-    udf.sEqnSource(ins, time, cds->o_S, o_FS);
-    timer::toc("udfSEqnSource");
-  }
-
-  for(int is = 0; is < cds->NSfields; is++) {
-    if(!cds->compute[is]) continue;
-
-    mesh_t* mesh;
-    (is) ? mesh = cds->meshV : mesh = cds->mesh;
-    const dlong isOffset = is * cds->fieldOffset;
-    occa::memory o_adv = cds->o_wrk0;
-
-    if(cds->options.compareArgs("FILTER STABILIZATION", "RELAXATION"))
-      cds->filterRTKernel(
-        cds->meshV->Nelements,
-        ins->o_filterMT,
-        ins->filterS,
-        isOffset,
-        cds->o_rho,
-        cds->o_S,
-        o_FS);
-
-    if(cds->Nsubsteps) {
-      o_adv = scalarStrongSubCycle(cds, time, is, cds->o_U, cds->o_S);
-    } else {
-      if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
-        cds->advectionStrongCubatureVolumeKernel(
-          cds->meshV->Nelements,
-          mesh->o_vgeo,
-          mesh->o_cubvgeo,
-          mesh->o_cubDiffInterpT,
-          mesh->o_cubInterpT,
-          mesh->o_cubProjectT,
-          cds->vFieldOffset,
-          isOffset,
-          cds->o_U,
-          cds->o_S,
-          cds->o_rho,
-          cds->o_wrk0);
-      else
-        cds->advectionStrongVolumeKernel(
-          cds->meshV->Nelements,
-          mesh->o_vgeo,
-          mesh->o_Dmatrices,
-          cds->vFieldOffset,
-          isOffset,
-          cds->o_U,
-          cds->o_S,
-          cds->o_rho,
-          cds->o_wrk0);
-
-      ins->scaledAddKernel(
-        cds->meshV->Nelements * cds->meshV->Np,
-        -1.0,
-        0 * cds->fieldOffset,
-        cds->o_wrk0,
-        1.0,
-        isOffset,
-        o_FS);
-    }
-
-    cds->sumMakefKernel(
-      mesh->Nelements,
-      mesh->o_vgeo,
-      cds->idt,
-      cds->o_extbdfA,
-      cds->o_extbdfB,
-      cds->fieldOffset * cds->NSfields,
-      isOffset,
-      cds->o_S,
-      o_adv,
-      o_FS,
-      cds->o_rho,
-      o_BF);
-  }
-}
-
-void scalarSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_S)
-{
-  cds_t* cds   = ins->cds;
-
-  timer::tic("makeq", 1);
-  cds->fillKernel(cds->fieldOffset * cds->NSfields, 0.0, cds->o_FS);
-  makeq(ins, time, cds->o_FS, cds->o_BF);
-  timer::toc("makeq");
-
-  for (int s = cds->Nstages; s > 1; s--) {
-    const dlong Nbyte = cds->fieldOffset * cds->NSfields * sizeof(dfloat);
-    cds->o_FS.copyFrom(cds->o_FS, Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte);
-    cds->o_S.copyFrom (cds->o_S , Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte);
-  }
-
-  timer::tic("scalarSolve", 1);
-  for (int is = 0; is < cds->NSfields; is++) {
-    if(!cds->compute[is]) continue;
-
-    mesh_t* mesh;
-    (is) ? mesh = cds->meshV : mesh = cds->mesh;
-
-    cds->setEllipticCoeffKernel(
-      cds->Nlocal,
-      cds->g0 * cds->idt,
-      is * cds->fieldOffset,
-      cds->fieldOffset,
-      cds->o_diff,
-      cds->o_rho,
-      cds->o_ellipticCoeff);
-
-    if(cds->o_BFDiag.ptr())
-      cds->scaledAddKernel(
-        cds->Nlocal,
-        1.0,
-        is * cds->fieldOffset,
-        cds->o_BFDiag,
-        1.0,
-        cds->fieldOffset,
-        cds->o_ellipticCoeff);
-
-    occa::memory o_Snew = cdsSolve(is, cds, time + dt);
-    o_Snew.copyTo(o_S, cds->Ntotal * sizeof(dfloat), is * cds->fieldOffset * sizeof(dfloat));
-  }
-  timer::toc("scalarSolve");
-}
-
-void makef(ins_t* ins, dfloat time, occa::memory o_FU, occa::memory o_BF)
-{
-  mesh_t* mesh = ins->mesh;
-
-  if(udf.uEqnSource) {
-    timer::tic("udfUEqnSource", 1);
-    udf.uEqnSource(ins, time, ins->o_U, o_FU);
-    timer::toc("udfUEqnSource");
-  }
-
-  if(ins->options.compareArgs("FILTER STABILIZATION", "RELAXATION"))
-    ins->filterRTKernel(
-      mesh->Nelements,
-      ins->o_filterMT,
-      ins->filterS,
-      ins->fieldOffset,
-      ins->o_U,
-      o_FU);
-
-  occa::memory o_adv = ins->o_wrk0;
-  if(ins->Nsubsteps) {
-    o_adv = velocityStrongSubCycle(ins, time, ins->o_U);
-  } else {
-    if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
-      ins->advectionStrongCubatureVolumeKernel(
-        mesh->Nelements,
-        mesh->o_vgeo,
-        mesh->o_cubvgeo,
-        mesh->o_cubDiffInterpT,
-        mesh->o_cubInterpT,
-        mesh->o_cubProjectT,
-        ins->fieldOffset,
-        ins->o_U,
-        ins->o_wrk0);
-    else
-      ins->advectionStrongVolumeKernel(
-        mesh->Nelements,
-        mesh->o_vgeo,
-        mesh->o_Dmatrices,
-        ins->fieldOffset,
-        ins->o_U,
-        ins->o_wrk0);
-
-    ins->scaledAddKernel(
-      ins->NVfields * ins->fieldOffset,
-      -1.0,
-      0,
-      ins->o_wrk0,
-      1.0,
-      0,
-      o_FU);
-  }
-
-  ins->sumMakefKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    ins->idt,
-    ins->o_extbdfA,
-    ins->o_extbdfB,
-    ins->fieldOffset,
-    ins->o_U,
-    o_adv,
-    o_FU,
-    o_BF);
-}
-
-void fluidSolve(ins_t* ins, dfloat time, dfloat dt, occa::memory o_U)
-{
-  mesh_t* mesh = ins->mesh;
-
-  timer::tic("makef", 1);
-  ins->fillKernel(ins->fieldOffset * ins->NVfields, 0.0, ins->o_FU);
-  makef(ins, time, ins->o_FU, ins->o_BF);
-  timer::toc("makef");
-
-  for (int s = ins->Nstages; s > 1; s--) {
-    const dlong Nbyte = ins->fieldOffset * ins->NVfields * sizeof(dfloat);
-    ins->o_FU.copyFrom(ins->o_FU, Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte);
-    ins->o_U.copyFrom (ins->o_U , Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte);
-  }
-
-  timer::tic("pressureSolve", 1);
-  ins->setEllipticCoeffPressureKernel(
-    ins->Nlocal,
-    ins->fieldOffset,
-    ins->o_rho,
-    ins->o_ellipticCoeff);
-  occa::memory o_Pnew = tombo::pressureSolve(ins, time + dt);
-  ins->o_P.copyFrom(o_Pnew, ins->Ntotal * sizeof(dfloat));
-  timer::toc("pressureSolve");
-
-  timer::tic("velocitySolve", 1);
-  ins->setEllipticCoeffKernel(
-    ins->Nlocal,
-    ins->g0 * ins->idt,
-    0 * ins->fieldOffset,
-    ins->fieldOffset,
-    ins->o_mue,
-    ins->o_rho,
-    ins->o_ellipticCoeff);
-
-  occa::memory o_Unew = tombo::velocitySolve(ins, time + dt);
-  o_U.copyFrom(o_Unew, ins->NVfields * ins->fieldOffset * sizeof(dfloat));
-  timer::toc("velocitySolve");
-}
-
-occa::memory velocityStrongSubCycle(ins_t* ins, dfloat time, occa::memory o_U)
-{
-  mesh_t* mesh = ins->mesh;
-
-  // Solve for Each SubProblem
-  for (int torder = ins->ExplicitOrder - 1; torder >= 0; torder--) {
-    // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
-    dlong toffset = torder * ins->NVfields * ins->fieldOffset;
-    const dfloat b = ins->extbdfB[torder];
-    if (torder == ins->ExplicitOrder - 1)
-      ins->scaledAddKernel(ins->NVfields * ins->fieldOffset, b, toffset,
-                           o_U, 0.0, 0, ins->o_wrk0);
-    else
-      ins->scaledAddKernel(ins->NVfields * ins->fieldOffset, b, toffset,
-                           o_U, 1.0, 0, ins->o_wrk0);
-
-    // Advance subproblem from here from t^(n-torder) to t^(n-torder+1)
-    for(int ststep = 0; ststep < ins->Nsubsteps; ++ststep) {
-      const dfloat tsub   = time - torder * ins->dt;
-      const dfloat tstage = tsub + ststep * ins->sdt;
-
-      //ins->o_wrk3.copyFrom(ins->o_wrk0, ins->NVfields*ins->fieldOffset*sizeof(dfloat));
-      ins->o_wrk0.copyFrom(ins->o_wrk0, ins->NVfields * ins->fieldOffset * sizeof(dfloat),
-                           ins->NVfields * ins->fieldOffset * sizeof(dfloat),0);
-
-      for(int rk = 0; rk < ins->SNrk; ++rk) {
-        // Extrapolate velocity to subProblem stage time
-        const dfloat t   = tstage +  ins->sdt * ins->Srkc[rk];
-        const dfloat tn0 = time - 0 * ins->dt;
-        const dfloat tn1 = time - 1 * ins->dt;
-        const dfloat tn2 = time - 2 * ins->dt;
-        switch(ins->ExplicitOrder) {
-        case 1:
-          ins->extC[0] = 1;
-          ins->extC[1] = 0;
-          ins->extC[2] = 0;
-          break;
-        case 2:
-          ins->extC[0] = (t - tn1) / (tn0 - tn1);
-          ins->extC[1] = (t - tn0) / (tn1 - tn0);
-          ins->extC[2] = 0;
-          break;
-        case 3:
-          ins->extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2));
-          ins->extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2));
-          ins->extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1));
-          break;
-        }
-        ins->o_extC.copyFrom(ins->extC);
-
-        if(mesh->NglobalGatherElements) {
-          if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
-            ins->subCycleStrongCubatureVolumeKernel(
-              mesh->NglobalGatherElements,
-              mesh->o_globalGatherElementList,
-              mesh->o_vgeo,
-              mesh->o_cubvgeo,
-              mesh->o_cubDiffInterpT,
-              mesh->o_cubInterpT,
-              mesh->o_cubProjectT,
-              ins->fieldOffset,
-              rk * ins->NVfields * ins->fieldOffset,
-              mesh->o_invLMM,
-              ins->o_extC,
-              o_U,
-              ins->o_wrk0,
-              ins->o_wrk6);
-          else
-            ins->subCycleStrongVolumeKernel(
-              mesh->NglobalGatherElements,
-              mesh->o_globalGatherElementList,
-              mesh->o_vgeo,
-              mesh->o_Dmatrices,
-              ins->fieldOffset,
-              rk * ins->NVfields * ins->fieldOffset,
-              mesh->o_invLMM,
-              ins->o_extC,
-              o_U,
-              ins->o_wrk0,
-              ins->o_wrk6);
-        }
-
-        occa::memory o_rhs;
-        if(rk == 0) o_rhs = ins->o_wrk6;
-        if(rk == 1) o_rhs = ins->o_wrk9;
-        if(rk == 2) o_rhs = ins->o_wrk12;
-        if(rk == 3) o_rhs = ins->o_wrk15;
-
-        oogs::start(o_rhs, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh);                     
-
-        if(mesh->NlocalGatherElements) {
-          if(ins->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
-            ins->subCycleStrongCubatureVolumeKernel(
-              mesh->NlocalGatherElements,
-              mesh->o_localGatherElementList,
-              mesh->o_vgeo,
-              mesh->o_cubvgeo,
-              mesh->o_cubDiffInterpT,
-              mesh->o_cubInterpT,
-              mesh->o_cubProjectT,
-              ins->fieldOffset,
-              rk * ins->NVfields * ins->fieldOffset,
-              mesh->o_invLMM,
-              ins->o_extC,
-              o_U,
-              ins->o_wrk0,
-              ins->o_wrk6);
-          else
-            ins->subCycleStrongVolumeKernel(
-              mesh->NlocalGatherElements,
-              mesh->o_localGatherElementList,
-              mesh->o_vgeo,
-              mesh->o_Dmatrices,
-              ins->fieldOffset,
-              rk * ins->NVfields * ins->fieldOffset,
-              mesh->o_invLMM,
-              ins->o_extC,
-              o_U,
-              ins->o_wrk0,
-              ins->o_wrk6);
-        }
-
-        oogs::finish(o_rhs, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh);                     
-
-        ins->subCycleRKUpdateKernel(
-          mesh->Nelements,
-          rk,
-          ins->sdt,
-          ins->fieldOffset,
-          ins->o_Srka,
-          ins->o_Srkb,
-          ins->o_wrk3,
-          ins->o_wrk6,
-          ins->o_wrk0);
-      }
-    }
-  }
-  return ins->o_wrk0;
-}
-
-occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is,
-                                  occa::memory o_U, occa::memory o_S)
-{
-  mesh_t* mesh = cds->meshV;
-
-  // Solve for Each SubProblem
-  for (int torder = (cds->ExplicitOrder - 1); torder >= 0; torder--) {
-    // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
-    const dlong toffset = is * cds->fieldOffset +
-                          torder * cds->NSfields * cds->fieldOffset;
-    if (torder == cds->ExplicitOrder - 1)
-      cds->scaledAddKernel(cds->fieldOffset, cds->extbdfB[torder],
-                           toffset, o_S, 0.0, 0, cds->o_wrk0);
-    else
-      cds->scaledAddKernel(cds->fieldOffset, cds->extbdfB[torder],
-                           toffset, o_S, 1.0, 0, cds->o_wrk0);
-
-    // Advance SubProblem to t^(n-torder+1)
-    for(int ststep = 0; ststep < cds->Nsubsteps; ++ststep) {
-      const dfloat tsub   = time - torder * cds->dt;
-      const dfloat tstage = tsub + ststep * cds->sdt;
-
-      //cds->o_wrk1.copyFrom(cds->o_wrk0, cds->fieldOffset*sizeof(dfloat));
-      cds->o_wrk0.copyFrom(cds->o_wrk0, cds->fieldOffset * sizeof(dfloat),
-                           cds->fieldOffset * sizeof(dfloat), 0);
-
-      for(int rk = 0; rk < cds->SNrk; ++rk) {
-        // Extrapolate velocity to subProblem stage time
-        const dfloat t   = tstage +  cds->sdt * cds->Srkc[rk];
-        const dfloat tn0 = time - 0 * cds->dt;
-        const dfloat tn1 = time - 1 * cds->dt;
-        const dfloat tn2 = time - 2 * cds->dt;
-        switch(cds->ExplicitOrder) {
-        case 1:
-          cds->extC[0] = 1;
-          cds->extC[1] = 0;
-          cds->extC[2] = 0;
-          break;
-        case 2:
-          cds->extC[0] = (t - tn1) / (tn0 - tn1);
-          cds->extC[1] = (t - tn0) / (tn1 - tn0);
-          cds->extC[2] = 0;
-          break;
-        case 3:
-          cds->extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2));
-          cds->extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2));
-          cds->extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1));
-          break;
-        }
-        cds->o_extC.copyFrom(cds->extC);
-
-        if(mesh->NglobalGatherElements) {
-          if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
-            cds->subCycleStrongCubatureVolumeKernel(
-              mesh->NglobalGatherElements,
-              mesh->o_globalGatherElementList,
-              cds->vFieldOffset,
-              rk * cds->fieldOffset,
-              mesh->o_vgeo,
-              mesh->o_cubvgeo,
-              mesh->o_cubDiffInterpT,
-              mesh->o_cubInterpT,
-              mesh->o_cubProjectT,
-              mesh->o_invLMM,
-              cds->o_extC,
-              o_U,
-              cds->o_wrk0,
-              cds->o_wrk2);
-          else
-            cds->subCycleStrongVolumeKernel(
-              mesh->NglobalGatherElements,
-              mesh->o_globalGatherElementList,
-              cds->vFieldOffset,
-              rk * cds->fieldOffset,
-              mesh->o_vgeo,
-              mesh->o_Dmatrices,
-              mesh->o_invLMM,
-              cds->o_extC,
-              o_U,
-              cds->o_wrk0,
-              cds->o_wrk2);
-        }
-
-        occa::memory o_rhs;
-        if(rk == 0) o_rhs = cds->o_wrk2;
-        if(rk == 1) o_rhs = cds->o_wrk3;
-        if(rk == 2) o_rhs = cds->o_wrk4;
-        if(rk == 3) o_rhs = cds->o_wrk5;
-
-        oogs::start(o_rhs, 1, cds->fieldOffset, ogsDfloat, ogsAdd, cds->gsh);
-
-        if(mesh->NlocalGatherElements) {
-          if(cds->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
-            cds->subCycleStrongCubatureVolumeKernel(
-              mesh->NlocalGatherElements,
-              mesh->o_localGatherElementList,
-              cds->vFieldOffset,
-              rk * cds->fieldOffset,
-              mesh->o_vgeo,
-              mesh->o_cubvgeo,
-              mesh->o_cubDiffInterpT,
-              mesh->o_cubInterpT,
-              mesh->o_cubProjectT,
-              mesh->o_invLMM,
-              cds->o_extC, 
-              o_U,
-              cds->o_wrk0,
-              cds->o_wrk2);
-          else
-            cds->subCycleStrongVolumeKernel(
-              mesh->NlocalGatherElements,
-              mesh->o_localGatherElementList,
-              cds->vFieldOffset,
-              rk * cds->fieldOffset,
-              mesh->o_vgeo,
-              mesh->o_Dmatrices,
-              mesh->o_invLMM,
-              cds->o_extC,
-              o_U,
-              cds->o_wrk0,
-              cds->o_wrk2);
-        }
-
-        oogs::finish(o_rhs, 1, cds->fieldOffset, ogsDfloat, ogsAdd, cds->gsh);
-
-        cds->subCycleRKUpdateKernel(
-          mesh->Nelements,
-          rk,
-          cds->sdt,
-          cds->fieldOffset,
-          cds->o_Srka,
-          cds->o_Srkb,
-          cds->o_wrk1,
-          cds->o_wrk2,
-          cds->o_wrk0);
-      }
-    }
-  }
-  return cds->o_wrk0;
-}
-
-// qtl = 1/(rho*cp*T) * (div[k*grad[T] ] + qvol)
-void qthermal(ins_t* ins, dfloat time, occa::memory o_div)
-{
-  cds_t* cds = ins->cds;
-  mesh_t* mesh = ins->mesh;
-
-  ins->gradientVolumeKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    cds->o_S,
-    cds->o_wrk0);
-
-  oogs::startFinish(cds->o_wrk0, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh);
-
-  ins->invMassMatrixKernel(
-    mesh->Nelements,
-    ins->fieldOffset,
-    ins->NVfields,
-    mesh->o_vgeo,
-    mesh->o_invLMM,
-    cds->o_wrk0);
-
-  if(udf.sEqnSource) {
-    timer::tic("udfSEqnSource", 1);
-    udf.sEqnSource(ins, time, cds->o_S, cds->o_wrk3);
-    timer::toc("udfSEqnSource");
-  } else {
-    ins->fillKernel(mesh->Nelements * mesh->Np, 0.0, cds->o_wrk3);
-  }
-
-  ins->qtlKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    cds->o_wrk0,
-    cds->o_S,
-    cds->o_diff,
-    cds->o_rho,
-    cds->o_wrk3,
-    o_div);
-
-  oogs::startFinish(o_div, 1, ins->fieldOffset, ogsDfloat, ogsAdd, ins->gsh);
-
-  ins->invMassMatrixKernel(
-    mesh->Nelements,
-    ins->fieldOffset,
-    1,
-    mesh->o_vgeo,
-    mesh->o_invLMM,
-    o_div);
-}
diff --git a/src/core/runTime.hpp b/src/core/runTime.hpp
deleted file mode 100644
index 7552a4400..000000000
--- a/src/core/runTime.hpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#if !defined(nekrs_runtime_hpp_)
-#define nekrs_runtime_hpp_
-
-#include "nekrs.hpp"
-void runStep(ins_t* ins, dfloat time, dfloat dt, int tstep);
-
-#endif
diff --git a/src/core/setup.cpp b/src/core/setup.cpp
new file mode 100644
index 000000000..3c2800d3b
--- /dev/null
+++ b/src/core/setup.cpp
@@ -0,0 +1,1166 @@
+#include "nrs.hpp"
+#include "meshSetup.hpp"
+#include "nekInterfaceAdapter.hpp"
+#include "udf.hpp"
+#include "filter.hpp"
+#include "bcMap.hpp"
+#include <vector>
+#include <map>
+
+static dfloat* scratch; // host scratch buffer: calloc'ed in nrsSetup; elliptic solvers' wrk points into it at nrs->ellipticWrkOffset
+static occa::memory o_scratch; // device buffer initialised from scratch; nrsSetup slices it into nrs->o_wrk* and the solvers' o_wrk
+
+static cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &kernelInfoH); // forward declaration (definition not shown in this part of the file); nrsSetup calls it when nrs->Nscalar is nonzero
+
+nrs_t* nrsSetup(MPI_Comm comm, occa::device device, setupAide &options, int buildOnly)
+{
+  nrs_t* nrs = new nrs_t();
+
+  nrs->options = options;
+  nrs->kernelInfo = new occa::properties();
+  occa::properties& kernelInfo = *nrs->kernelInfo;
+  kernelInfo["defines"].asObject();
+  kernelInfo["includes"].asArray();
+  kernelInfo["header"].asArray();
+  kernelInfo["flags"].asObject();
+  kernelInfo["include_paths"].asArray();
+
+  int N, cubN;
+  string install_dir;
+  nrs->options.getArgs("POLYNOMIAL DEGREE", N);
+  nrs->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN);
+  nrs->options.getArgs("NUMBER OF SCALARS", nrs->Nscalar);
+  install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
+  nrs->options.getArgs("MESH DIMENSION", nrs->dim);
+  nrs->options.getArgs("ELEMENT TYPE", nrs->elementType);
+
+  nrs->flow = 1;
+  if(nrs->options.compareArgs("VELOCITY", "FALSE")) nrs->flow = 0;
+  if(nrs->options.compareArgs("VELOCITY SOLVER", "NONE")) nrs->flow = 0;
+
+  if(nrs->flow) {
+    if(nrs->options.compareArgs("STRESSFORMULATION", "TRUE"))
+       nrs->options.setArgs("VELOCITY BLOCK SOLVER", "TRUE");
+  }
+
+
+  // jit compile + init nek
+  {  
+    int rank, size;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &size);
+    string casename;
+    nrs->options.getArgs("CASENAME", casename);
+
+    int npTarget = size;
+    if (buildOnly) nrs->options.getArgs("NP TARGET", npTarget);
+    if (rank == 0) buildNekInterface(casename.c_str(), mymax(1, nrs->Nscalar), N, npTarget);
+    MPI_Barrier(comm);
+    if (!buildOnly) {
+      nek_setup(comm, nrs->options, nrs);
+      nek_setic();
+      nek_userchk();
+    }
+  }
+
+  nrs->cht = 0;
+  if (nekData.nelv != nekData.nelt && nrs->Nscalar) nrs->cht = 1;
+
+  // create mesh
+  if (buildOnly) {
+    nrs->meshT = createMeshDummy(comm, N, cubN, nrs->options, device, kernelInfo);
+    nrs->mesh = nrs->meshT;
+  } else {
+    nrs->meshT = createMesh(comm, N, cubN, nrs->cht, nrs->options, device, kernelInfo);
+    nrs->mesh = nrs->meshT;
+    if (nrs->cht) nrs->mesh = createMeshV(comm, N, cubN, nrs->meshT, nrs->options, kernelInfo);
+  }
+  mesh_t* mesh = nrs->mesh;
+
+  if (nrs->cht && !nrs->options.compareArgs("TEMPERATURE", "TRUE")) {
+    if (mesh->rank == 0) cout << "Conjugate heat transfer requires solving for temperature!\n"; 
+    EXIT(1);
+  } 
+
+  { 
+    dlong retVal; 
+    MPI_Allreduce(&mesh->NinternalElements,&retVal,1,MPI_DLONG,MPI_MIN,mesh->comm);
+    if(mesh->rank == 0) printf("min NinternalElements: %d (ratio: %4.2f)\n", retVal, (double)retVal/mesh->Nelements);
+  }
+
+  occa::properties kernelInfoV  = kernelInfo;
+  occa::properties kernelInfoP  = kernelInfo;
+  occa::properties kernelInfoS  = kernelInfo;
+
+  nrs->NVfields = 3;
+  nrs->NTfields = nrs->NVfields + 1;   // Total Velocity + Pressure
+
+  nrs->SNrk = 0;
+  nrs->options.getArgs("SUBCYCLING TIME STAGE NUMBER", nrs->SNrk);
+
+  mesh->Nfields = 1;
+
+  nrs->extbdfA = (dfloat*) calloc(3, sizeof(dfloat));
+  nrs->extbdfB = (dfloat*) calloc(3, sizeof(dfloat));
+  nrs->extbdfC = (dfloat*) calloc(3, sizeof(dfloat));
+  nrs->extC = (dfloat*) calloc(3, sizeof(dfloat));
+
+  if (nrs->options.compareArgs("TIME INTEGRATOR", "TOMBO1")) {
+    nrs->Nstages = 1;
+    nrs->temporalOrder = 1;
+  } else if (nrs->options.compareArgs("TIME INTEGRATOR", "TOMBO2")) {
+    nrs->Nstages = 2;
+    nrs->temporalOrder = 2;
+  } else if (nrs->options.compareArgs("TIME INTEGRATOR", "TOMBO3")) {
+    nrs->Nstages = 3;
+    nrs->temporalOrder = 3;
+  }
+
+  dfloat mue = 1;
+  dfloat rho = 1;
+  nrs->options.getArgs("VISCOSITY", mue);
+  nrs->options.getArgs("DENSITY", rho);
+
+  nrs->options.getArgs("SUBCYCLING STEPS",nrs->Nsubsteps);
+  nrs->options.getArgs("DT", nrs->dt[0]);
+
+  const dlong Nlocal = mesh->Np * mesh->Nelements;
+  const dlong Ntotal = mesh->Np * (mesh->Nelements + mesh->totalHaloPairs);
+
+  nrs->Nlocal = Nlocal;
+  nrs->Ntotal = Ntotal;
+
+  // ensure that offset is large enough for v and t mesh and is properly aligned
+  {
+    const dlong NtotalT = nrs->meshT->Np * (nrs->meshT->Nelements + nrs->meshT->totalHaloPairs);
+    nrs->fieldOffset = mymax(Ntotal, NtotalT);
+
+    int PAGESIZE = 4096; // default is 4kB
+    char* tmp;
+    tmp = getenv("NEKRS_PAGE_SIZE");
+    if (tmp != NULL) PAGESIZE = std::stoi(tmp);
+    const int pageW = PAGESIZE / sizeof(dfloat);
+    if (nrs->fieldOffset % pageW) nrs->fieldOffset = (nrs->fieldOffset / pageW + 1) * pageW;
+  }
+
+  nrs->Nblock = (Nlocal + BLOCKSIZE - 1) / BLOCKSIZE;
+
+  if(nrs->Nsubsteps) {
+    int Sorder;
+    nrs->options.getArgs("SUBCYCLING TIME ORDER", Sorder);
+    if(Sorder == 4 && nrs->SNrk == 4) { // ERK(4,4)
+      dfloat rka[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0};
+      dfloat rkb[4] = {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0};
+      dfloat rkc[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0};
+      nrs->Srka = (dfloat*) calloc(nrs->SNrk, sizeof(dfloat));
+      nrs->Srkb = (dfloat*) calloc(nrs->SNrk, sizeof(dfloat));
+      nrs->Srkc = (dfloat*) calloc(nrs->SNrk, sizeof(dfloat));
+      memcpy(nrs->Srka, rka, nrs->SNrk * sizeof(dfloat));
+      memcpy(nrs->Srkb, rkb, nrs->SNrk * sizeof(dfloat));
+      memcpy(nrs->Srkc, rkc, nrs->SNrk * sizeof(dfloat));
+    }else{
+      if(mesh->rank == 0) cout << "Unsupported subcycling scheme!\n";
+      ABORT(1);
+    }
+    nrs->o_Srka = mesh->device.malloc(nrs->SNrk * sizeof(dfloat), nrs->Srka);
+    nrs->o_Srkb = mesh->device.malloc(nrs->SNrk * sizeof(dfloat), nrs->Srkb);
+  }
+
+  // setup scratch space
+  const int wrkNflds = 6;
+  const int ellipticWrkNflds = 15;
+  nrs->ellipticWrkOffset = wrkNflds * nrs->fieldOffset;
+
+  const int scratchNflds = wrkNflds + ellipticWrkNflds;
+  scratch   = (dfloat*) calloc(scratchNflds * nrs->fieldOffset,sizeof(dfloat));
+  o_scratch = mesh->device.malloc(scratchNflds * nrs->fieldOffset * sizeof(dfloat), scratch);
+
+  nrs->o_wrk0  = o_scratch.slice( 0 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk1  = o_scratch.slice( 1 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk2  = o_scratch.slice( 2 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk3  = o_scratch.slice( 3 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk4  = o_scratch.slice( 4 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk5  = o_scratch.slice( 5 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk6  = o_scratch.slice( 6 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk7  = o_scratch.slice( 7 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk9  = o_scratch.slice( 9 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk12 = o_scratch.slice(12 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_wrk15 = o_scratch.slice(15 * nrs->fieldOffset * sizeof(dfloat));
+
+  nrs->U  = (dfloat*) calloc(nrs->NVfields * nrs->Nstages * nrs->fieldOffset,sizeof(dfloat));
+  nrs->Ue = (dfloat*) calloc(nrs->NVfields * nrs->fieldOffset,sizeof(dfloat));
+  nrs->P  = (dfloat*) calloc(nrs->fieldOffset,sizeof(dfloat));
+  nrs->BF = (dfloat*) calloc(nrs->NVfields * nrs->fieldOffset,sizeof(dfloat));
+  nrs->FU = (dfloat*) calloc(nrs->NVfields * nrs->Nstages * nrs->fieldOffset,sizeof(dfloat));
+
+  nrs->o_U  = mesh->device.malloc(nrs->NVfields * nrs->Nstages * nrs->fieldOffset * sizeof(dfloat), nrs->U);
+  nrs->o_Ue = mesh->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), nrs->Ue);
+  nrs->o_P  = mesh->device.malloc(nrs->fieldOffset * sizeof(dfloat), nrs->P);
+  nrs->o_BF = mesh->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), nrs->BF);
+  nrs->o_FU = mesh->device.malloc(nrs->NVfields * nrs->Nstages * nrs->fieldOffset * sizeof(dfloat), nrs->FU);
+
+  nrs->var_coeff = 1; // use always var coeff elliptic
+  nrs->ellipticCoeff = (dfloat*) calloc(2 * nrs->fieldOffset,sizeof(dfloat));
+  nrs->o_ellipticCoeff = mesh->device.malloc(2 * nrs->fieldOffset * sizeof(dfloat),
+                                             nrs->ellipticCoeff);
+
+  nrs->prop =  (dfloat*) calloc(2 * nrs->fieldOffset,sizeof(dfloat));
+  for (int e = 0; e < mesh->Nelements; e++)
+    for (int n = 0; n < mesh->Np; n++) {
+      nrs->prop[0 * nrs->fieldOffset + e * mesh->Np + n] = mue;
+      nrs->prop[1 * nrs->fieldOffset + e * mesh->Np + n] = rho;
+    }
+  nrs->o_prop = mesh->device.malloc(2 * nrs->fieldOffset * sizeof(dfloat), nrs->prop);
+  nrs->o_mue = nrs->o_prop.slice(0 * nrs->fieldOffset * sizeof(dfloat));
+  nrs->o_rho = nrs->o_prop.slice(1 * nrs->fieldOffset * sizeof(dfloat));
+
+  nrs->div   = (dfloat*) calloc(nrs->fieldOffset,sizeof(dfloat));
+  nrs->o_div = mesh->device.malloc(nrs->fieldOffset * sizeof(dfloat), nrs->div);
+
+  nrs->elementInfo = (dlong*) calloc(nrs->meshT->Nelements,sizeof(dlong));
+  for (int e = 0; e < nrs->meshT->Nelements; e++) nrs->elementInfo[e] = mesh->elementInfo[e];
+  nrs->o_elementInfo = mesh->device.malloc(nrs->meshT->Nelements * sizeof(dlong), nrs->elementInfo);
+  dfloat rkC[4]  = {1.0, 0.0, -1.0, -2.0};
+  nrs->o_rkC     = mesh->device.malloc(4 * sizeof(dfloat),rkC);
+  nrs->o_extbdfA = mesh->device.malloc(3 * sizeof(dfloat), nrs->extbdfA);
+  nrs->o_extbdfB = mesh->device.malloc(3 * sizeof(dfloat), nrs->extbdfB);
+  nrs->o_extbdfC = mesh->device.malloc(3 * sizeof(dfloat), nrs->extbdfC); // seed from extbdfC's own host array (was extbdfA, a copy-paste slip)
+  nrs->o_extC    = mesh->device.malloc(3 * sizeof(dfloat), nrs->extC);
+
+  // define aux kernel constants
+  kernelInfo["defines/" "p_eNfields"] = nrs->NVfields;
+  kernelInfo["defines/" "p_NVfields"] = nrs->NVfields;
+  kernelInfo["defines/" "p_Nstages"] =  nrs->Nstages;
+  if(nrs->Nsubsteps)
+    kernelInfo["defines/" "p_SUBCYCLING"] =  1;
+  else
+    kernelInfo["defines/" "p_SUBCYCLING"] =  0;
+
+  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  //kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
+
+  int NblockV = mymax(1, BLOCKSIZE/mesh->Np);
+  kernelInfo["defines/" "p_NblockV"] = NblockV;
+
+  // jit compile udf kernels
+  if (udf.loadKernels) {
+    if (mesh->rank == 0) cout << "loading udf kernels ... ";
+    udf.loadKernels(nrs);
+    if (mesh->rank == 0) cout << "done" << endl;
+  }
+
+  nrs->linAlg = new linAlg_t(mesh->device, nrs->kernelInfo, mesh->comm);
+
+  meshParallelGatherScatterSetup(mesh, nrs->Nlocal, mesh->globalIds, mesh->comm, 0);
+  oogs_mode oogsMode = OOGS_AUTO; 
+  if(nrs->options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT;
+  nrs->gsh = oogs::setup(mesh->ogs, nrs->NVfields, nrs->fieldOffset, ogsDfloat, NULL, oogsMode);
+
+  if(!buildOnly) {
+    int err = 0;
+    dlong gNelements = mesh->Nelements;
+    MPI_Allreduce(MPI_IN_PLACE, &gNelements, 1, MPI_DLONG, MPI_SUM, mesh->comm);
+    const dfloat sum2 = (dfloat)gNelements * mesh->Np;
+    nrs->linAlg->fillKernel(nrs->fieldOffset, 1.0, nrs->o_wrk0);
+    ogsGatherScatter(nrs->o_wrk0, ogsDfloat, ogsAdd, mesh->ogs);
+    nrs->linAlg->axmyKernel(Nlocal, 1.0, mesh->ogs->o_invDegree, nrs->o_wrk0); 
+    dfloat* tmp = (dfloat*) calloc(Nlocal, sizeof(dfloat));
+    nrs->o_wrk0.copyTo(tmp, Nlocal * sizeof(dfloat));
+    dfloat sum1 = 0;
+    for(int i = 0; i < Nlocal; i++) sum1 += tmp[i];
+    MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, mesh->comm);
+    sum1 = abs(sum1 - sum2) / sum2;
+    if(sum1 > 1e-15) {
+      if(mesh->rank == 0) printf("ogsGatherScatter test err=%g!\n", sum1);
+      fflush(stdout);
+      err++;
+    }
+
+    mesh->ogs->o_invDegree.copyTo(tmp, Nlocal * sizeof(dfloat));
+    double* vmult = (double*) nek_ptr("vmult");
+    sum1 = 0;
+    for(int i = 0; i < Nlocal; i++) sum1 += abs(tmp[i] - vmult[i]);
+    MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, mesh->comm);
+    if(sum1 > 1e-15) {
+      if(mesh->rank == 0) printf("multiplicity test err=%g!\n", sum1);
+      fflush(stdout);
+      err++;
+    }
+
+    if(err) ABORT(1);
+    free(tmp);
+  }
+
+  // build mass + inverse mass matrix
+  dfloat* lumpedMassMatrix  = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
+  for(hlong e = 0; e < mesh->Nelements; ++e)
+    for(int n = 0; n < mesh->Np; ++n)
+      lumpedMassMatrix[e * mesh->Np + n] = mesh->vgeo[e * mesh->Np * mesh->Nvgeo + JWID * mesh->Np + n];
+  mesh->o_LMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
+  mesh->o_LMM.copyTo(mesh->LMM);
+  ogsGatherScatter(lumpedMassMatrix, ogsDfloat, ogsAdd, mesh->ogs);
+  for(int n = 0; n < mesh->Np * mesh->Nelements; ++n)
+    lumpedMassMatrix[n] = 1. / lumpedMassMatrix[n];
+  mesh->o_invLMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
+  mesh->o_invLMM.copyTo(mesh->invLMM);
+  free(lumpedMassMatrix);
+
+  // setup boundary mapping
+  dfloat largeNumber = 1 << 20;
+  nrs->VmapB = (int*) calloc(mesh->Nelements * mesh->Np,sizeof(int));
+  for (int e = 0; e < mesh->Nelements; e++)
+    for (int n = 0; n < mesh->Np; n++) nrs->VmapB[n + e * mesh->Np] = largeNumber;
+
+  nrs->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int));
+
+  int cnt = 0;
+  for (int e = 0; e < mesh->Nelements; e++)
+    for (int f = 0; f < mesh->Nfaces; f++) {
+      int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "velocity");
+      nrs->EToB[cnt] = bc;
+      if (bc > 0) {
+        for (int n = 0; n < mesh->Nfp; n++) {
+          int fid = mesh->faceNodes[n + f * mesh->Nfp];
+          nrs->VmapB[fid + e * mesh->Np] = mymin(bc,nrs->VmapB[fid + e * mesh->Np]); // Dirichlet wnrs
+        }
+      }
+      cnt++;
+    }
+
+  ogsGatherScatter(nrs->VmapB, ogsInt, ogsMin, mesh->ogs);
+  for (int n = 0; n < mesh->Nelements * mesh->Np; n++)
+    if (nrs->VmapB[n] == largeNumber) nrs->VmapB[n] = 0;
+
+  nrs->o_EToB = mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),nrs->EToB);
+  nrs->o_VmapB = mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(int), nrs->VmapB);
+
+  if(nrs->options.compareArgs("FILTER STABILIZATION", "RELAXATION"))
+    filterSetup(nrs);
+
+  // build kernels
+  string fileName, kernelName;
+  const string suffix = "Hex3D";
+  const string oklpath = install_dir + "/okl/core/";
+
+  MPI_Barrier(mesh->comm);
+  double tStartLoadKernel = MPI_Wtime();
+  if(mesh->rank == 0)  printf("loading ns kernels ... "); fflush(stdout);
+
+  for (int r = 0; r < 2; r++) {
+    if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
+
+      occa::properties kernelInfoBC = kernelInfo;
+      const string bcDataFile = install_dir + "/include/core/bcData.h";
+      kernelInfoBC["includes"] += bcDataFile.c_str();
+      string boundaryHeaderFileName;
+      nrs->options.getArgs("DATA FILE", boundaryHeaderFileName);
+      kernelInfoBC["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL);
+
+      fileName = oklpath + "nrsAdvection" + suffix + ".okl";
+      kernelName = "nrsStrongAdvectionVolume" + suffix;
+      nrs->advectionStrongVolumeKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+      kernelName = "nrsStrongAdvectionCubatureVolume" + suffix;
+      nrs->advectionStrongCubatureVolumeKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsCurl" + suffix + ".okl";
+      kernelName = "nrsCurl" + suffix;
+      nrs->curlKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsMassMatrix" + ".okl";
+      kernelName = "nrsMassMatrix" + suffix;
+      nrs->massMatrixKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      kernelName = "nrsInvMassMatrix" + suffix;
+      nrs->invMassMatrixKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsGradient" + suffix + ".okl";
+      kernelName = "nrsGradientVolume" + suffix;
+      nrs->gradientVolumeKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      kernelName = "nrswGradientVolume" + suffix;
+      nrs->wgradientVolumeKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsSumMakef" + suffix + ".okl";
+      kernelName = "nrsSumMakef" + suffix;
+      nrs->sumMakefKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsDivergence" + suffix + ".okl";
+      kernelName = "nrsDivergenceVolume" + suffix;
+      nrs->divergenceVolumeKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
+
+      kernelName = "nrsDivergenceSurfaceTOMBO" + suffix;
+      nrs->divergenceSurfaceKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
+
+      fileName = oklpath + "nrsPressureRhs" + suffix + ".okl";
+      kernelName = "nrsPressureRhsTOMBO" + suffix;
+      nrs->pressureRhsKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsPressureStress" + suffix + ".okl";
+      kernelName = "nrsPressureStress" + suffix;
+      nrs->pressureStressKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsPressureBC" + suffix + ".okl";
+      kernelName = "nrsPressureDirichletBC" + suffix;
+      nrs->pressureDirichletBCKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
+
+      fileName = oklpath + "nrsPressureUpdate" + ".okl";
+      kernelName = "nrsPressureUpdate";
+      nrs->pressureUpdateKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      fileName = oklpath + "nrsVelocityRhs" + suffix + ".okl";
+      kernelName = "nrsVelocityRhsTOMBO" + suffix;
+      nrs->velocityRhsKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsVelocityBC" + suffix + ".okl";
+      kernelName = "nrsVelocityDirichletBC" + suffix;
+      nrs->velocityDirichletBCKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
+
+      kernelName = "nrsVelocityNeumannBC" + suffix;
+      nrs->velocityNeumannBCKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfoBC);
+
+      fileName = oklpath + "nrsSubCycle" + suffix + ".okl";
+      kernelName = "nrsSubCycleStrongCubatureVolume" + suffix;
+      nrs->subCycleStrongCubatureVolumeKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      kernelName = "nrsSubCycleStrongVolume" + suffix;
+      nrs->subCycleStrongVolumeKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsSubCycleRKUpdate" + ".okl";
+      kernelName = "nrsSubCycleLSERKUpdate";
+      if(nrs->SNrk == 4) kernelName = "nrsSubCycleERKUpdate";
+      nrs->subCycleRKUpdateKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsExtrapolate" + ".okl";
+      kernelName = "nrsMultiExtrapolate";
+      nrs->extrapolateKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      // ===========================================================================
+
+      fileName = install_dir + "/okl/core/scaledAdd.okl";
+      kernelName = "scaledAddwOffset";
+      nrs->scaledAddKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = install_dir + "/okl/core/dotMultiply.okl";
+      kernelName = "dotMultiply";
+      nrs->dotMultiplyKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "math" + ".okl";
+      kernelName = "fill";
+      nrs->fillKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      kernelName = "max";
+      nrs->maxKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      kernelName = "scalarScaledAdd";
+      nrs->scalarScaledAddKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      kernelName = "maskCopy";
+      nrs->maskCopyKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      // ===========================================================================
+
+      fileName = oklpath + "nrsFilterRT" + suffix + ".okl";
+      kernelName = "nrsFilterRT" + suffix;
+      nrs->filterRTKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsCfl" + suffix + ".okl";
+      kernelName = "nrsCfl" + suffix;
+      nrs->cflKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsQtl" + suffix + ".okl";
+      kernelName = "nrsQtl" + suffix;
+      nrs->qtlKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "nrsPressureAddQtl" + ".okl";
+      kernelName = "nrsPressureAddQtl";
+      nrs->pressureAddQtlKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = oklpath + "setEllipticCoeff.okl";
+      kernelName = "setEllipticCoeff";
+      nrs->setEllipticCoeffKernel =
+        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      kernelName = "setEllipticCoeffPressure";
+      nrs->setEllipticCoeffPressureKernel =
+        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      fileName = oklpath + "nrsPQ.okl";
+      kernelName = "nrsPQ";
+      nrs->PQKernel =
+        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      fileName = oklpath + "nrsMueDiv.okl";
+      kernelName = "nrsMueDiv";
+      nrs->mueDivKernel =
+        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+    }
+    MPI_Barrier(mesh->comm);
+  }
+
+  MPI_Barrier(mesh->comm);
+  if(mesh->rank == 0)  printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); fflush(stdout);
+
+  if(nrs->Nscalar) {
+    mesh_t* msh;
+    (nrs->cht) ? msh = nrs->meshT : msh = nrs->mesh;
+    nrs->cds = cdsSetup(nrs, msh, nrs->options, kernelInfoS);
+  }
+
+  if(!buildOnly) {
+    int readRestartFile;
+    nrs->options.getArgs("RESTART FROM FILE", readRestartFile);
+    double startTime;
+    nek_copyTo(startTime);
+    if(readRestartFile) nrs->options.setArgs("START TIME", to_string_f(startTime));
+
+    if(mesh->rank == 0)  printf("calling udf_setup ... "); fflush(stdout);
+    udf.setup(nrs);
+    if(mesh->rank == 0)  printf("done\n"); fflush(stdout);
+   }
+
+  // setup elliptic solvers
+
+  const int nbrBIDs = bcMap::size(0);
+  int NBCType = nbrBIDs + 1;
+
+  if(nrs->Nscalar) {
+    mesh_t* mesh;
+    (nrs->cht) ? mesh = nrs->meshT : mesh = nrs->mesh;
+    cds_t* cds = nrs->cds;
+
+    for (int is = 0; is < cds->NSfields; is++) {
+      std::stringstream ss;
+      ss << std::setfill('0') << std::setw(2) << is;
+      string sid = ss.str();
+ 
+      if(!cds->compute[is]) continue;
+ 
+      mesh_t* mesh;
+      (is) ? mesh = cds->meshV : mesh = cds->mesh; // only first scalar can be a CHT mesh
+
+      if (mesh->rank == 0)
+        cout << "================= ELLIPTIC SETUP SCALAR" << sid << " ===============\n";
+
+      int nbrBIDs = bcMap::size(0);
+      if(nrs->cht && is == 0) nbrBIDs = bcMap::size(1);
+      int* sBCType = (int*) calloc(nbrBIDs + 1, sizeof(int));
+ 
+      for (int bID = 1; bID <= nbrBIDs; bID++) {
+        string bcTypeText(bcMap::text(bID, "scalar" + sid));
+        if(mesh->rank == 0) printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str());
+        sBCType[bID] = bcMap::type(bID, "scalar" + sid);
+      }
+ 
+      cds->solver[is] = new elliptic_t();
+      cds->solver[is]->blockSolver = 0;
+      cds->solver[is]->Nfields = 1;
+      cds->solver[is]->Ntotal = nrs->fieldOffset;
+      cds->solver[is]->wrk = scratch + nrs->ellipticWrkOffset;
+      cds->solver[is]->o_wrk = o_scratch.slice(nrs->ellipticWrkOffset * sizeof(dfloat));
+      cds->solver[is]->mesh = mesh;
+      cds->solver[is]->dim = cds->dim;
+      cds->solver[is]->elementType = cds->elementType;
+      cds->solver[is]->BCType = (int*) calloc(nbrBIDs + 1,sizeof(int));
+      memcpy(cds->solver[is]->BCType,sBCType,(nbrBIDs + 1) * sizeof(int));
+      free(sBCType);
+      cds->solver[is]->var_coeff = cds->var_coeff;
+      for (int i = 0; i < 2 * nrs->fieldOffset; i++) nrs->ellipticCoeff[i] = 1;
+      cds->solver[is]->lambda = cds->ellipticCoeff;
+      cds->solver[is]->o_lambda = cds->o_ellipticCoeff;
+      cds->solver[is]->loffset = 0;
+ 
+      cds->solver[is]->options = cds->options[is];
+      ellipticSolveSetup(cds->solver[is], kernelInfoS);
+    }
+  }
+
+  if (nrs->flow) {
+    if (mesh->rank == 0) printf("================ ELLIPTIC SETUP VELOCITY ================\n");
+
+    nrs->uvwSolver = NULL;
+
+    if(nrs->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE"))
+      nrs->uvwSolver = new elliptic_t();
+
+    int* uvwBCType = (int*) calloc(3 * NBCType, sizeof(int));
+    int* uBCType = uvwBCType + 0 * NBCType;
+    int* vBCType = uvwBCType + 1 * NBCType;
+    int* wBCType = uvwBCType + 2 * NBCType;
+    for (int bID = 1; bID <= nbrBIDs; bID++) {
+      string bcTypeText(bcMap::text(bID, "velocity"));
+      if(mesh->rank == 0) printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str());
+
+      uBCType[bID] = bcMap::type(bID, "x-velocity");
+      vBCType[bID] = bcMap::type(bID, "y-velocity");
+      wBCType[bID] = bcMap::type(bID, "z-velocity");
+    }
+
+    nrs->vOptions = options;
+    nrs->vOptions.setArgs("KRYLOV SOLVER",        options.getArgs("VELOCITY KRYLOV SOLVER"));
+    nrs->vOptions.setArgs("SOLVER TOLERANCE",     options.getArgs("VELOCITY SOLVER TOLERANCE"));
+    nrs->vOptions.setArgs("DISCRETIZATION",       options.getArgs("VELOCITY DISCRETIZATION"));
+    nrs->vOptions.setArgs("BASIS",                options.getArgs("VELOCITY BASIS"));
+    nrs->vOptions.setArgs("PRECONDITIONER",       options.getArgs("VELOCITY PRECONDITIONER"));
+    nrs->vOptions.setArgs("RESIDUAL PROJECTION",       options.getArgs("VELOCITY RESIDUAL PROJECTION"));
+    nrs->vOptions.setArgs("RESIDUAL PROJECTION VECTORS",       options.getArgs("VELOCITY RESIDUAL PROJECTION VECTORS"));
+    nrs->vOptions.setArgs("RESIDUAL PROJECTION START",       options.getArgs("VELOCITY RESIDUAL PROJECTION START"));
+    nrs->vOptions.setArgs("MULTIGRID COARSENING", options.getArgs("VELOCITY MULTIGRID COARSENING"));
+    nrs->vOptions.setArgs("MULTIGRID SMOOTHER",   options.getArgs("VELOCITY MULTIGRID SMOOTHER"));
+    nrs->vOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE",
+                          options.getArgs("VELOCITY MULTIGRID CHEBYSHEV DEGREE"));
+    nrs->vOptions.setArgs("PARALMOND CYCLE",      options.getArgs("VELOCITY PARALMOND CYCLE"));
+    nrs->vOptions.setArgs("PARALMOND SMOOTHER",   options.getArgs("VELOCITY PARALMOND SMOOTHER"));
+    nrs->vOptions.setArgs("PARALMOND PARTITION",  options.getArgs("VELOCITY PARALMOND PARTITION"));
+    nrs->vOptions.setArgs("PARALMOND CHEBYSHEV DEGREE",
+                          options.getArgs("VELOCITY PARALMOND CHEBYSHEV DEGREE"));
+    nrs->vOptions.setArgs("PARALMOND AGGREGATION STRATEGY",
+                          options.getArgs("VELOCITY PARALMOND AGGREGATION STRATEGY"));
+
+    // coeff used by ellipticSetup to detect allNeumann
+    for (int i = 0; i < 2 * nrs->fieldOffset; i++) nrs->ellipticCoeff[i] = 1;
+
+    if(nrs->uvwSolver) {
+      nrs->uvwSolver->blockSolver = 1;
+      nrs->uvwSolver->stressForm = 0;
+      if(options.compareArgs("STRESSFORMULATION", "TRUE"))
+        nrs->uvwSolver->stressForm = 1;
+      nrs->uvwSolver->Nfields = nrs->NVfields;
+      nrs->uvwSolver->Ntotal = nrs->fieldOffset;
+      nrs->uvwSolver->wrk = scratch + nrs->ellipticWrkOffset;
+      nrs->uvwSolver->o_wrk = o_scratch.slice(nrs->ellipticWrkOffset * sizeof(dfloat));
+      nrs->uvwSolver->mesh = mesh;
+      nrs->uvwSolver->options = nrs->vOptions;
+      nrs->uvwSolver->dim = nrs->dim;
+      nrs->uvwSolver->elementType = nrs->elementType;
+      nrs->uvwSolver->NBCType = NBCType;
+      nrs->uvwSolver->BCType = (int*) calloc(nrs->NVfields * NBCType,sizeof(int));
+      memcpy(nrs->uvwSolver->BCType,uvwBCType,nrs->NVfields * NBCType * sizeof(int));
+      nrs->uvwSolver->var_coeff = nrs->var_coeff;
+      nrs->uvwSolver->lambda = nrs->ellipticCoeff;
+      nrs->uvwSolver->o_lambda = nrs->o_ellipticCoeff;
+      nrs->uvwSolver->loffset = 0; // use same ellipticCoeff for u,v and w
+
+      ellipticSolveSetup(nrs->uvwSolver, kernelInfoV);
+    } else {
+      nrs->uSolver = new elliptic_t();
+      nrs->uSolver->blockSolver = 0;
+      nrs->uSolver->Nfields = 1;
+      nrs->uSolver->Ntotal = nrs->fieldOffset;
+      nrs->uSolver->wrk = scratch + nrs->ellipticWrkOffset;
+      nrs->uSolver->o_wrk = o_scratch.slice(nrs->ellipticWrkOffset * sizeof(dfloat));
+      nrs->uSolver->mesh = mesh;
+      nrs->uSolver->options = nrs->vOptions;
+      nrs->uSolver->dim = nrs->dim;
+      nrs->uSolver->elementType = nrs->elementType;
+      nrs->uSolver->NBCType = NBCType;
+      nrs->uSolver->BCType = (int*) calloc(NBCType,sizeof(int));
+      memcpy(nrs->uSolver->BCType,uBCType,NBCType * sizeof(int));
+      nrs->uSolver->var_coeff = nrs->var_coeff;
+      nrs->uSolver->lambda = nrs->ellipticCoeff;
+      nrs->uSolver->o_lambda = nrs->o_ellipticCoeff;
+      nrs->uSolver->loffset = 0;
+
+      ellipticSolveSetup(nrs->uSolver, kernelInfoV);
+
+      nrs->vSolver = new elliptic_t();
+      nrs->vSolver->blockSolver = 0;
+      nrs->vSolver->Nfields = 1;
+      nrs->vSolver->Ntotal = nrs->fieldOffset;
+      nrs->vSolver->wrk = scratch + nrs->ellipticWrkOffset;
+      nrs->vSolver->o_wrk = o_scratch.slice(nrs->ellipticWrkOffset * sizeof(dfloat));
+      nrs->vSolver->mesh = mesh;
+      nrs->vSolver->options = nrs->vOptions;
+      nrs->vSolver->dim = nrs->dim;
+      nrs->vSolver->elementType = nrs->elementType;
+      nrs->vSolver->NBCType = NBCType;
+      nrs->vSolver->BCType = (int*) calloc(NBCType,sizeof(int));
+      memcpy(nrs->vSolver->BCType,vBCType,NBCType * sizeof(int));
+      nrs->vSolver->var_coeff = nrs->var_coeff;
+      nrs->vSolver->lambda = nrs->ellipticCoeff;
+      nrs->vSolver->o_lambda = nrs->o_ellipticCoeff;
+      nrs->vSolver->loffset = 0;
+
+      ellipticSolveSetup(nrs->vSolver, kernelInfoV);
+
+      if (nrs->dim == 3) {
+        nrs->wSolver = new elliptic_t();
+        nrs->wSolver->blockSolver = 0;
+        nrs->wSolver->Nfields = 1;
+        nrs->wSolver->Ntotal = nrs->fieldOffset;
+        nrs->wSolver->wrk = scratch + nrs->ellipticWrkOffset;
+        nrs->wSolver->o_wrk = o_scratch.slice(nrs->ellipticWrkOffset * sizeof(dfloat));
+        nrs->wSolver->mesh = mesh;
+        nrs->wSolver->options = nrs->vOptions;
+        nrs->wSolver->dim = nrs->dim;
+        nrs->wSolver->elementType = nrs->elementType;
+        nrs->wSolver->NBCType = NBCType;
+        nrs->wSolver->BCType = (int*) calloc(NBCType,sizeof(int));
+        memcpy(nrs->wSolver->BCType,wBCType,NBCType * sizeof(int));
+        nrs->wSolver->var_coeff = nrs->var_coeff;
+        nrs->wSolver->lambda = nrs->ellipticCoeff;
+        nrs->wSolver->o_lambda = nrs->o_ellipticCoeff;
+        nrs->wSolver->loffset = 0;
+
+        ellipticSolveSetup(nrs->wSolver, kernelInfoV);
+      }
+    }
+  } // flow
+
+  if (nrs->flow) {
+    if (mesh->rank == 0) printf("================ ELLIPTIC SETUP PRESSURE ================\n");
+
+    int* pBCType = (int*) calloc(NBCType, sizeof(int));
+    for (int bID = 1; bID <= nbrBIDs; bID++)
+      pBCType[bID] = bcMap::type(bID, "pressure");
+
+    nrs->pOptions = options;
+    nrs->pOptions.setArgs("KRYLOV SOLVER",        options.getArgs("PRESSURE KRYLOV SOLVER"));
+    nrs->pOptions.setArgs("SOLVER TOLERANCE",     options.getArgs("PRESSURE SOLVER TOLERANCE"));
+    nrs->pOptions.setArgs("DISCRETIZATION",       options.getArgs("PRESSURE DISCRETIZATION"));
+    nrs->pOptions.setArgs("BASIS",                options.getArgs("PRESSURE BASIS"));
+    nrs->pOptions.setArgs("PRECONDITIONER",       options.getArgs("PRESSURE PRECONDITIONER"));
+    nrs->pOptions.setArgs("MULTIGRID COARSENING", options.getArgs("PRESSURE MULTIGRID COARSENING"));
+    nrs->pOptions.setArgs("MULTIGRID SMOOTHER",   options.getArgs("PRESSURE MULTIGRID SMOOTHER"));
+    nrs->pOptions.setArgs("MULTIGRID DOWNWARD SMOOTHER",
+                          options.getArgs("PRESSURE MULTIGRID DOWNWARD SMOOTHER"));
+    nrs->pOptions.setArgs("MULTIGRID UPWARD SMOOTHER",
+                          options.getArgs("PRESSURE MULTIGRID UPWARD SMOOTHER"));
+    nrs->pOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE",
+                          options.getArgs("PRESSURE MULTIGRID CHEBYSHEV DEGREE"));
+    nrs->pOptions.setArgs("PARALMOND CYCLE",      options.getArgs("PRESSURE PARALMOND CYCLE"));
+    nrs->pOptions.setArgs("PARALMOND SMOOTHER",   options.getArgs("PRESSURE MULTIGRID SMOOTHER"));
+    nrs->pOptions.setArgs("PARALMOND PARTITION",  options.getArgs("PRESSURE PARALMOND PARTITION"));
+    nrs->pOptions.setArgs("PARALMOND CHEBYSHEV DEGREE",
+                          options.getArgs("PRESSURE PARALMOND CHEBYSHEV DEGREE"));
+    nrs->pOptions.setArgs("PARALMOND AGGREGATION STRATEGY",
+                          options.getArgs("PRESSURE PARALMOND AGGREGATION STRATEGY"));
+    nrs->pOptions.setArgs("RESIDUAL PROJECTION", options.getArgs("PRESSURE RESIDUAL PROJECTION"));
+    nrs->pOptions.setArgs("RESIDUAL PROJECTION VECTORS",
+                          options.getArgs("PRESSURE RESIDUAL PROJECTION VECTORS"));
+    nrs->pOptions.setArgs("RESIDUAL PROJECTION START",
+                          options.getArgs("PRESSURE RESIDUAL PROJECTION START"));
+    nrs->pOptions.setArgs("MULTIGRID VARIABLE COEFFICIENT", "FALSE");
+
+    nrs->pSolver = new elliptic_t();
+    nrs->pSolver->blockSolver = 0;
+    nrs->pSolver->Nfields = 1;
+    nrs->pSolver->Ntotal = nrs->fieldOffset;
+    nrs->pSolver->wrk = scratch + nrs->ellipticWrkOffset;
+    nrs->pSolver->o_wrk = o_scratch.slice(nrs->ellipticWrkOffset * sizeof(dfloat));
+    nrs->pSolver->mesh = mesh;
+    nrs->pSolver->dim = nrs->dim;
+    nrs->pSolver->elementType = nrs->elementType;
+    nrs->pSolver->BCType = (int*) calloc(nbrBIDs + 1,sizeof(int));
+    memcpy(nrs->pSolver->BCType,pBCType,(nbrBIDs + 1) * sizeof(int));
+    nrs->pSolver->var_coeff = 1;
+    //// coeff used by ellipticSetup to detect allNeumann
+    // and coeff[0] to setup MG levels
+    for (int i = 0; i < 2 * nrs->fieldOffset; i++) nrs->ellipticCoeff[i] = 0;
+    nrs->pSolver->lambda = nrs->ellipticCoeff;
+    nrs->pSolver->o_lambda = nrs->o_ellipticCoeff;
+    nrs->pSolver->loffset = 0;
+
+    string p_mglevels;
+    if(nrs->pOptions.getArgs("MULTIGRID COARSENING", p_mglevels)) {
+      std::vector<std::string> mgLevelList;
+      mgLevelList = serializeString(p_mglevels);
+      nrs->pSolver->nLevels = mgLevelList.size();
+      nrs->pSolver->levels = (int*) calloc(nrs->pSolver->nLevels,sizeof(int));
+      for(int i = 0; i < nrs->pSolver->nLevels; ++i)
+        nrs->pSolver->levels[i] = std::atoi(mgLevelList.at(i).c_str());
+
+      if(nrs->pSolver->levels[0] > mesh->N || 
+         nrs->pSolver->levels[nrs->pSolver->nLevels-1] < 1) {
+        if(mesh->rank == 0) printf("ERROR: Invalid multigrid coarsening!\n");
+        EXIT(1);
+      }
+      nrs->pOptions.setArgs("MULTIGRID COARSENING","CUSTOM");
+    } else if(nrs->pOptions.compareArgs("MULTIGRID DOWNWARD SMOOTHER","ASM") ||
+              nrs->pOptions.compareArgs("MULTIGRID DOWNWARD SMOOTHER","RAS")) {
+      std::map<int,std::vector<int> > mg_level_lookup =
+      {
+        {1,{1}},
+        {2,{2,1}},
+        {3,{3,1}},
+        {4,{4,2,1}},
+        {5,{5,3,1}},
+        {6,{6,3,1}},
+        {7,{7,3,1}},
+        {8,{8,5,1}},
+        {9,{9,5,1}},
+        {10,{10,6,1}},
+        {11,{11,6,1}},
+        {12,{12,7,1}},
+        {13,{13,7,1}},
+        {14,{14,8,1}},
+        {15,{15,9,1}},
+      };
+
+      const std::vector<int>& levels = mg_level_lookup.at(mesh->Nq - 1);
+      nrs->pSolver->nLevels = levels.size();
+      nrs->pSolver->levels = (int*) calloc(nrs->pSolver->nLevels,sizeof(int));
+      for(int i = 0; i < nrs->pSolver->nLevels; ++i)
+        nrs->pSolver->levels[i] = levels.at(i);
+      nrs->pOptions.setArgs("MULTIGRID COARSENING","CUSTOM");
+    } else if(nrs->pOptions.compareArgs("MULTIGRID DOWNWARD SMOOTHER","JAC")) {
+      std::map<int,std::vector<int> > mg_level_lookup =
+      {
+        {1,{1}},
+        {2,{2,1}},
+        {3,{3,1}},
+        {4,{4,2,1}},
+        {5,{5,3,1}},
+        {6,{6,4,2,1}},
+        {7,{7,5,3,1}},
+        {8,{8,6,4,1}},
+        {9,{9,7,5,1}},
+        {10,{10,8,5,1}},
+        {11,{11,9,5,1}},
+        {12,{12,10,5,1}},
+        {13,{13,11,5,1}},
+        {14,{14,12,5,1}},
+        {15,{15,13,5,1}},
+      };
+
+      const std::vector<int>& levels = mg_level_lookup.at(mesh->Nq - 1);
+      nrs->pSolver->nLevels = levels.size();
+      nrs->pSolver->levels = (int*) calloc(nrs->pSolver->nLevels,sizeof(int));
+      for(int i = 0; i < nrs->pSolver->nLevels; ++i)
+        nrs->pSolver->levels[i] = levels.at(i);
+      nrs->pOptions.setArgs("MULTIGRID COARSENING","CUSTOM");
+    }
+
+    nrs->pSolver->options = nrs->pOptions;
+    ellipticSolveSetup(nrs->pSolver, kernelInfoP);
+
+  } // flow
+
+  return nrs;
+}
+
+// Creates the scalar (convection-diffusion) solver state that accompanies the flow solver nrs.
+// Time-stepper coefficients, work arrays and the velocity field are shared with nrs; scalar
+// storage, material properties, per-scalar boundary maps and the cds kernels are set up here.
+// options is taken by value and copied per scalar; kernelInfoH is not referenced in this body.
+// The cds_t is allocated with new and returned; nothing in this function releases it.
+static cds_t* cdsSetup(nrs_t* nrs, mesh_t* mesh, setupAide options, occa::properties &kernelInfoH)
+{
+  cds_t* cds = new cds_t();
+  cds->mesh = mesh;
+
+  // getenv() yields NULL for an unset variable and std::string must not be built from NULL
+  const char* installDirEnv = getenv("NEKRS_INSTALL_DIR");
+  if(!installDirEnv) {
+    if(mesh->rank == 0) printf("ERROR: NEKRS_INSTALL_DIR is not set!\n");
+    EXIT(1);
+  }
+  const string install_dir(installDirEnv);
+
+  // set mesh, options
+  cds->meshV       = nrs->mesh;
+  cds->elementType = nrs->elementType;
+  cds->dim         = nrs->dim;
+  cds->NVfields    = nrs->NVfields;
+  cds->NSfields    = nrs->Nscalar;
+
+  cds->extbdfA = nrs->extbdfA;
+  cds->extbdfB = nrs->extbdfB;
+  cds->extbdfC = nrs->extbdfC;
+  cds->extC    = nrs->extC;
+
+  cds->Nstages       = nrs->Nstages;
+  cds->temporalOrder = nrs->temporalOrder;
+
+  // time stepper (device coefficient arrays are shared with nrs)
+  cds->o_rkC     = nrs->o_rkC;
+  cds->o_extbdfA = nrs->o_extbdfA;
+  cds->o_extbdfB = nrs->o_extbdfB;
+  cds->o_extbdfC = nrs->o_extbdfC;
+  cds->o_extC    = nrs->o_extC;
+
+  cds->o_usrwrk = &(nrs->o_usrwrk);
+
+  dlong Nlocal = mesh->Np * mesh->Nelements;
+  dlong Ntotal = mesh->Np * (mesh->Nelements + mesh->totalHaloPairs);
+  cds->Nlocal  = Nlocal;
+  cds->Ntotal  = Ntotal;
+
+  cds->vFieldOffset = nrs->fieldOffset;
+  cds->fieldOffset  = nrs->fieldOffset;
+  cds->Nblock       = (Nlocal + BLOCKSIZE - 1) / BLOCKSIZE;
+
+  cds->o_wrk0 = nrs->o_wrk0;
+  cds->o_wrk1 = nrs->o_wrk1;
+  cds->o_wrk2 = nrs->o_wrk2;
+  cds->o_wrk3 = nrs->o_wrk3;
+  cds->o_wrk4 = nrs->o_wrk4;
+  cds->o_wrk5 = nrs->o_wrk5;
+  cds->o_wrk6 = nrs->o_wrk6;
+
+  cds->gsh = nrs->gsh;
+
+  if(nrs->cht) {
+    meshParallelGatherScatterSetup(mesh, cds->Nlocal, mesh->globalIds, mesh->comm, 0);
+    oogs_mode oogsMode = OOGS_AUTO;
+    if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT;
+    cds->gshT = oogs::setup(mesh->ogs, 1, cds->fieldOffset, ogsDfloat, NULL, oogsMode);
+  } else {
+    cds->gshT = cds->gsh;
+  }
+
+  // build mass + inverse mass matrix
+  dfloat* lumpedMassMatrix = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
+  for(hlong e = 0; e < mesh->Nelements; ++e)
+    for(int n = 0; n < mesh->Np; ++n)
+      lumpedMassMatrix[e * mesh->Np + n] = mesh->vgeo[e * mesh->Np * mesh->Nvgeo + JWID * mesh->Np + n];
+  ogsGatherScatter(lumpedMassMatrix, ogsDfloat, ogsAdd, mesh->ogs);
+  mesh->o_LMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
+  mesh->o_LMM.copyTo(mesh->LMM);
+  for(int n = 0; n < mesh->Np * mesh->Nelements; ++n)
+    lumpedMassMatrix[n] = 1. / lumpedMassMatrix[n];
+  mesh->o_invLMM.copyFrom(lumpedMassMatrix, mesh->Nelements * mesh->Np * sizeof(dfloat));
+  mesh->o_invLMM.copyTo(mesh->invLMM);
+  free(lumpedMassMatrix);
+
+  // Solution storage at interpolation nodes
+  cds->U     = nrs->U; // Point to INS side Velocity
+  cds->S     = (dfloat*) calloc(cds->NSfields * cds->Nstages * cds->fieldOffset,sizeof(dfloat));
+  cds->BF    = (dfloat*) calloc(cds->NSfields * cds->fieldOffset,sizeof(dfloat));
+  cds->FS    = (dfloat*) calloc(cds->NSfields * cds->Nstages * cds->fieldOffset,sizeof(dfloat));
+
+  cds->Nsubsteps = nrs->Nsubsteps;
+  if(cds->Nsubsteps) {
+    cds->SNrk   = nrs->SNrk;
+    cds->Srka   = nrs->Srka;
+    cds->Srkb   = nrs->Srkb;
+    cds->Srkc   = nrs->Srkc;
+    cds->o_Srka = nrs->o_Srka;
+    cds->o_Srkb = nrs->o_Srkb;
+  }
+
+  cds->dt  = nrs->dt;
+  cds->sdt = nrs->sdt;
+
+  cds->prop = (dfloat*) calloc(cds->NSfields * 2 * cds->fieldOffset,sizeof(dfloat));
+  for(int is = 0; is < cds->NSfields; is++) {
+    std::stringstream ss;
+    ss << std::setfill('0') << std::setw(2) << is;
+    string sid = ss.str();
+
+    if(options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) continue;
+
+    dfloat diff = 1;
+    dfloat rho = 1;
+    options.getArgs("SCALAR" + sid + " DIFFUSIVITY", diff);
+    options.getArgs("SCALAR" + sid + " DENSITY", rho);
+
+    const dlong off = cds->NSfields * cds->fieldOffset;
+    for (int e = 0; e < mesh->Nelements; e++)
+      for (int n = 0; n < mesh->Np; n++) {
+        cds->prop[0 * off + is * cds->fieldOffset + e * mesh->Np + n] = diff;
+        cds->prop[1 * off + is * cds->fieldOffset + e * mesh->Np + n] = rho;
+      }
+  }
+  cds->o_prop = mesh->device.malloc(cds->NSfields * 2 * cds->fieldOffset * sizeof(dfloat), cds->prop);
+  cds->o_diff = cds->o_prop.slice(0 * cds->NSfields * cds->fieldOffset * sizeof(dfloat));
+  cds->o_rho  = cds->o_prop.slice(1 * cds->NSfields * cds->fieldOffset * sizeof(dfloat));
+
+  cds->var_coeff = 1; // use always var coeff elliptic
+  cds->ellipticCoeff   = nrs->ellipticCoeff;
+  cds->o_ellipticCoeff = nrs->o_ellipticCoeff;
+
+  cds->o_U  = nrs->o_U;
+  cds->o_Ue = nrs->o_Ue;
+  cds->o_S  = mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat), cds->S);
+  cds->o_Se = mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat));
+  cds->o_BF = mesh->device.malloc(cds->NSfields * cds->fieldOffset * sizeof(dfloat), cds->BF);
+  cds->o_FS = mesh->device.malloc(cds->NSfields * cds->Nstages * cds->fieldOffset * sizeof(dfloat), cds->FS);
+
+  for (int is = 0; is < cds->NSfields; is++) {
+    std::stringstream ss;
+    ss << std::setfill('0') << std::setw(2) << is;
+    string sid = ss.str();
+
+    cds->compute[is] = 1;
+    if (options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) {
+      cds->compute[is] = 0;
+      continue;
+    }
+
+    mesh_t* mesh = (is) ? cds->meshV : cds->mesh; // shadows the parameter: only first scalar can be a CHT mesh
+
+    cds->options[is] = options;
+
+    cds->options[is].setArgs("KRYLOV SOLVER", options.getArgs("SCALAR SOLVER"));
+    cds->options[is].setArgs("DISCRETIZATION", options.getArgs("SCALAR DISCRETIZATION"));
+    cds->options[is].setArgs("BASIS", options.getArgs("SCALAR BASIS"));
+    cds->options[is].setArgs("PRECONDITIONER", options.getArgs("SCALAR" + sid + " PRECONDITIONER"));
+    cds->options[is].setArgs("SOLVER TOLERANCE", options.getArgs("SCALAR" + sid +  " SOLVER TOLERANCE"));
+    cds->options[is].setArgs("RESIDUAL PROJECTION",  options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION"));
+    cds->options[is].setArgs("RESIDUAL PROJECTION VECTORS",  options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION VECTORS"));
+    cds->options[is].setArgs("RESIDUAL PROJECTION START",  options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION START"));
+
+    // setup boundary mapping; largeNumber marks nodes that lie on no boundary face
+    dfloat largeNumber = 1 << 20;
+    cds->mapB[is] = (int*) calloc(mesh->Nelements * mesh->Np,sizeof(int));
+    int* mapB = cds->mapB[is];
+    for (int e = 0; e < mesh->Nelements; e++)
+      for (int n = 0; n < mesh->Np; n++) mapB[n + e * mesh->Np] = largeNumber;
+
+    cds->EToB[is] = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int));
+    int* EToB = cds->EToB[is];
+
+    int cnt = 0;
+    for (int e = 0; e < mesh->Nelements; e++)
+      for (int f = 0; f < mesh->Nfaces; f++) {
+        int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "scalar" + sid);
+        EToB[cnt] = bc;
+        if (bc > 0) {
+          for (int n = 0; n < mesh->Nfp; n++) {
+            int fid = mesh->faceNodes[n + f * mesh->Nfp];
+            mapB[fid + e * mesh->Np] = mymin(bc,mapB[fid + e * mesh->Np]); // Dirichlet wins
+          }
+        }
+        cnt++;
+      }
+
+    ogsGatherScatter(mapB, ogsInt, ogsMin, mesh->ogs);
+
+    for (int n = 0; n < mesh->Nelements * mesh->Np; n++)
+      if (mapB[n] == largeNumber) mapB[n] = 0;
+
+    cds->o_EToB[is] = mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int), EToB);
+    cds->o_mapB[is] = mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(int), mapB);
+  }
+
+  // build kernels
+  occa::properties kernelInfo = *nrs->kernelInfo;
+  occa::properties kernelInfoBC = kernelInfo;
+  //kernelInfo["defines/" "p_NSfields"]  = cds->NSfields;
+
+  string fileName, kernelName;
+  const string suffix = "Hex3D";
+  const string oklpath = install_dir + "/okl/core/";
+
+  // Resolve the user boundary-data header on every rank before the staggered build below, so a
+  // failure exits all ranks together. realpath(..., NULL) mallocs its result or returns NULL.
+  string boundaryHeaderFileName;
+  options.getArgs("DATA FILE", boundaryHeaderFileName);
+  char* boundaryHeaderPath = realpath(boundaryHeaderFileName.c_str(), NULL);
+  if(!boundaryHeaderPath) {
+    if(mesh->rank == 0) printf("ERROR: Cannot resolve DATA FILE %s!\n", boundaryHeaderFileName.c_str());
+    EXIT(1);
+  }
+
+  MPI_Barrier(mesh->comm);
+  double tStartLoadKernel = MPI_Wtime();
+  if(mesh->rank == 0)  printf("loading cds kernels ... "); fflush(stdout);
+
+  for (int r = 0; r < 2; r++) {
+    if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
+      fileName = oklpath + "cdsAdvection" + suffix + ".okl";
+
+      const string bcDataFile = install_dir + "/include/core/bcData.h";
+      kernelInfoBC["includes"] += bcDataFile.c_str();
+      kernelInfoBC["includes"] += boundaryHeaderPath;
+
+      kernelName = "cdsStrongAdvectionVolume" + suffix;
+      cds->advectionStrongVolumeKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      kernelName = "cdsStrongAdvectionCubatureVolume" + suffix;
+      cds->advectionStrongCubatureVolumeKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      // ===========================================================================
+
+      fileName = oklpath + "math.okl";
+      kernelName = "fill";
+      cds->fillKernel = mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      kernelName = "maskCopy";
+      cds->maskCopyKernel = mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName   = oklpath + "cdsSumMakef" + suffix + ".okl";
+      kernelName = "cdsSumMakef" + suffix;
+      cds->sumMakefKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      fileName = oklpath + "cdsHelmholtzBC" + suffix + ".okl";
+      kernelName = "cdsHelmholtzBC" + suffix;
+      cds->helmholtzRhsBCKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfoBC);
+
+      kernelName = "cdsDirichletBC";
+      cds->dirichletBCKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfoBC);
+
+      fileName = oklpath + "setEllipticCoeff.okl";
+      kernelName = "setEllipticCoeff";
+      cds->setEllipticCoeffKernel =
+        mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      fileName = oklpath + "cdsMassMatrix.okl";
+      kernelName = "cdsMassMatrix" + suffix;
+      cds->massMatrixKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      kernelName = "cdsInvMassMatrix" + suffix;
+      cds->invMassMatrixKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+      fileName = oklpath + "cdsFilterRT" + suffix + ".okl";
+      kernelName = "cdsFilterRT" + suffix;
+      cds->filterRTKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      fileName = install_dir + "/okl/core/scaledAdd.okl";
+      kernelName = "scaledAddwOffset";
+      cds->scaledAddKernel =
+        mesh->device.buildKernel(fileName.c_str(), kernelName.c_str(), kernelInfo);
+
+      if(cds->Nsubsteps) {
+        fileName = oklpath + "cdsSubCycle" + suffix + ".okl";
+        kernelName = "cdsSubCycleStrongCubatureVolume" + suffix;
+        cds->subCycleStrongCubatureVolumeKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+        kernelName = "cdsSubCycleStrongVolume" + suffix;
+        cds->subCycleStrongVolumeKernel = mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+
+        fileName = oklpath + "cdsSubCycleRKUpdate.okl";
+        kernelName = "cdsSubCycleLSERKUpdate";
+        if(cds->SNrk == 4) kernelName = "cdsSubCycleERKUpdate";
+        cds->subCycleRKUpdateKernel =  mesh->device.buildKernel(fileName, kernelName, kernelInfo);
+      }
+    }
+    MPI_Barrier(mesh->comm);
+  }
+  free(boundaryHeaderPath); // release the realpath() allocation once the kernels are built
+
+  MPI_Barrier(mesh->comm);
+  if(mesh->rank == 0)  printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); fflush(stdout);
+
+  return cds;
+}
diff --git a/src/core/insSetup.hpp b/src/core/setup.hpp
similarity index 52%
rename from src/core/insSetup.hpp
rename to src/core/setup.hpp
index ee494db4e..b3ba7d401 100644
--- a/src/core/insSetup.hpp
+++ b/src/core/setup.hpp
@@ -1,7 +1,7 @@
 #if !defined(nekrs_inssetup_hpp_)
 #define nekrs_inssetup_hpp_
 
-#include "nekrs.hpp"
-ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buildOnly);
+#include "nrs.hpp"
+nrs_t* nrsSetup(MPI_Comm comm, occa::device device, setupAide &options, int buildOnly);
 
 #endif
diff --git a/src/libP/src/setupAide.c b/src/core/setupAide.cpp
similarity index 100%
rename from src/libP/src/setupAide.c
rename to src/core/setupAide.cpp
diff --git a/src/libP/include/setupAide.hpp b/src/core/setupAide.hpp
similarity index 100%
rename from src/libP/include/setupAide.hpp
rename to src/core/setupAide.hpp
diff --git a/src/libP/include/setupAide.tpp b/src/core/setupAide.tpp
similarity index 100%
rename from src/libP/include/setupAide.tpp
rename to src/core/setupAide.tpp
diff --git a/src/core/timer.cpp b/src/core/timer.cpp
index 7fc989f3a..9fba328c2 100644
--- a/src/core/timer.cpp
+++ b/src/core/timer.cpp
@@ -3,8 +3,6 @@
 #include <map>
 #include <algorithm>
 
-#include "occa.hpp"
-#include "mpi.h"
 #include "timer.hpp"
 
 namespace timer
@@ -261,32 +259,34 @@ void printRunStat()
     if(dEtime[11] > 0)
     std::cout << "  checkpointing         " << dEtime[11]<< " s\n";
 
-    std::cout << "  total solve           " << dEtime[9] << " s\n"
-  	      << "  makef                 " << dEtime[0] << " s\n"
-              << "  velocitySolve         " << dEtime[1] << " s\n"
-              << "  pressureSolve         " << dEtime[2] << " s\n";
+    if(dEtime[12] > 0)
+    std::cout << "  udfExecuteStep        " << dEtime[12] << " s\n";
 
-    if(dEtime[6] > 0)
-    std::cout << "    residual projection " << dEtime[6] << " s\n";
+    std::cout << "  computation           " << dEtime[9]+dEtime[12] << " s\n"
+  	      << "  makef                 " << dEtime[0] << " s\n";
+    if(dEtime[13] > 0)
+    std::cout << "    udfUEqnSource       " << dEtime[13] << " s\n";
+    std::cout << "  velocitySolve         " << dEtime[1] << " s\n"
+              << "  pressureSolve         " << dEtime[2] << " s\n";
 
     std::cout << "    preconditioner      " << dEtime[5] << " s\n"
               << "      coarse grid       " << hEtime[0] << " s\n";
 
-    if(dEtime[4] > 0)
-    std::cout << "  makeq                 " << dEtime[3] << " s\n"
-              << "  scalarSolve           " << dEtime[4] << " s\n"
+    if(dEtime[4] > 0) {
+    std::cout << "  makeq                 " << dEtime[3] << " s\n";
+    std::cout << "    udfSEqnSource       " << dEtime[14] << " s\n";
+    }
+    if(dEtime[14] > 0)
+    std::cout << "  scalarSolve           " << dEtime[4] << " s\n"
               << std::endl;
 
-    if(dEtime[12] > 0)
-    std::cout << "  udfExecuteStep        " << dEtime[12] << " s\n";
-    if(dEtime[13] > 0)
-    std::cout << "  udfUEqnSource         " << dEtime[13] << " s\n";
-    if(dEtime[14] > 0)
-    std::cout << "  udfSEqnSource         " << dEtime[14] << " s\n";
     if(dEtime[15] > 0)
     std::cout << "  udfProperties         " << dEtime[15] << " s\n"
               << std::endl;
 
+    if(dEtime[6] > 0)
+    std::cout << "  residual projection   " << dEtime[6] << " s\n";
+
     if(hEtime[1] > 0)
     std::cout << "  gsMPI                 " << hEtime[1] << " s (without overlap)\n";
     if(dEtime[8] > 0)
diff --git a/src/core/timer.hpp b/src/core/timer.hpp
index a0579a7b5..062bb3388 100644
--- a/src/core/timer.hpp
+++ b/src/core/timer.hpp
@@ -3,8 +3,7 @@
 
 #include <string>
 
-#include "occa.hpp"
-#include "mpi.h"
+#include "nrssys.hpp"
 
 namespace timer
 {
diff --git a/src/core/tombo.cpp b/src/core/tombo.cpp
deleted file mode 100644
index ccba2907b..000000000
--- a/src/core/tombo.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-#include "nrs.hpp"
-#include "udf.hpp"
-
-namespace tombo
-{
-occa::memory pressureSolve(ins_t* ins, dfloat time)
-{
-  mesh_t* mesh = ins->mesh;
-
-  //enforce Dirichlet BCs
-  ins->fillKernel((1+ins->NVfields)*ins->fieldOffset, std::numeric_limits<dfloat>::min(), ins->o_wrk6);
-  for (int sweep = 0; sweep < 2; sweep++) {
-    ins->pressureDirichletBCKernel(mesh->Nelements,
-                                   time,
-                                   ins->dt,
-                                   ins->fieldOffset,
-                                   mesh->o_sgeo,
-                                   mesh->o_x,
-                                   mesh->o_y,
-                                   mesh->o_z,
-                                   mesh->o_vmapM,
-                                   mesh->o_EToB,
-                                   ins->o_EToB,
-                                   ins->o_usrwrk,
-                                   ins->o_U,
-                                   ins->o_P,
-                                   ins->o_wrk6);
-
-    ins->velocityDirichletBCKernel(mesh->Nelements,
-                                   ins->fieldOffset,
-                                   time,
-                                   mesh->o_sgeo,
-                                   mesh->o_x,
-                                   mesh->o_y,
-                                   mesh->o_z,
-                                   mesh->o_vmapM,
-                                   mesh->o_EToB,
-                                   ins->o_EToB,
-                                   ins->o_usrwrk,
-                                   ins->o_U,
-                                   ins->o_wrk7);
-
-    //take care of Neumann-Dirichlet shared edges across elements
-    if (sweep == 0) oogs::startFinish(ins->o_wrk6, 1+ins->NVfields, ins->fieldOffset, ogsDfloat, ogsMax, ins->gsh);
-    if (sweep == 1) oogs::startFinish(ins->o_wrk6, 1+ins->NVfields, ins->fieldOffset, ogsDfloat, ogsMin, ins->gsh);
-  }
-
-  if (ins->pSolver->Nmasked) ins->maskCopyKernel(ins->pSolver->Nmasked, 0, ins->pSolver->o_maskIds,
-                                                 ins->o_wrk6, ins->o_P); 
-
-  if (ins->uvwSolver) {
-    if (ins->uvwSolver->Nmasked) ins->maskCopyKernel(ins->uvwSolver->Nmasked, 0*ins->fieldOffset, ins->uvwSolver->o_maskIds,
-                                                     ins->o_wrk7, ins->o_U);
-  } else {
-    if (ins->uSolver->Nmasked) ins->maskCopyKernel(ins->uSolver->Nmasked, 0*ins->fieldOffset, ins->uSolver->o_maskIds, 
-                                                   ins->o_wrk7, ins->o_U);
-    if (ins->vSolver->Nmasked) ins->maskCopyKernel(ins->vSolver->Nmasked, 1*ins->fieldOffset, ins->vSolver->o_maskIds, 
-                                                   ins->o_wrk7, ins->o_U);
-    if (ins->wSolver->Nmasked) ins->maskCopyKernel(ins->wSolver->Nmasked, 2*ins->fieldOffset, ins->wSolver->o_maskIds, 
-                                                   ins->o_wrk7, ins->o_U);
-  }
-
-  ins->curlKernel(mesh->Nelements,
-                  mesh->o_vgeo,
-                  mesh->o_Dmatrices,
-                  ins->fieldOffset,
-                  ins->o_Ue,
-                  ins->o_wrk0);
-
-  oogs::startFinish(ins->o_wrk0, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh);
-
-  ins->invMassMatrixKernel(
-    mesh->Nelements,
-    ins->fieldOffset,
-    ins->NVfields,
-    mesh->o_vgeo,
-    ins->mesh->o_invLMM,
-    ins->o_wrk0);
-
-  ins->curlKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    ins->o_wrk0,
-    ins->o_wrk3);
-
-  ins->gradientVolumeKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    ins->o_div,
-    ins->o_wrk0);
-
-  //if (ins->options.compareArgs("VARIABLE VISCOSITY", "TRUE"))
-  if(ins->options.compareArgs("STRESSFORMULATION", "TRUE"))
-    ins->pressureStressKernel(
-         mesh->Nelements,
-         mesh->o_vgeo,
-         mesh->o_Dmatrices,
-         ins->fieldOffset,
-         ins->o_mue,
-         ins->o_Ue,
-         ins->o_div,
-         ins->o_wrk3);
-
-  occa::memory o_irho = ins->o_ellipticCoeff;
-  ins->pressureRhsKernel(
-    mesh->Nelements * mesh->Np,
-    ins->fieldOffset,
-    ins->o_mue,
-    o_irho,
-    ins->o_BF,
-    ins->o_wrk3,
-    ins->o_wrk0,
-    ins->o_wrk6);
-
-  oogs::startFinish(ins->o_wrk6, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh);
-
-  ins->invMassMatrixKernel(
-    mesh->Nelements,
-    ins->fieldOffset,
-    ins->NVfields,
-    mesh->o_vgeo,
-    ins->mesh->o_invLMM,
-    ins->o_wrk6);
-
-  ins->divergenceVolumeKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    ins->o_wrk6,
-    ins->o_wrk3);
-
-  ins->pressureAddQtlKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    ins->g0 * ins->idt,
-    ins->o_div,
-    ins->o_wrk3);
-
-  ins->divergenceSurfaceKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_sgeo,
-    mesh->o_vmapM,
-    mesh->o_EToB,
-    ins->o_EToB,
-    time,
-    ins->g0 * ins->idt,
-    mesh->o_x,
-    mesh->o_y,
-    mesh->o_z,
-    ins->fieldOffset,
-    ins->o_usrwrk,
-    ins->o_wrk6,
-    ins->o_U,
-    ins->o_wrk3);
-
-  oogs::startFinish(ins->o_wrk3, 1, 0, ogsDfloat, ogsAdd, ins->gsh);
-
-  ins->o_wrk1.copyFrom(ins->o_P, ins->Ntotal * sizeof(dfloat));
-  ins->NiterP = ellipticSolve(ins->pSolver, ins->presTOL, ins->o_wrk3, ins->o_wrk1);
-
-  return ins->o_wrk1;
-}
-
-occa::memory velocitySolve(ins_t* ins, dfloat time)
-{
-  mesh_t* mesh = ins->mesh;
-
-  dfloat scale = -1./3;
-  if(ins->options.compareArgs("STRESSFORMULATION", "TRUE")) scale = 2./3;
-
-#if 0
-  ins->PQKernel(
-       mesh->Nelements*mesh->Np,
-       -scale,
-       ins->o_mue,
-       ins->o_div,
-       ins->o_P,
-       ins->o_wrk3); 
-
-  ins->gradientVolumeKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    ins->o_wrk3,
-    ins->o_wrk0);
-#else
-  ins->mueDivKernel(
-       mesh->Nelements*mesh->Np,
-       scale,
-       ins->o_mue,
-       ins->o_div,
-       ins->o_wrk3); 
-
-  ins->gradientVolumeKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    ins->o_wrk3,
-    ins->o_wrk0);
-
-  ins->wgradientVolumeKernel(
-    mesh->Nelements,
-    mesh->o_vgeo,
-    mesh->o_Dmatrices,
-    ins->fieldOffset,
-    ins->o_P,
-    ins->o_wrk3); 
-
-  ins->scaledAddKernel(
-    ins->NVfields*ins->fieldOffset,
-    1.0,
-    0*ins->fieldOffset,
-    ins->o_wrk3,
-    -1.0,
-    0*ins->fieldOffset,
-    ins->o_wrk0);
-#endif
-
-  ins->velocityNeumannBCKernel(
-       mesh->Nelements,
-       ins->fieldOffset,
-       mesh->o_sgeo,
-       mesh->o_vmapM,
-       mesh->o_EToB,
-       ins->o_EToB,
-       time,
-       mesh->o_x,
-       mesh->o_y,
-       mesh->o_z,
-       ins->o_usrwrk,
-       ins->o_U,
-       ins->o_wrk0); 
-
-  ins->velocityRhsKernel(
-    mesh->Nelements,
-    ins->fieldOffset,
-    ins->o_BF,
-    ins->o_wrk0,
-    ins->o_rho,
-    ins->o_wrk3);
-
-  oogs::startFinish(ins->o_wrk3, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh);
-
-  if(ins->options.compareArgs("VELOCITY INITIAL GUESS DEFAULT", "EXTRAPOLATION")) { 
-    ins->o_wrk0.copyFrom(ins->o_Ue, ins->NVfields * ins->fieldOffset * sizeof(dfloat));
-    if (ins->uvwSolver) {
-      if (ins->uvwSolver->Nmasked) ins->maskCopyKernel(ins->uvwSolver->Nmasked, 0*ins->fieldOffset, ins->uvwSolver->o_maskIds,
-                                                       ins->o_U, ins->o_wrk0);
-    } else {
-      if (ins->uSolver->Nmasked) ins->maskCopyKernel(ins->uSolver->Nmasked, 0*ins->fieldOffset, ins->uSolver->o_maskIds,
-                                                     ins->o_U, ins->o_wrk0);
-      if (ins->vSolver->Nmasked) ins->maskCopyKernel(ins->vSolver->Nmasked, 1*ins->fieldOffset, ins->vSolver->o_maskIds,
-                                                     ins->o_U, ins->o_wrk0);
-      if (ins->wSolver->Nmasked) ins->maskCopyKernel(ins->wSolver->Nmasked, 2*ins->fieldOffset, ins->wSolver->o_maskIds,
-                                                     ins->o_U, ins->o_wrk0);
-    }
-  } else {
-    ins->o_wrk0.copyFrom(ins->o_U, ins->NVfields * ins->fieldOffset * sizeof(dfloat));
-  }
-
-  if(ins->uvwSolver) {
-    ins->NiterU = ellipticSolve(ins->uvwSolver, ins->velTOL, ins->o_wrk3, ins->o_wrk0);
-  } else {
-    ins->NiterU = ellipticSolve(ins->uSolver, ins->velTOL, ins->o_wrk3, ins->o_wrk0);
-    ins->NiterV = ellipticSolve(ins->vSolver, ins->velTOL, ins->o_wrk4, ins->o_wrk1);
-    ins->NiterW = ellipticSolve(ins->wSolver, ins->velTOL, ins->o_wrk5, ins->o_wrk2);
-  }
-
-  return ins->o_wrk0;
-}
-
-} // namespace
diff --git a/src/core/tombo.hpp b/src/core/tombo.hpp
deleted file mode 100644
index c942012ef..000000000
--- a/src/core/tombo.hpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#if !defined(nekrs_tombo_hpp_)
-#define nekrs_tombo_hpp_
-
-#include "nekrs.hpp"
-
-namespace tombo
-{
-occa::memory pressureSolve(ins_t* ins, dfloat time);
-occa::memory velocitySolve(ins_t* ins, dfloat time);
-}
-
-#endif
diff --git a/src/core/inipp.hpp b/src/core/utils/inipp.hpp
similarity index 100%
rename from src/core/inipp.hpp
rename to src/core/utils/inipp.hpp
diff --git a/src/libP/src/mysort.c b/src/core/utils/mysort.cpp
similarity index 98%
rename from src/libP/src/mysort.c
rename to src/core/utils/mysort.cpp
index e5365b0fd..963d9f868 100644
--- a/src/libP/src/mysort.c
+++ b/src/core/utils/mysort.cpp
@@ -24,8 +24,9 @@
 
  */
 
-#include "mesh.h"
 #include <stdlib.h>
+#include "nrssys.hpp"
+#include "mesh.h"
 
 int isHigher(const void* a, const void* b)
 {
@@ -55,5 +56,4 @@ void mysort(hlong* data, int N, const char* order)
     qsort(data, N, sizeof(hlong), isHigher);
   else
     qsort(data, N, sizeof(hlong), isLower);
-
 }
diff --git a/src/core/occaDeviceConfig.cpp b/src/core/utils/occaHelpers.cpp
similarity index 56%
rename from src/core/occaDeviceConfig.cpp
rename to src/core/utils/occaHelpers.cpp
index b23c418ee..d08a78478 100644
--- a/src/core/occaDeviceConfig.cpp
+++ b/src/core/utils/occaHelpers.cpp
@@ -1,10 +1,35 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "omp.h"
-#include "mpi.h"
 #include "mesh.h"
 
 occa::device occaDeviceConfig(setupAide &options, MPI_Comm comm)
@@ -68,7 +93,27 @@ occa::device occaDeviceConfig(setupAide &options, MPI_Comm comm)
   omp_set_num_threads(Nthreads);
   //if(rank==0) printf("Number of OMP threads: %d\n", omp_get_num_threads());
 
-  occa::initTimer(device);
-
   return device;
 }
+
+
+void* occaHostMallocPinned(occa::device &device,
+                           size_t size,
+                           void* source,
+                           occa::memory &mem,
+                           occa::memory &h_mem)
+{
+  // First buffer (`mem`): `source` is forwarded to malloc only when non-NULL.
+  if(source == NULL)
+    mem = device.malloc(size);
+  else
+    mem = device.malloc(size, source);
+
+  // Second buffer (`h_mem`): same size, requested with the "mapped" property.
+  occa::properties mappedProps;
+  mappedProps["mapped"] = true;
+  h_mem = device.malloc(size, mappedProps);
+
+  // The returned pointer is whatever h_mem exposes for those properties.
+  return h_mem.ptr(mappedProps);
+}
diff --git a/src/libP/src/parallelSort.c b/src/core/utils/parallelSort.cpp
similarity index 97%
rename from src/libP/src/parallelSort.c
rename to src/core/utils/parallelSort.cpp
index ae9664310..0a89b508d 100644
--- a/src/libP/src/parallelSort.c
+++ b/src/core/utils/parallelSort.cpp
@@ -28,8 +28,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-/* use this for int */
-#include "mesh.h"
+#include "nrssys.hpp"
 
 void mergeLists(size_t sz,
                 int N1, char* v1,
@@ -47,14 +46,14 @@ void mergeLists(size_t sz,
       if(c == -1) {
         memcpy(v3 + n3 * sz, v1 + n1 * sz, sz);
         ++n1;
-      }else  {
+      }else {
         memcpy(v3 + n3 * sz, v2 + n2 * sz, sz);
         ++n2;
       }
-    }else if(n1 < N1)    {
+    }else if(n1 < N1) {
       memcpy(v3 + n3 * sz, v1 + n1 * sz, sz);
       ++n1;
-    }else if(n2 < N2)    {
+    }else if(n2 < N2) {
       memcpy(v3 + n3 * sz, v2 + n2 * sz, sz);
       ++n2;
     }
diff --git a/src/core/tinyexpr.c b/src/core/utils/tinyexpr.c
similarity index 100%
rename from src/core/tinyexpr.c
rename to src/core/utils/tinyexpr.c
diff --git a/src/core/tinyexpr.h b/src/core/utils/tinyexpr.h
similarity index 100%
rename from src/core/tinyexpr.h
rename to src/core/utils/tinyexpr.h
diff --git a/src/dummy.f b/src/dummy.f
deleted file mode 100644
index 1e78d0db0..000000000
--- a/src/dummy.f
+++ /dev/null
@@ -1,2 +0,0 @@
-      subroutine nekrs
-      end
diff --git a/src/libP/solvers/elliptic/elliptic.h b/src/elliptic/elliptic.h
similarity index 98%
rename from src/libP/solvers/elliptic/elliptic.h
rename to src/elliptic/elliptic.h
index e70e528f5..8d3178913 100644
--- a/src/libP/solvers/elliptic/elliptic.h
+++ b/src/elliptic/elliptic.h
@@ -31,21 +31,17 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include "mpi.h"
-#include "mesh2D.h"
+
+#include "nrssys.hpp"
 #include "mesh3D.h"
 #include "parAlmond.hpp"
 #include "ellipticPrecon.h"
 
-// block size for reduction (hard coded)
-#define blockSize 256
-
 #include "timer.hpp"
 #define ELLIPTIC_ENABLE_TIMER
 
 class ResidualProjection;
 
-extern "C" { // C Linkage
 typedef struct
 {
   int dim;
@@ -268,13 +264,12 @@ typedef struct
 #include "ellipticMultiGrid.h"
 #include "ellipticResidualProjection.h"
 
-elliptic_t* ellipticSetup(mesh2D* mesh, occa::properties kernelInfo, setupAide options);
 elliptic_t* ellipticBuildMultigridLevelFine(elliptic_t* elliptic);
 
 void ellipticPreconditioner(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_z);
 void ellipticPreconditionerSetup(elliptic_t* elliptic, ogs_t* ogs, occa::properties &kernelInfo);
 
-int  ellipticSolve(elliptic_t* elliptic, dfloat tol, occa::memory &o_r, occa::memory &o_x);
+int  ellipticSolve(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x);
 
 void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo);
 void ellipticBlockSolveSetup(elliptic_t* elliptic, occa::properties &kernelInfo);
@@ -335,7 +330,6 @@ void ellipticAx(elliptic_t* elliptic,
                 occa::memory &o_Aq,
                 const char* precision);
 
-
 dfloat ellipticWeightedNorm2(elliptic_t* elliptic, occa::memory &o_w, occa::memory &o_a);
 
 void ellipticBuildIpdg(elliptic_t* elliptic, int basisNp, dfloat* basis, dfloat lambda,
@@ -373,8 +367,6 @@ dfloat ellipticUpdatePCG(elliptic_t* elliptic, occa::memory &o_p, occa::memory &
 
 // dfloat maxEigSmoothAx(elliptic_t* elliptic, agmgLevel *level);
 
-#define maxNthreads 256
-
 extern "C"
 {
 void ellipticPlotVTUHex3D(mesh3D* mesh, char* fileNameBase, int fld);
@@ -508,6 +500,5 @@ void ellipticZeroMean(elliptic_t* elliptic, occa::memory &o_q);
 
 void ellipticThinOasSetup(elliptic_t* elliptic);
 mesh_t* create_extended_mesh(elliptic_t*);
-} // end C Linkage
 
 #endif
diff --git a/src/elliptic/ellipticBuildContinuous.cpp b/src/elliptic/ellipticBuildContinuous.cpp
new file mode 100644
index 000000000..490d5d8e7
--- /dev/null
+++ b/src/elliptic/ellipticBuildContinuous.cpp
@@ -0,0 +1,284 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#include "elliptic.h"
+
+// qsort comparator: order nonZero_t entries by row, then by column
+int parallelCompareRowColumn(const void* a, const void* b)
+{
+  const nonZero_t* lhs = (const nonZero_t*) a;
+  const nonZero_t* rhs = (const nonZero_t*) b;
+
+  // rows decide first; columns only matter when the rows are equal
+  if(lhs->row != rhs->row)
+    return (lhs->row < rhs->row) ? -1 : +1;
+  if(lhs->col != rhs->col)
+    return (lhs->col < rhs->col) ? -1 : +1;
+
+  return 0;
+}
+
+// Hex3D assembly routine; its definition follows later in this file
+void ellipticBuildContinuousHex3D(elliptic_t* elliptic, nonZero_t** A, dlong* nnz,
+                                  ogs_t** ogs, hlong* globalStarts);
+
+// Dispatch on the element type of `elliptic`. Only HEXAHEDRA is handled here;
+// for any other element type the call returns without touching the outputs.
+void ellipticBuildContinuous(elliptic_t* elliptic,
+                             nonZero_t** A,
+                             dlong* nnz,
+                             ogs_t** ogs,
+                             hlong* globalStarts)
+{
+  const bool isHex = (elliptic->elementType == HEXAHEDRA);
+  if(!isHex)
+    return;
+
+  ellipticBuildContinuousHex3D(elliptic, A, nnz, ogs, globalStarts);
+}
+
+void ellipticBuildContinuousHex3D(elliptic_t* elliptic,
+                                  nonZero_t** A,
+                                  dlong* nnz,
+                                  ogs_t** ogs,
+                                  hlong* globalStarts)
+{
+  // Build the full FEM matrix for hex elements as a distributed non-zero list.
+  // Out: *A = entries whose row this rank owns, sorted by (row, col) with
+  // duplicates summed; *nnz = their count; globalStarts[1..size] = running dof
+  // offsets (globalStarts[0] must be set by the caller). `ogs` is not used here.
+  mesh_t* mesh = elliptic->mesh;
+  const dfloat lambda = elliptic->lambda[0]; // constant coefficient case only
+  int rank = mesh->rank;
+
+  //use the masked gs handle to define a global ordering
+  // number of degrees of freedom on this rank (after gathering)
+  hlong Ngather = elliptic->ogs->Ngather;
+  dlong Ntotal  = mesh->Np * mesh->Nelements;
+
+  // create a global numbering system
+  hlong* globalIds = (hlong*) calloc(Ngather,sizeof(hlong));
+  int* owner     = (int*) calloc(Ngather,sizeof(int));
+
+  // every gathered degree of freedom has its own global id
+  MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts + 1, 1, MPI_HLONG, mesh->comm);
+  for(int r = 0; r < mesh->size; ++r)
+    globalStarts[r + 1] = globalStarts[r] + globalStarts[r + 1];
+
+  //use the offsets to set a consecutive global numbering
+  for (dlong n = 0; n < elliptic->ogs->Ngather; n++) {
+    globalIds[n] = n + globalStarts[rank];
+    owner[n] = rank;
+  }
+
+  //scatter this numbering to the original nodes
+  hlong* globalNumbering = (hlong*) calloc(Ntotal,sizeof(hlong));
+  int* globalOwners = (int*) calloc(Ntotal,sizeof(int));
+  for (dlong n = 0; n < Ntotal; n++) globalNumbering[n] = -1;
+  ogsScatter(globalNumbering, globalIds, ogsHlong, ogsAdd, elliptic->ogs);
+  ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs);
+
+  free(globalIds);
+  free(owner);
+
+  // 2. Build non-zeros of stiffness matrix (unassembled)
+  dlong nnzLocal = mesh->Np * mesh->Np * mesh->Nelements;
+  nonZero_t* sendNonZeros = (nonZero_t*) calloc(nnzLocal, sizeof(nonZero_t));
+  int* AsendCounts  = (int*) calloc(mesh->size, sizeof(int));
+  int* ArecvCounts  = (int*) calloc(mesh->size, sizeof(int));
+  int* AsendOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
+  int* ArecvOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
+
+  int* mask = (int*) calloc(mesh->Np * mesh->Nelements,sizeof(int));
+  for (dlong n = 0; n < elliptic->Nmasked; n++) mask[elliptic->maskIds[n]] = 1;
+
+  if(mesh->rank == 0) printf("Building full FEM matrix...");
+  fflush(stdout);
+
+  dlong cnt = 0;
+  for (dlong e = 0; e < mesh->Nelements; e++)
+    for (int nz = 0; nz < mesh->Nq; nz++)
+      for (int ny = 0; ny < mesh->Nq; ny++)
+        for (int nx = 0; nx < mesh->Nq; nx++) {
+          int idn = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+          if (mask[e * mesh->Np + idn]) continue; //skip masked nodes
+
+          for (int mz = 0; mz < mesh->Nq; mz++)
+            for (int my = 0; my < mesh->Nq; my++)
+              for (int mx = 0; mx < mesh->Nq; mx++) {
+                int idm = mx + my * mesh->Nq + mz * mesh->Nq * mesh->Nq;
+                if (mask[e * mesh->Np + idm]) continue; //skip masked nodes
+
+                int id;
+                dfloat val = 0.;
+
+                if ((ny == my) && (nz == mz)) {
+                  for (int k = 0; k < mesh->Nq; k++) {
+                    id = k + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+                    dfloat Grr = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G00ID * mesh->Np];
+
+                    val += Grr * mesh->D[nx + k * mesh->Nq] * mesh->D[mx + k * mesh->Nq];
+                  }
+                }
+
+                if (nz == mz) {
+                  id = mx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+                  dfloat Grs = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
+                  val += Grs * mesh->D[nx + mx * mesh->Nq] * mesh->D[my + ny * mesh->Nq];
+
+                  id = nx + my * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+                  dfloat Gsr = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
+                  val += Gsr * mesh->D[mx + nx * mesh->Nq] * mesh->D[ny + my * mesh->Nq];
+                }
+
+                if (ny == my) {
+                  id = mx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+                  dfloat Grt = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G02ID * mesh->Np];
+                  val += Grt * mesh->D[nx + mx * mesh->Nq] * mesh->D[mz + nz * mesh->Nq];
+
+                  id = nx + ny * mesh->Nq + mz * mesh->Nq * mesh->Nq;
+                  dfloat Gst = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G02ID * mesh->Np];
+                  val += Gst * mesh->D[mx + nx * mesh->Nq] * mesh->D[nz + mz * mesh->Nq];
+                }
+
+                if ((nx == mx) && (nz == mz)) {
+                  for (int k = 0; k < mesh->Nq; k++) {
+                    id = nx + k * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+                    dfloat Gss = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G11ID * mesh->Np];
+
+                    val += Gss * mesh->D[ny + k * mesh->Nq] * mesh->D[my + k * mesh->Nq];
+                  }
+                }
+
+                if (nx == mx) {
+                  id = nx + my * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+                  dfloat Gst = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G12ID * mesh->Np];
+                  val += Gst * mesh->D[ny + my * mesh->Nq] * mesh->D[mz + nz * mesh->Nq];
+
+                  id = nx + ny * mesh->Nq + mz * mesh->Nq * mesh->Nq;
+                  dfloat Gts = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G12ID * mesh->Np];
+                  val += Gts * mesh->D[my + ny * mesh->Nq] * mesh->D[nz + mz * mesh->Nq];
+                }
+
+                if ((nx == mx) && (ny == my)) {
+                  for (int k = 0; k < mesh->Nq; k++) {
+                    id = nx + ny * mesh->Nq + k * mesh->Nq * mesh->Nq;
+                    dfloat Gtt = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G22ID * mesh->Np];
+
+                    val += Gtt * mesh->D[nz + k * mesh->Nq] * mesh->D[mz + k * mesh->Nq];
+                  }
+                }
+
+                if ((nx == mx) && (ny == my) && (nz == mz)) {
+                  id = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
+                  dfloat JW = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + GWJID * mesh->Np];
+                  val += JW * lambda;
+                }
+
+                // pack non-zero
+                dfloat nonZeroThreshold = 1e-7;
+                if (fabs(val) >= nonZeroThreshold) {
+                  sendNonZeros[cnt].val = val;
+                  sendNonZeros[cnt].row = globalNumbering[e * mesh->Np + idn];
+                  sendNonZeros[cnt].col = globalNumbering[e * mesh->Np + idm];
+                  sendNonZeros[cnt].ownerRank = globalOwners[e * mesh->Np + idn];
+                  cnt++;
+                }
+              }
+        }
+
+  // Make the MPI_NONZERO_T data type
+  MPI_Datatype MPI_NONZERO_T;
+  MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
+  int blength[4] = {1, 1, 1, 1};
+  MPI_Aint addr[4], displ[4];
+  MPI_Get_address ( &(sendNonZeros[0]          ), addr + 0);
+  MPI_Get_address ( &(sendNonZeros[0].col      ), addr + 1);
+  MPI_Get_address ( &(sendNonZeros[0].ownerRank), addr + 2);
+  MPI_Get_address ( &(sendNonZeros[0].val      ), addr + 3);
+  displ[0] = 0;
+  displ[1] = addr[1] - addr[0];
+  displ[2] = addr[2] - addr[0];
+  displ[3] = addr[3] - addr[0];
+  MPI_Type_create_struct (4, blength, displ, dtype, &MPI_NONZERO_T);
+  MPI_Type_commit (&MPI_NONZERO_T);
+
+  // count how many non-zeros to send to each process
+  for(dlong n = 0; n < cnt; ++n)
+    AsendCounts[sendNonZeros[n].ownerRank]++;
+
+  // sort by row ordering
+  qsort(sendNonZeros, cnt, sizeof(nonZero_t), parallelCompareRowColumn);
+
+  // find how many nodes to expect (should use sparse version)
+  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh->comm);
+
+  // find send and recv offsets for gather
+  *nnz = 0;
+  for(int r = 0; r < mesh->size; ++r) {
+    AsendOffsets[r + 1] = AsendOffsets[r] + AsendCounts[r];
+    ArecvOffsets[r + 1] = ArecvOffsets[r] + ArecvCounts[r];
+    *nnz += ArecvCounts[r];
+  }
+
+  *A = (nonZero_t*) calloc(*nnz, sizeof(nonZero_t));
+
+  // determine number to receive
+  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, MPI_NONZERO_T,
+                (*A), ArecvCounts, ArecvOffsets, MPI_NONZERO_T,
+                mesh->comm);
+
+  // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
+  qsort((*A), *nnz, sizeof(nonZero_t), parallelCompareRowColumn);
+
+  // compress duplicates
+  cnt = 0;
+  for(dlong n = 1; n < *nnz; ++n) {
+    if((*A)[n].row == (*A)[cnt].row &&
+       (*A)[n].col == (*A)[cnt].col) {
+      (*A)[cnt].val += (*A)[n].val;
+    }else {
+      ++cnt;
+      (*A)[cnt] = (*A)[n];
+    }
+  }
+  if (*nnz) cnt++;
+  *nnz = cnt;
+
+  if(mesh->rank == 0) printf("done.\n");
+
+  MPI_Barrier(mesh->comm);
+  MPI_Type_free(&MPI_NONZERO_T);
+
+  free(sendNonZeros);
+  free(globalNumbering);
+  free(globalOwners);
+  free(mask);
+  free(AsendCounts);
+  free(ArecvCounts);
+  free(AsendOffsets);
+  free(ArecvOffsets);
+}
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildContinuousGalerkin.c b/src/elliptic/ellipticBuildContinuousGalerkin.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/ellipticBuildContinuousGalerkin.c
rename to src/elliptic/ellipticBuildContinuousGalerkin.cpp
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildIpdg.c b/src/elliptic/ellipticBuildIpdg.cpp
similarity index 99%
rename from src/libP/solvers/elliptic/src/ellipticBuildIpdg.c
rename to src/elliptic/ellipticBuildIpdg.cpp
index 22a3e4e44..953e4639a 100644
--- a/src/libP/solvers/elliptic/src/ellipticBuildIpdg.c
+++ b/src/elliptic/ellipticBuildIpdg.cpp
@@ -140,7 +140,6 @@ void ellipticBuildIpdgTri2D(elliptic_t* elliptic, int basisNp, dfloat* basis,
       }
     }
 
-
   // reset non-zero counter
   dlong nnz = 0;
 
@@ -415,7 +414,6 @@ void ellipticBuildIpdgTri3D(elliptic_t* elliptic, int basisNp, dfloat* basis,
       }
     }
 
-
   // reset non-zero counter
   dlong nnz = 0;
 
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildJacobi.c b/src/elliptic/ellipticBuildJacobi.cpp
similarity index 99%
rename from src/libP/solvers/elliptic/src/ellipticBuildJacobi.c
rename to src/elliptic/ellipticBuildJacobi.cpp
index b68f917f0..386f03a30 100644
--- a/src/libP/solvers/elliptic/src/ellipticBuildJacobi.c
+++ b/src/elliptic/ellipticBuildJacobi.cpp
@@ -85,11 +85,10 @@ void ellipticUpdateJacobi(elliptic_t* elliptic)
   oogs::startFinish(precon->o_invDiagA, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs);
 
   const dfloat one = 1.0;
-  if(elliptic->blockSolver) 
+  if(elliptic->blockSolver)
     elliptic->scalarDivideManyKernel(Nlocal, elliptic->Ntotal, one, precon->o_invDiagA);
   else
     elliptic->scalarDivideKernel(Nlocal, one, precon->o_invDiagA);
-  
 }
 
 void ellipticBuildJacobi(elliptic_t* elliptic, dfloat** invDiagA)
@@ -352,11 +351,10 @@ void BuildLocalContinuousDiagHex3D(elliptic_t* elliptic,
       }
 
   //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
+  if (elliptic->allNeumann)
     for(int n = 0; n < mesh->Np; ++n)
       if (elliptic->mapB[n + eM * mesh->Np] != 1) //dont fill rows for masked nodes
         A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale;
-  }
 }
 
 void BuildLocalContinuousBlockDiagHex3D(elliptic_t* elliptic,
@@ -443,12 +441,11 @@ void BuildLocalContinuousBlockDiagHex3D(elliptic_t* elliptic,
     //add the rank boost for the allNeumann Poisson problem
     for(int fld = 0; fld < elliptic->Nfields; fld++) {
       const dlong offset = fld * elliptic->Ntotal;
-      if (elliptic->allBlockNeumann[fld]) {
+      if (elliptic->allBlockNeumann[fld])
         for(int n = 0; n < mesh->Np; ++n)
           if (elliptic->mapB[n + eM * mesh->Np + offset] != 1) //dont fill rows for masked nodes
             A[n + eM * mesh->Np + offset] += elliptic->allNeumannPenalty *
                                              elliptic->allNeumannScale * elliptic->allNeumannScale;
-      }
     }
   }
 }
@@ -486,11 +483,10 @@ void BuildLocalContinuousDiagQuad2D(elliptic_t* elliptic, mesh_t* mesh, dlong eM
     }
 
   //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
+  if (elliptic->allNeumann)
     for(int n = 0; n < mesh->Np; ++n)
       if (elliptic->mapB[n + eM * mesh->Np] != 1) //dont fill rows for masked nodes
         A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale;
-  }
 }
 
 #if 0
@@ -522,10 +518,9 @@ void BuildLocalIpdgDiagTri2D(elliptic_t* elliptic,
   }
 
   //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
+  if (elliptic->allNeumann)
     for(int n = 0; n < mesh->Np; ++n)
       A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale;
-  }
 
   for (int fM = 0; fM < mesh->Nfaces; fM++) {
     // load surface geofactors for this face
@@ -644,10 +639,9 @@ void BuildLocalIpdgDiagTri3D(elliptic_t* elliptic,
   }
 
   //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
+  if (elliptic->allNeumann)
     for(int n = 0; n < mesh->Np; ++n)
       A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale;
-  }
 
   for (int fM = 0; fM < mesh->Nfaces; fM++) {
     // load surface geofactors for this face
@@ -794,11 +788,10 @@ void BuildLocalContinuousDiagTri2D(elliptic_t* elliptic,
   }
 
   //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
+  if (elliptic->allNeumann)
     for(int n = 0; n < mesh->Np; ++n)
       if (elliptic->mapB[n + eM * mesh->Np] != 1) //dont fill rows for masked nodes
         A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale;
-  }
 }
 
 void BuildLocalIpdgDiagQuad2D(elliptic_t* elliptic,
@@ -1049,10 +1042,9 @@ void BuildLocalIpdgDiagTet3D(elliptic_t* elliptic,
   }
 
   //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
+  if (elliptic->allNeumann)
     for(int n = 0; n < mesh->Np; ++n)
       A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale;
-  }
 
   for (int fM = 0; fM < mesh->Nfaces; fM++) {
     // load surface geofactors for this face
@@ -1181,11 +1173,10 @@ void BuildLocalContinuousDiagTet3D(elliptic_t* elliptic,
   }
 
   //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
+  if (elliptic->allNeumann)
     for(int n = 0; n < mesh->Np; ++n)
       if (elliptic->mapB[n + eM * mesh->Np] != 1) //dont fill rows for masked nodes
         A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale;
-  }
 }
 
 void BuildLocalIpdgDiagHex3D(elliptic_t* elliptic,
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c b/src/elliptic/ellipticBuildMultigridLevel.cpp
similarity index 84%
rename from src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c
rename to src/elliptic/ellipticBuildMultigridLevel.cpp
index 7e1b9c298..6ebec1609 100644
--- a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c
+++ b/src/elliptic/ellipticBuildMultigridLevel.cpp
@@ -126,30 +126,6 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
   setupAide options = elliptic->options;
 
   switch(elliptic->elementType) {
-  case TRIANGLES:
-    meshLoadReferenceNodesTri2D(mesh, Nc);
-    if(elliptic->dim == 2) {
-      meshPhysicalNodesTri2D(mesh);
-      meshGeometricFactorsTri2D(mesh);
-    }else{
-      meshPhysicalNodesTri3D(mesh);
-      meshGeometricFactorsTri3D(mesh);
-    }
-    break;
-  case QUADRILATERALS: {
-    meshLoadReferenceNodesQuad2D(mesh, Nc);
-    if(elliptic->dim == 2) {
-      meshPhysicalNodesQuad2D(mesh);
-      meshGeometricFactorsQuad2D(mesh);
-    }else{
-      meshPhysicalNodesQuad3D(mesh);
-      meshGeometricFactorsQuad3D(mesh);
-    }
-  } break;
-  case TETRAHEDRA:
-    meshLoadReferenceNodesTet3D(mesh, Nc);
-    meshPhysicalNodesTet3D(mesh);
-    break;
   case HEXAHEDRA:
     meshLoadReferenceNodesHex3D(mesh, Nc, 1);
     meshPhysicalNodesHex3D(mesh, buildOnly);
@@ -175,36 +151,6 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
   }
 
   switch(elliptic->elementType) {
-  case TRIANGLES:
-    if(elliptic->dim == 2)
-      meshConnectFaceNodes2D(mesh);
-    else
-      meshConnectFaceNodes3D(mesh);
-    break;
-  case QUADRILATERALS: {
-    if(elliptic->dim == 2) {
-      if(!options.compareArgs("BOX DOMAIN", "TRUE")) {
-        meshConnectFaceNodes2D(mesh);
-        meshSurfaceGeometricFactorsQuad2D(mesh);
-      }else {
-        if(mesh->rank == 0) printf("WARNING: connecting periodic box\n");
-        dfloat XMIN = -1, XMAX = +1; // default bi-unit cube
-        dfloat YMIN = -1, YMAX = +1;
-        options.getArgs("BOX XMIN", XMIN);
-        options.getArgs("BOX YMIN", YMIN);
-        options.getArgs("BOX XMAX", XMAX);
-        options.getArgs("BOX YMAX", YMAX);
-        meshConnectPeriodicFaceNodes2D(mesh, XMAX - XMIN, YMAX - YMIN);
-        meshSurfaceGeometricFactorsQuad2D(mesh);
-      }
-    }else{
-      meshConnectFaceNodes3D(mesh);
-      meshSurfaceGeometricFactorsQuad3D(mesh);
-    }
-  } break;
-  case TETRAHEDRA:
-    meshConnectFaceNodes3D(mesh);
-    break;
   case HEXAHEDRA:
 
     if(!options.compareArgs("BOX DOMAIN", "TRUE")) {
@@ -232,7 +178,7 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
   }
 
   // global nodes
-  meshParallelConnectNodes(mesh, 0, buildOnly);
+  meshParallelConnectNodes(mesh, buildOnly);
 
   //dont need these once vmap is made
   free(mesh->x);
@@ -242,8 +188,8 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
   free(sendBuffer);
 
   dlong Ntotal = mesh->Np * mesh->Nelements;
-  dlong Nblock = mymax(1,(Ntotal + blockSize - 1) / blockSize);
-  dlong Nblock2 = mymax(1,(Nblock + blockSize - 1) / blockSize);
+  dlong Nblock = mymax(1,(Ntotal + BLOCKSIZE - 1) / BLOCKSIZE);
+  dlong Nblock2 = mymax(1,(Nblock + BLOCKSIZE - 1) / BLOCKSIZE);
 
   elliptic->Nblock = Nblock;
   elliptic->Nblock2 = Nblock2;
@@ -338,7 +284,6 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
         PBq[j + i * mesh->Np] = mesh->PBq[i + j * mesh->cubNp];
       }
 
-
     for (int i = 0; i < mesh->Nfp; ++i)
       for (int j = 0; j < 3; ++j)
         L0vals[i + j * mesh->Nfp] = mesh->L0vals[j + i * 3];
@@ -756,36 +701,26 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
   //  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
 
   // set kernel name suffix
-  char* suffix;
-
-  if(elliptic->elementType == TRIANGLES) {
-    if(elliptic->dim == 2)
-      suffix = strdup("Tri2D");
-    else
-      suffix = strdup("Tri3D");
-  }
-  if(elliptic->elementType == QUADRILATERALS) {
-    if(elliptic->dim == 2)
-      suffix = strdup("Quad2D");
-    if(elliptic->dim == 3)
-      suffix = strdup("Quad3D");
-  }
-  if(elliptic->elementType == TETRAHEDRA)
-    suffix = strdup("Tet3D");
+  string suffix;
   if(elliptic->elementType == HEXAHEDRA)
-    suffix = strdup("Hex3D");
+    suffix = "Hex3D";
 
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  string filename, kernelName;
 
   MPI_Barrier(mesh->comm);
   double tStartLoadKernel = MPI_Wtime();
-  if(mesh->rank == 0)  printf("loading elliptic MG kernels ... "); fflush(stdout); 
+  if(mesh->rank == 0) printf("loading elliptic MG kernels ... ");
+  fflush(stdout);
+
+  string install_dir;
+  install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
+  const string oklpath = install_dir + "/okl/elliptic/";
 
   for (int r = 0; r < 2; r++) {
     MPI_Barrier(mesh->comm);
 
     if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
-      kernelInfo["defines/" "p_blockSize"] = blockSize;
+      kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
 
       // add custom defines
       kernelInfo["defines/" "p_NpP"] = (mesh->Np + mesh->Nfp * mesh->Nfaces);
@@ -797,21 +732,21 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
       int maxNodes = mymax(mesh->Np, (mesh->Nfp * mesh->Nfaces));
       kernelInfo["defines/" "p_maxNodes"] = maxNodes;
 
-      int NblockV = mymax(1,maxNthreads / mesh->Np); // works for CUDA
+      int NblockV = mymax(1,BLOCKSIZE / mesh->Np); // works for CUDA
       kernelInfo["defines/" "p_NblockV"] = NblockV;
 
       int one = 1; //set to one for now. TODO: try optimizing over these
       kernelInfo["defines/" "p_NnodesV"] = one;
 
-      int NblockS = mymax(1,maxNthreads / maxNodes); // works for CUDA
+      int NblockS = mymax(1,BLOCKSIZE / maxNodes); // works for CUDA
       kernelInfo["defines/" "p_NblockS"] = NblockS;
 
-      int NblockP = mymax(1,maxNthreads / (4 * mesh->Np)); // get close to maxNthreads threads
+      int NblockP = mymax(1,BLOCKSIZE / (4 * mesh->Np)); // get close to BLOCKSIZE threads
       kernelInfo["defines/" "p_NblockP"] = NblockP;
 
       int NblockG;
       if(mesh->Np <= 32) NblockG = ( 32 / mesh->Np );
-      else NblockG = mymax(1,maxNthreads / mesh->Np);
+      else NblockG = mymax(1,BLOCKSIZE / mesh->Np);
       kernelInfo["defines/" "p_NblockG"] = NblockG;
 
       kernelInfo["defines/p_Nalign"] = USE_OCCA_MEM_BYTE_ALIGN;
@@ -820,91 +755,63 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
       //add standard boundary functions
       char* boundaryHeaderFileName;
       if (elliptic->dim == 2)
-        boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h");
+        boundaryHeaderFileName = strdup(oklpath + "/data/ellipticBoundary2D.h");
       else if (elliptic->dim == 3)
-        boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h");
+        boundaryHeaderFileName = strdup(oklpath + "/data/ellipticBoundary3D.h");
       kernelInfo["includes"] += boundaryHeaderFileName;
-*/
+ */
 
       occa::properties AxKernelInfo = kernelInfo;
-      sprintf(fileName, DELLIPTIC "/okl/ellipticAx%s.okl", suffix);
-      sprintf(kernelName, "ellipticAx%s", suffix);
+      filename = oklpath + "ellipticAx" + suffix + ".okl";
+      kernelName = "ellipticAx" + suffix;
       if(serial) {
         AxKernelInfo["okl/enabled"] = false;
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticSerialAx%s.c", suffix);
+        filename = oklpath + "ellipticSerialAx" + suffix + ".c";
       }
-      elliptic->AxKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
-      if(!strstr(pfloatString,dfloatString)){
+      elliptic->AxKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
+      if(!strstr(pfloatString,dfloatString)) {
         AxKernelInfo["defines/" "dfloat"] = pfloatString;
-        sprintf(kernelName, "ellipticAx%s", suffix);
-        elliptic->AxPfloatKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
+        kernelName = "ellipticAx" + suffix;
+        elliptic->AxPfloatKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
         AxKernelInfo["defines/" "dfloat"] = dfloatString;
       }
 
       // check for trilinear
       if(elliptic->elementType != HEXAHEDRA) {
-        sprintf(kernelName, "ellipticPartialAx%s", suffix);
+        kernelName = "ellipticPartialAx" + suffix;
       }else {
         if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR"))
-          sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix);
+          kernelName = "ellipticPartialAxTrilinear" + suffix;
         else
-          sprintf(kernelName, "ellipticPartialAx%s", suffix);
+          kernelName = "ellipticPartialAx" + suffix;
       }
 
       if(!serial) {
-        elliptic->partialAxKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
+        elliptic->partialAxKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
         if(!strstr(pfloatString,dfloatString)) {
           AxKernelInfo["defines/" "dfloat"] = pfloatString;
-          elliptic->partialAxPfloatKernel = mesh->device.buildKernel(fileName, kernelName, AxKernelInfo);
+          elliptic->partialAxPfloatKernel =
+            mesh->device.buildKernel(filename.c_str(), kernelName.c_str(), AxKernelInfo);
           AxKernelInfo["defines/" "dfloat"] = dfloatString;
         }
       }
 
-/*
-      // only for Hex3D - cubature Ax
-      if(elliptic->elementType == HEXAHEDRA) {
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticCubatureAx%s.okl", suffix);
-
-        sprintf(kernelName, "ellipticCubaturePartialAx%s", suffix);
-        elliptic->partialCubatureAxKernel = mesh->device.buildKernel(fileName,
-                                                                     kernelName,
-                                                                     AxKernelInfo);
-      }
-*/
-
-      if (options.compareArgs("BASIS", "BERN")) {
-        sprintf(fileName, DELLIPTIC "/okl/ellipticGradientBB%s.okl", suffix);
-        sprintf(kernelName, "ellipticGradientBB%s", suffix);
+      if (options.compareArgs("BASIS", "NODAL")) {
+        filename = oklpath + "ellipticGradient" + suffix + ".okl";
+        kernelName = "ellipticGradient" + suffix;
 
-        elliptic->gradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-
-        sprintf(kernelName, "ellipticPartialGradientBB%s", suffix);
-        elliptic->partialGradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+        elliptic->gradientKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
 
+        kernelName = "ellipticPartialGradient" + suffix;
+        elliptic->partialGradientKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
 /*
-        sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdgBB%s.okl", suffix);
-        sprintf(kernelName, "ellipticAxIpdgBB%s", suffix);
-        elliptic->ipdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-
-        sprintf(kernelName, "ellipticPartialAxIpdgBB%s", suffix);
-        elliptic->partialIpdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-*/
-      } else if (options.compareArgs("BASIS", "NODAL")) {
-        sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix);
-        sprintf(kernelName, "ellipticGradient%s", suffix);
-
-        elliptic->gradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-
-        sprintf(kernelName, "ellipticPartialGradient%s", suffix);
-        elliptic->partialGradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-/*
-        sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdg%s.okl", suffix);
+        sprintf(fileName, oklpath + "ellipticAxIpdg%s.okl", suffix);
         sprintf(kernelName, "ellipticAxIpdg%s", suffix);
         elliptic->ipdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
 
         sprintf(kernelName, "ellipticPartialAxIpdg%s", suffix);
         elliptic->partialIpdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-*/
+ */
       }
     }
 
@@ -912,7 +819,8 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
   }
 
   MPI_Barrier(mesh->comm);
-  if(mesh->rank == 0)  printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); fflush(stdout);
+  if(mesh->rank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel);
+  fflush(stdout);
 
   //new precon struct
   elliptic->precon = new precon_t();
@@ -921,20 +829,20 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
     MPI_Barrier(mesh->comm);
 
     if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
-      sprintf(fileName, DELLIPTIC "/okl/ellipticBlockJacobiPrecon.okl");
-      sprintf(kernelName, "ellipticBlockJacobiPrecon");
+      filename = oklpath + "ellipticBlockJacobiPrecon.okl";
+      kernelName = "ellipticBlockJacobiPrecon";
       elliptic->precon->blockJacobiKernel =
-        mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+        mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
 
-      sprintf(kernelName, "ellipticPartialBlockJacobiPrecon");
-      elliptic->precon->partialblockJacobiKernel = mesh->device.buildKernel(fileName,
-                                                                            kernelName,
+      kernelName = "ellipticPartialBlockJacobiPrecon";
+      elliptic->precon->partialblockJacobiKernel = mesh->device.buildKernel(filename.c_str(),
+                                                                            kernelName.c_str(),
                                                                             kernelInfo);
 
-      sprintf(fileName, DELLIPTIC "/okl/ellipticPatchSolver.okl");
-      sprintf(kernelName, "ellipticApproxBlockJacobiSolver");
-      elliptic->precon->approxBlockJacobiSolverKernel = mesh->device.buildKernel(fileName,
-                                                                                 kernelName,
+      filename = oklpath + "ellipticPatchSolver.okl";
+      kernelName = "ellipticApproxBlockJacobiSolver";
+      elliptic->precon->approxBlockJacobiSolverKernel = mesh->device.buildKernel(filename.c_str(),
+                                                                                 kernelName.c_str(),
                                                                                  kernelInfo);
 
       //sizes for the coarsen and prolongation kernels. degree NFine to degree N
@@ -965,8 +873,8 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
       kernelInfo["defines/" "p_NpFine"] = NpFine;
       kernelInfo["defines/" "p_NpCoarse"] = NpCoarse;
 
-      int NblockVFine = maxNthreads / NpFine;
-      int NblockVCoarse = maxNthreads / NpCoarse;
+      int NblockVFine = BLOCKSIZE / NpFine;
+      int NblockVCoarse = BLOCKSIZE / NpCoarse;
       kernelInfo["defines/" "p_NblockVFine"] = NblockVFine;
       kernelInfo["defines/" "p_NblockVCoarse"] = NblockVCoarse;
 
@@ -978,13 +886,13 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
           suffix = strdup("Tri2D");
       }
 
-      sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix);
-      sprintf(kernelName, "ellipticPreconCoarsen%s", suffix);
-      elliptic->precon->coarsenKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+      filename = oklpath + "ellipticPreconCoarsen" + suffix + ".okl";
+      kernelName = "ellipticPreconCoarsen" + suffix;
+      elliptic->precon->coarsenKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
 
-      sprintf(fileName, DELLIPTIC "/okl/ellipticPreconProlongate%s.okl", suffix);
-      sprintf(kernelName, "ellipticPreconProlongate%s", suffix);
-      elliptic->precon->prolongateKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+      filename = oklpath + "ellipticPreconProlongate" + suffix + ".okl";
+      kernelName = "ellipticPreconProlongate" + suffix;
+      elliptic->precon->prolongateKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
     }
     MPI_Barrier(mesh->comm);
   }
@@ -1012,14 +920,14 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf
     mesh->o_DmatricesPfloat = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(pfloat));
     mesh->o_SmatricesPfloat = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(pfloat));
     elliptic->copyDfloatToPfloatKernel(mesh->Nelements * mesh->Np * mesh->Nggeo,
-      elliptic->mesh->o_ggeoPfloat,
-      mesh->o_ggeo);
+                                       elliptic->mesh->o_ggeoPfloat,
+                                       mesh->o_ggeo);
     elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq,
-      elliptic->mesh->o_DmatricesPfloat,
-      mesh->o_Dmatrices);
+                                       elliptic->mesh->o_DmatricesPfloat,
+                                       mesh->o_Dmatrices);
     elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq,
-      elliptic->mesh->o_SmatricesPfloat,
-      mesh->o_Smatrices);
+                                       elliptic->mesh->o_SmatricesPfloat,
+                                       mesh->o_Smatrices);
   }
 
   return elliptic;
diff --git a/src/elliptic/ellipticBuildMultigridLevelFine.cpp b/src/elliptic/ellipticBuildMultigridLevelFine.cpp
new file mode 100644
index 000000000..d46c5400d
--- /dev/null
+++ b/src/elliptic/ellipticBuildMultigridLevelFine.cpp
@@ -0,0 +1,139 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#include "elliptic.h"
+
+// Builds the finest multigrid level from baseElliptic: the solver struct is
+// copied bytewise, lambda is zeroed and var_coeff is cleared, pfloat copies of
+// mesh->o_ggeo, o_Dmatrices and o_Smatrices are made when pfloatString does not
+// contain dfloatString, and the Ax kernels are rebuilt from
+// $NEKRS_INSTALL_DIR/okl/elliptic/ (rank 0 first, other ranks after a barrier).
+// Aborts the MPI job when NEKRS_INSTALL_DIR is not set. Returns the new level.
+elliptic_t* ellipticBuildMultigridLevelFine(elliptic_t* baseElliptic)
+{
+  elliptic_t* elliptic = new elliptic_t();
+  // NOTE(review): bytewise copy is only safe if elliptic_t is trivially copyable - confirm
+  memcpy(elliptic, baseElliptic, sizeof(*baseElliptic));
+
+  const int serial = baseElliptic->options.compareArgs("THREAD MODEL", "SERIAL");
+
+  elliptic->var_coeff = 0;
+  elliptic->lambda = (dfloat*) calloc(elliptic->Nfields, sizeof(dfloat)); // enforce lambda = 0
+
+  mesh_t* mesh = elliptic->mesh;
+  // getenv() returns NULL for an unset variable; a std::string must not be built from NULL
+  const char* installDir = getenv("NEKRS_INSTALL_DIR");
+  if(!installDir) {
+    fprintf(stderr, "ERROR: NEKRS_INSTALL_DIR is not set!\n");
+    MPI_Abort(mesh->comm, EXIT_FAILURE);
+    exit(EXIT_FAILURE); // fallback in case MPI_Abort returns
+  }
+  const string oklpath = string(installDir) + "/okl/elliptic/";
+
+  if(!strstr(pfloatString,dfloatString)) {
+    mesh->o_ggeoPfloat = mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(pfloat));
+    mesh->o_DmatricesPfloat = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(pfloat));
+    mesh->o_SmatricesPfloat = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(pfloat));
+
+    elliptic->copyDfloatToPfloatKernel(mesh->Nelements * mesh->Np * mesh->Nggeo,
+                                       elliptic->mesh->o_ggeoPfloat, mesh->o_ggeo);
+    elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq,
+                                       elliptic->mesh->o_DmatricesPfloat, mesh->o_Dmatrices);
+    elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq,
+                                       elliptic->mesh->o_SmatricesPfloat, mesh->o_Smatrices);
+  }
+
+  occa::properties kernelInfo = ellipticKernelInfo(mesh);
+  const string suffix = (elliptic->elementType == HEXAHEDRA) ? "Hex3D" : ""; // empty for other element types
+
+  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  // add custom defines
+  kernelInfo["defines/" "p_NpP"] = (mesh->Np + mesh->Nfp * mesh->Nfaces);
+  kernelInfo["defines/" "p_Nverts"] = mesh->Nverts;
+
+  int Nmax = mymax(mesh->Np, mesh->Nfaces * mesh->Nfp);
+  kernelInfo["defines/" "p_Nmax"] = Nmax;
+  int maxNodes = mymax(mesh->Np, (mesh->Nfp * mesh->Nfaces));
+  kernelInfo["defines/" "p_maxNodes"] = maxNodes;
+
+  int NblockV = mymax(1,BLOCKSIZE / mesh->Np);
+  kernelInfo["defines/" "p_NblockV"] = NblockV;
+
+  int one = 1; //set to one for now. TODO: try optimizing over these
+  kernelInfo["defines/" "p_NnodesV"] = one;
+
+  int NblockS = mymax(1,BLOCKSIZE / maxNodes);
+  kernelInfo["defines/" "p_NblockS"] = NblockS;
+
+  int NblockP = mymax(1,BLOCKSIZE / (4 * mesh->Np)); // get close to BLOCKSIZE threads
+  kernelInfo["defines/" "p_NblockP"] = NblockP;
+
+  int NblockG = (mesh->Np <= 32) ? (32 / mesh->Np) : mymax(1,BLOCKSIZE / mesh->Np);
+  kernelInfo["defines/" "p_NblockG"] = NblockG;
+
+  kernelInfo["defines/" "p_eNfields"] = elliptic->Nfields;
+  kernelInfo["defines/p_Nalign"] = USE_OCCA_MEM_BYTE_ALIGN;
+
+  string filename, kernelName;
+  for (int r = 0; r < 2; r++) { // pass 0: rank 0 builds; pass 1: every other rank builds
+    if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
+      occa::properties AxKernelInfo = kernelInfo;
+
+      filename = oklpath + "ellipticAx" + suffix + ".okl";
+      kernelName = "ellipticAx" + suffix;
+      if(serial) {
+        AxKernelInfo["okl/enabled"] = false;
+        filename = oklpath + "ellipticSerialAx" + suffix + ".c";
+      }
+      elliptic->AxKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
+
+      if(!strstr(pfloatString,dfloatString)) {
+        AxKernelInfo["defines/" "dfloat"] = pfloatString; // same kernel, compiled with dfloat -> pfloat
+        elliptic->AxPfloatKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
+        AxKernelInfo["defines/" "dfloat"] = dfloatString; // restore for the builds below
+      }
+
+      if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR"))
+        kernelName = "ellipticPartialAxTrilinear" + suffix;
+      else
+        kernelName = "ellipticPartialAx" + suffix;
+
+      if(!serial) {
+        elliptic->partialAxKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
+        if(!strstr(pfloatString,dfloatString)) {
+          AxKernelInfo["defines/" "dfloat"] = pfloatString;
+          elliptic->partialAxPfloatKernel =
+            mesh->device.buildKernel(filename.c_str(), kernelName.c_str(), AxKernelInfo);
+          AxKernelInfo["defines/" "dfloat"] = dfloatString;
+        }
+      }
+    }
+
+    MPI_Barrier(mesh->comm); // all ranks wait here at the end of each pass
+  }
+
+  return elliptic;
+}
diff --git a/src/libP/solvers/elliptic/src/ellipticHaloExchange.c b/src/elliptic/ellipticHaloExchange.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/ellipticHaloExchange.c
rename to src/elliptic/ellipticHaloExchange.cpp
diff --git a/src/libP/solvers/elliptic/src/ellipticKernelInfo.c b/src/elliptic/ellipticKernelInfo.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/ellipticKernelInfo.c
rename to src/elliptic/ellipticKernelInfo.cpp
diff --git a/src/libP/solvers/elliptic/ellipticMultiGrid.h b/src/elliptic/ellipticMultiGrid.h
similarity index 100%
rename from src/libP/solvers/elliptic/ellipticMultiGrid.h
rename to src/elliptic/ellipticMultiGrid.h
diff --git a/src/libP/solvers/elliptic/src/ellipticMultiGridLevel.c b/src/elliptic/ellipticMultiGridLevel.cpp
similarity index 99%
rename from src/libP/solvers/elliptic/src/ellipticMultiGridLevel.c
rename to src/elliptic/ellipticMultiGridLevel.cpp
index 611b3c312..0a6555627 100644
--- a/src/libP/solvers/elliptic/src/ellipticMultiGridLevel.c
+++ b/src/elliptic/ellipticMultiGridLevel.cpp
@@ -65,7 +65,7 @@ void MGLevel::prolongate(occa::memory o_x, occa::memory o_Px)
 
 void MGLevel::smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero)
 {
-  if(!strstr(pfloatString,dfloatString)){
+  if(!strstr(pfloatString,dfloatString)) {
     elliptic->copyDfloatToPfloatKernel(Nrows, o_xPfloat, o_x);
     elliptic->copyDfloatToPfloatKernel(Nrows, o_rhsPfloat, o_rhs);
     if (stype == RICHARDSON)
@@ -144,7 +144,6 @@ void MGLevel::smoothChebyshevOneIteration (occa::memory &o_r, occa::memory &o_x,
     //res = Sr
     this->smoother(o_r, o_res, xIsZero);
     elliptic->updateSmoothedSolutionVecKernel(Nrows, invTheta, o_res, one, o_d, zero, o_x);
-
   } else {
     //res = S(r-Ax)
     this->Ax(o_x,o_res);
@@ -160,9 +159,10 @@ void MGLevel::smoothChebyshevOneIteration (occa::memory &o_r, occa::memory &o_x,
   pfloat rhoDivDelta = 2.0 * rho_np1 / delta;
   elliptic->updateChebyshevSolutionVecKernel(Nrows, rhoDivDelta, rho_np1, rho_n, o_Ad, o_res, o_d, o_x);
 }
+
 void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZero)
 {
-  if(ChebyshevIterations == 1){
+  if(ChebyshevIterations == 1) {
     smoothChebyshevOneIteration(o_r,o_x,xIsZero);
     return;
   }
diff --git a/src/libP/solvers/elliptic/src/ellipticMultiGridLevelSetup.c b/src/elliptic/ellipticMultiGridLevelSetup.cpp
similarity index 95%
rename from src/libP/solvers/elliptic/src/ellipticMultiGridLevelSetup.c
rename to src/elliptic/ellipticMultiGridLevelSetup.cpp
index 07f517035..3656b308f 100644
--- a/src/libP/solvers/elliptic/src/ellipticMultiGridLevelSetup.c
+++ b/src/elliptic/ellipticMultiGridLevelSetup.cpp
@@ -56,9 +56,8 @@ MGLevel::MGLevel(elliptic_t* ellipticBase, dfloat lambda_, int Nc,
 
   this->setupSmoother(ellipticBase);
 
-  o_xPfloat = mesh->device.malloc(Nrows*sizeof(pfloat));
-  o_rhsPfloat = mesh->device.malloc(Nrows*sizeof(pfloat));
-
+  o_xPfloat = mesh->device.malloc(Nrows * sizeof(pfloat));
+  o_rhsPfloat = mesh->device.malloc(Nrows * sizeof(pfloat));
 }
 
 //build a level and connect it to the previous one
@@ -102,8 +101,8 @@ MGLevel::MGLevel(elliptic_t* ellipticBase, //finest level
   else
     this->buildCoarsenerQuadHex(meshLevels, Nf, Nc);
 
-  o_xPfloat = mesh->device.malloc(Nrows*sizeof(pfloat));
-  o_rhsPfloat = mesh->device.malloc(Nrows*sizeof(pfloat));
+  o_xPfloat = mesh->device.malloc(Nrows * sizeof(pfloat));
+  o_rhsPfloat = mesh->device.malloc(Nrows * sizeof(pfloat));
 }
 
 void MGLevel::setupSmoother(elliptic_t* ellipticBase)
@@ -111,7 +110,7 @@ void MGLevel::setupSmoother(elliptic_t* ellipticBase)
   if (degree == 1) return; // solved by coarse grid solver
 
   if (options.compareArgs("MULTIGRID SMOOTHER","ASM") ||
-             options.compareArgs("MULTIGRID SMOOTHER","RAS")) {
+      options.compareArgs("MULTIGRID SMOOTHER","RAS")) {
     stype = SCHWARZ;
     smtypeUp = JACOBI;
     smtypeDown = JACOBI;
@@ -131,11 +130,10 @@ void MGLevel::setupSmoother(elliptic_t* ellipticBase)
     if(options.compareArgs("MULTIGRID DOWNWARD SMOOTHER","JACOBI") ||
        options.compareArgs("MULTIGRID UPWARD SMOOTHER","JACOBI")) {
       dfloat* invDiagA;
-      std::vector<pfloat> casted_invDiagA(mesh->Np*mesh->Nelements, 0.0);
+      std::vector<pfloat> casted_invDiagA(mesh->Np * mesh->Nelements, 0.0);
       ellipticBuildJacobi(elliptic,&invDiagA);
-      for(dlong i = 0 ; i < mesh->Np*mesh->Nelements; ++i){
+      for(dlong i = 0; i < mesh->Np * mesh->Nelements; ++i)
         casted_invDiagA[i] = static_cast<pfloat>(invDiagA[i]);
-      }
       o_invDiagA = mesh->device.malloc(mesh->Np * mesh->Nelements * sizeof(pfloat), casted_invDiagA.data());
       if(options.compareArgs("MULTIGRID UPWARD SMOOTHER","JACOBI"))
         smtypeUp = JACOBI;
@@ -147,10 +145,9 @@ void MGLevel::setupSmoother(elliptic_t* ellipticBase)
     smtypeDown = JACOBI;
     dfloat* invDiagA;
     ellipticBuildJacobi(elliptic,&invDiagA);
-    std::vector<pfloat> casted_invDiagA(mesh->Np*mesh->Nelements, 0.0);
-    for(dlong i = 0 ; i < mesh->Np*mesh->Nelements; ++i){
+    std::vector<pfloat> casted_invDiagA(mesh->Np * mesh->Nelements, 0.0);
+    for(dlong i = 0; i < mesh->Np * mesh->Nelements; ++i)
       casted_invDiagA[i] = static_cast<pfloat>(invDiagA[i]);
-    }
 
     o_invDiagA = mesh->device.malloc(mesh->Np * mesh->Nelements * sizeof(pfloat), casted_invDiagA.data());
 
@@ -200,7 +197,7 @@ void MGLevel::Report()
   MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, mesh->comm);
 
   char smootherString[BUFSIZ];
-  if (degree != 1){
+  if (degree != 1) {
     if (stype == RICHARDSON && smtypeDown == JACOBI)
       strcpy(smootherString, "Damped Jacobi    ");
     else if (stype == CHEBYSHEV && smtypeDown == JACOBI)
@@ -388,7 +385,7 @@ dfloat MGLevel::maxEigSmoothAx()
     ellipticOperator(elliptic,o_V[j],o_AVx,dfloatString);
     elliptic->copyDfloatToPfloatKernel(M, o_AVxPfloat, o_AVx);
     this->smoother(o_AVxPfloat, o_VxPfloat, true);
-    elliptic->copyPfloatToDPfloatKernel(M, o_VxPfloat, o_V[j+1]);
+    elliptic->copyPfloatToDPfloatKernel(M, o_VxPfloat, o_V[j + 1]);
 
     // modified Gram-Schmidth
     for(int i = 0; i <= j; i++) {
diff --git a/src/libP/solvers/elliptic/src/ellipticMultiGridSchwarz.c b/src/elliptic/ellipticMultiGridSchwarz.cpp
similarity index 93%
rename from src/libP/solvers/elliptic/src/ellipticMultiGridSchwarz.c
rename to src/elliptic/ellipticMultiGridSchwarz.cpp
index f4f53f943..4a10935bb 100644
--- a/src/libP/solvers/elliptic/src/ellipticMultiGridSchwarz.c
+++ b/src/elliptic/ellipticMultiGridSchwarz.cpp
@@ -35,22 +35,22 @@
 
 struct ElementLengths
 {
-  dfloat * length_left_x;
-  dfloat * length_left_y;
-  dfloat * length_left_z;
-  dfloat * length_middle_x;
-  dfloat * length_middle_y;
-  dfloat * length_middle_z;
-  dfloat * length_right_x;
-  dfloat * length_right_y;
-  dfloat * length_right_z;
+  dfloat* length_left_x;
+  dfloat* length_left_y;
+  dfloat* length_left_z;
+  dfloat* length_middle_x;
+  dfloat* length_middle_y;
+  dfloat* length_middle_z;
+  dfloat* length_right_x;
+  dfloat* length_right_y;
+  dfloat* length_right_z;
 };
 struct FDMOperators
 {
-  dfloat * Sx;
-  dfloat * Sy;
-  dfloat * Sz;
-  dfloat * D;
+  dfloat* Sx;
+  dfloat* Sy;
+  dfloat* Sz;
+  dfloat* D;
 };
 void harmonic_mean_element_length(
   ElementLengths* lengths,
@@ -133,6 +133,7 @@ void harmonic_mean_element_length(
     lengths->length_middle_z[e] = 1.0 / sqrt(lt2);
   }
 }
+
 void
 compute_element_lengths(ElementLengths* lengths, elliptic_t* elliptic)
 {
@@ -180,8 +181,8 @@ compute_element_lengths(ElementLengths* lengths, elliptic_t* elliptic)
     }
   }
 
-  dfloat * l = (dfloat*) calloc(mesh->Np * Nelements, sizeof(dfloat));
-  for(unsigned i = 0 ; i < mesh->Np * Nelements; ++i)
+  dfloat* l = (dfloat*) calloc(mesh->Np * Nelements, sizeof(dfloat));
+  for(unsigned i = 0; i < mesh->Np * Nelements; ++i)
     l[i] = 0.0;
 
   for(dlong e = 0; e < Nelements; ++e) {
@@ -234,6 +235,7 @@ compute_element_lengths(ElementLengths* lengths, elliptic_t* elliptic)
   }
   free(l);
 }
+
 void compute_element_boundary_conditions(int* lbr,
                                          int* rbr,
                                          int* lbs,
@@ -258,6 +260,7 @@ void compute_element_boundary_conditions(int* lbr,
   *lbt = fbc[4];
   *rbt = fbc[5];
 }
+
 void row_zero(
   dfloat* S,
   const int nl,
@@ -267,8 +270,9 @@ void row_zero(
   for(int i = 0; i < nl; ++i)
     S[offset + nl * i] = 0.0;
 }
+
 void compute_1d_stiffness_matrix(
-  dfloat * a,
+  dfloat* a,
   const int lbc,
   const int rbc,
   const double ll,
@@ -284,7 +288,7 @@ void compute_1d_stiffness_matrix(
   const int n =  elliptic->mesh->N;
   const int nl = n + 3;
   dfloat* ah = (dfloat*) calloc((n + 1) * (n + 1), sizeof(dfloat));
-  dfloat* tmp = (dfloat*) calloc((n + 1)* (n + 1), sizeof(dfloat));
+  dfloat* tmp = (dfloat*) calloc((n + 1) * (n + 1), sizeof(dfloat));
   for(int i = 0; i < n + 1; ++i)
     for(int j = 0; j < n + 1; ++j)
       tmp[i * (n + 1) + j] = elliptic->mesh->D[i * (n + 1) + j];
@@ -307,7 +311,7 @@ void compute_1d_stiffness_matrix(
   int i1 = n;
   if(rbc == 1) i1 = n - 1;
 
-  for(unsigned int i = 0 ; i < nl*nl; ++i)
+  for(unsigned int i = 0; i < nl * nl; ++i)
     a[i] = 0.0;
   double fac = 2.0 / lm;
   a(1,1) = 1.0;
@@ -336,8 +340,9 @@ void compute_1d_stiffness_matrix(
 #undef a
 #undef ah
 }
+
 void compute_1d_mass_matrix(
-  dfloat * b,
+  dfloat* b,
   const int lbc,
   const int rbc,
   const double ll,
@@ -356,7 +361,7 @@ void compute_1d_mass_matrix(
   int i1 = n;
   if(rbc == 1) i1 = n - 1;
 
-  for(unsigned int i = 0 ; i < nl*nl; ++i)
+  for(unsigned int i = 0; i < nl * nl; ++i)
     b[i] = 0.0;
 
   double fac = 0.5 * lm;
@@ -380,6 +385,7 @@ void compute_1d_mass_matrix(
   }
 #undef b
 }
+
 extern "C"
 {
 void dsygv_ (
@@ -412,9 +418,9 @@ void ssygv_ (
   );
 }
 void solve_generalized_ev(
-  dfloat * a,
-  dfloat * b,
-  dfloat * lam,
+  dfloat* a,
+  dfloat* b,
+  dfloat* lam,
   int n
   )
 {
@@ -425,16 +431,16 @@ void solve_generalized_ev(
   char JOBZ = 'V';
   char UPLO = 'U';
   // copy of A, B in case anything goes wrong
-  dfloat * a_copy = (dfloat*) calloc(n*n, sizeof(dfloat));
-  dfloat * b_copy = (dfloat*) calloc(n*n, sizeof(dfloat));
-  for(unsigned i = 0 ; i < n*n; ++i){
+  dfloat* a_copy = (dfloat*) calloc(n * n, sizeof(dfloat));
+  dfloat* b_copy = (dfloat*) calloc(n * n, sizeof(dfloat));
+  for(unsigned i = 0; i < n * n; ++i) {
     a_copy[i] = a[i];
     b_copy[i] = b[i];
   }
 #ifdef DFLOAT_DOUBLE
-    dsygv_(&itype,&JOBZ,&UPLO,&n,a,&n,b, &n, lam, work_arr, &worksize, &info);
+  dsygv_(&itype,&JOBZ,&UPLO,&n,a,&n,b, &n, lam, work_arr, &worksize, &info);
 #else
-    ssygv_(&itype,&JOBZ,&UPLO,&n,a,&n,b, &n, lam, work_arr, &worksize, &info);
+  ssygv_(&itype,&JOBZ,&UPLO,&n,a,&n,b, &n, lam, work_arr, &worksize, &info);
 #endif
   if(info != 0) {
     std::ostringstream err_logger;
@@ -444,12 +450,12 @@ void solve_generalized_ev(
     } else {
       if(info <= n) {
         err_logger <<
-        "DSYEV failed to converge, as i off-diagonal elements of an intermediate tridiagonal form did not converge to zero\n";
+          "DSYEV failed to converge, as i off-diagonal elements of an intermediate tridiagonal form did not converge to zero\n";
       } else {
         info -= n;
         err_logger << "The leading minor of order " << info << " of B is not positive definite.\n"
                    <<
-        "The factorization of B could not be completed and no eigenvalues/eigenvectors were computed.\n";
+          "The factorization of B could not be completed and no eigenvalues/eigenvectors were computed.\n";
       }
     }
 
@@ -472,9 +478,10 @@ void solve_generalized_ev(
   free(a_copy);
   free(b_copy);
 }
+
 void compute_1d_matrices(
-  dfloat * S,
-  dfloat * lam,
+  dfloat* S,
+  dfloat* lam,
   const int lbc,
   const int rbc,
   const double ll,
@@ -511,6 +518,7 @@ void compute_1d_matrices(
     row_zero(S,nl,nl - 2);
   free(b);
 }
+
 void gen_operators(FDMOperators* op, ElementLengths* lengths, elliptic_t* elliptic)
 {
   const int Nq_e = elliptic->mesh->Nq + 2;
@@ -631,7 +639,7 @@ mesh_t* create_extended_mesh(elliptic_t* elliptic)
 
   meshHaloSetup(mesh);
   meshConnectFaceNodes3D(mesh);
-  meshParallelConnectNodes(mesh, 0, buildOnly);
+  meshParallelConnectNodes(mesh, buildOnly);
   mesh->ogs = ogsSetup(mesh->Nelements * mesh->Np, mesh->globalIds, mesh->comm, 1, mesh->device);
 
   const int bigNum = 1E9;
@@ -676,6 +684,7 @@ mesh_t* create_extended_mesh(elliptic_t* elliptic)
 
   return mesh;
 }
+
 // convenience function
 void to_reg(pfloat* arr1,
             pfloat* arr2,
@@ -695,6 +704,7 @@ void to_reg(pfloat* arr1,
 #undef arr1
 #undef arr2
 }
+
 void extrude(pfloat* arr1,
              const int l1,
              const pfloat f1,
@@ -735,6 +745,7 @@ void extrude(pfloat* arr1,
 #undef arr1
 #undef arr2
 }
+
 void MGLevel::generate_weights()
 {
   const pfloat one = 1.0;
@@ -770,6 +781,7 @@ void MGLevel::generate_weights()
   free(work2);
   free(wts);
 }
+
 void MGLevel::build(
   elliptic_t* pSolver)
 {
@@ -792,19 +804,19 @@ void MGLevel::build(
   const int Np_e = extendedMesh->Np;
   const dlong Nlocal_e = Nelements * Np_e;
 
-  oogs_mode oogsMode = OOGS_AUTO; 
+  oogs_mode oogsMode = OOGS_AUTO;
   if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT;
 
-  extendedOgs = (void*) oogs::setup(Nelements * Np_e, extendedMesh->maskedGlobalIds, 1, 0, 
+  extendedOgs = (void*) oogs::setup(Nelements * Np_e, extendedMesh->maskedGlobalIds, 1, 0,
                                     ogsPfloat, extendedMesh->comm, 1, extendedMesh->device,
                                     NULL, oogsMode);
   meshFree(extendedMesh);
 
 /*
-  ogs = (void*) oogs::setup(Nelements * Np, elliptic->mesh->maskedGlobalIds, 1, 0,
+   ogs = (void*) oogs::setup(Nelements * Np, elliptic->mesh->maskedGlobalIds, 1, 0,
                             ogsPfloat, elliptic->mesh->comm, 1, elliptic->mesh->device,
                             NULL, oogsMode);
-*/
+ */
 
   ogs = (void*) elliptic->oogs;
 
@@ -843,7 +855,6 @@ void MGLevel::build(
   free(lengths->length_right_z);
   free(lengths);
 
-
   const dlong weightSize = Np * Nelements;
   o_Sx = mesh->device.malloc  (Nq_e * Nq_e * Nelements * sizeof(pfloat));
   o_Sy = mesh->device.malloc  (Nq_e * Nq_e * Nelements * sizeof(pfloat));
@@ -864,13 +875,17 @@ void MGLevel::build(
   free(casted_Sz);
   free(casted_D);
 
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  string install_dir;
+  install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
+  const string oklpath = install_dir + "/okl/elliptic/";
+  string filename, kernelName;
+
   for (int r = 0; r < 2; r++) {
     if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
       occa::properties properties;
       properties += mesh->device.properties();
       properties["defines/p_Nq_e"] = Nq_e;
-      properties["defines/p_threadBlockSize"] = 256;
+      properties["defines/p_threadBlockSize"] = BLOCKSIZE;
       properties["defines/p_Nq"] = Nq;
       properties["defines/pfloat"] = pfloatString;
       properties["defines/dfloat"] = dfloatString;
@@ -880,18 +895,20 @@ void MGLevel::build(
       if(options.compareArgs("MULTIGRID SMOOTHER","RAS"))
         properties["defines/p_restrict"] = 1;
 
-      sprintf(fileName, DELLIPTIC "/okl/ellipticSchwarzSolverHex3D.okl");
-      preFDMKernel = mesh->device.buildKernel(fileName, "preFDM", properties);
-      fusedFDMKernel = mesh->device.buildKernel(fileName, "fusedFDM", properties);
-      postFDMKernel = mesh->device.buildKernel(fileName, "postFDM", properties);
-      collocateKernel = mesh->device.buildKernel(fileName, "collocate", properties);
+      filename = oklpath + "ellipticSchwarzSolverHex3D.okl";
+      preFDMKernel = mesh->device.buildKernel(filename.c_str(), "preFDM", properties);
+      fusedFDMKernel = mesh->device.buildKernel(filename.c_str(), "fusedFDM", properties);
+      postFDMKernel = mesh->device.buildKernel(filename.c_str(), "postFDM", properties);
+      collocateKernel = mesh->device.buildKernel(filename.c_str(), "collocate", properties);
     }
     MPI_Barrier(mesh->comm);
   }
 }
+
 void MGLevel::smoothSchwarz(occa::memory& o_u, occa::memory& o_Su, bool xIsZero)
 {
-  const char* ogsDataTypeString = (strstr(ogsPfloat,"float") && options.compareArgs("ENABLE FLOATCOMMHALF GS SUPPORT","TRUE")) ?
+  const char* ogsDataTypeString =
+    (strstr(ogsPfloat,"float") && options.compareArgs("ENABLE FLOATCOMMHALF GS SUPPORT","TRUE")) ?
     ogsFloatCommHalf : ogsPfloat;
   if(xIsZero) {
     const dlong Nelements = elliptic->mesh->Nelements;
diff --git a/src/libP/solvers/elliptic/src/ellipticMultiGridSetup.c b/src/elliptic/ellipticMultiGridSetup.cpp
similarity index 84%
rename from src/libP/solvers/elliptic/src/ellipticMultiGridSetup.c
rename to src/elliptic/ellipticMultiGridSetup.cpp
index 3e477f3f2..85d8b0f56 100644
--- a/src/libP/solvers/elliptic/src/ellipticMultiGridSetup.c
+++ b/src/elliptic/ellipticMultiGridSetup.cpp
@@ -44,15 +44,6 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon)
     meshLevels[n]->Nfields = mesh->Nfields; // TW: ahem
 
     switch(elliptic->elementType) {
-    case TRIANGLES:
-      meshLoadReferenceNodesTri2D(meshLevels[n], n);
-      break;
-    case QUADRILATERALS:
-      meshLoadReferenceNodesQuad2D(meshLevels[n], n);
-      break;
-    case TETRAHEDRA:
-      meshLoadReferenceNodesTet3D(meshLevels[n], n);
-      break;
     case HEXAHEDRA:
       meshLoadReferenceNodesHex3D(meshLevels[n], n, 1);
       break;
@@ -68,42 +59,9 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon)
     levelDegree = (int*) calloc(numMGLevels,sizeof(int));
     for(int i = 0; i < numMGLevels; ++i)
       levelDegree[i] = elliptic->levels[i];
-  }else if (options.compareArgs("MULTIGRID COARSENING","ALLDEGREES")) {
-    numMGLevels = mesh->N;
-    levelDegree = (int*) calloc(numMGLevels,sizeof(int));
-    for (int n = 0; n < numMGLevels; n++) levelDegree[n] = mesh->N - n; //all degrees
-  }else if (options.compareArgs("MULTIGRID COARSENING","HALFDEGREES")) {
-    numMGLevels = floor(mesh->N / 2.) + 1;
-    levelDegree = (int*) calloc(numMGLevels,sizeof(int));
-    for (int n = 0; n < numMGLevels; n++) levelDegree[n] = mesh->N - 2 * n; //decrease by two
-    levelDegree[numMGLevels - 1] = 1; //ensure the last level is degree 1
-  }else {  //default "HALFDOFS"
-    // pick the degrees so the dofs of each level halfs (roughly)
-    //start by counting the number of levels neccessary
-    numMGLevels = 1;
-    int degree = mesh->N;
-    int dofs = meshLevels[degree]->Np;
-    int basedofs = mesh->Nverts;
-    while (dofs > basedofs) {
-      numMGLevels++;
-      for (; degree > 0; degree--)
-        if (meshLevels[degree]->Np <= dofs / 2)
-          break;
-      dofs = meshLevels[degree]->Np;
-    }
-    levelDegree = (int*) calloc(numMGLevels,sizeof(int));
-    degree = mesh->N;
-    numMGLevels = 1;
-    levelDegree[0] = degree;
-    dofs = meshLevels[degree]->Np;
-    while (dofs > basedofs) {
-      for (; degree > 0; degree--)
-        if (meshLevels[degree]->Np <= dofs / 2)
-          break;
-      dofs = meshLevels[degree]->Np;
-      levelDegree[numMGLevels] = degree;
-      numMGLevels++;
-    }
+  } else {
+    cout << "Unknown coarsening type!" << endl; // flush first: MPI_Abort is not guaranteed to flush stdio
+    MPI_Abort(mesh->comm, 1);
   }
 
   int Nmax = levelDegree[0];
@@ -123,10 +81,10 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon)
       printf("=============BUILDING MULTIGRID LEVEL OF DEGREE %d==================\n", Nmax);
 
     auto callback = [&]()
-      {
-        ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList,
-                   elliptic->o_p, elliptic->o_Ap, pfloatString);
-      };
+                    {
+                      ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList,
+                                 elliptic->o_p, elliptic->o_Ap, pfloatString);
+                    };
     elliptic->oogs   = oogs::setup(elliptic->ogs, 1, 0, ogsPfloat, NULL, oogsMode);
     elliptic->oogsAx = oogs::setup(elliptic->ogs, 1, 0, ogsPfloat, callback, oogsMode);
 
@@ -148,10 +106,14 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon)
     elliptic_t* ellipticC = ellipticBuildMultigridLevel(elliptic,Nc,Nf);
 
     auto callback = [&]()
-      {
-        ellipticAx(ellipticC, ellipticC->mesh->NlocalGatherElements, ellipticC->mesh->o_localGatherElementList,
-                   ellipticC->o_p, ellipticC->o_Ap, pfloatString);
-      };
+                    {
+                      ellipticAx(ellipticC,
+                                 ellipticC->mesh->NlocalGatherElements,
+                                 ellipticC->mesh->o_localGatherElementList,
+                                 ellipticC->o_p,
+                                 ellipticC->o_Ap,
+                                 pfloatString);
+                    };
     ellipticC->oogs   = oogs::setup(ellipticC->ogs, 1, 0, ogsPfloat, NULL, oogsMode);
     ellipticC->oogsAx = oogs::setup(ellipticC->ogs, 1, 0, ogsPfloat, callback, oogsMode);
 
@@ -186,10 +148,14 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon)
     ellipticCoarse = ellipticBuildMultigridLevel(elliptic,Nc,Nf);
 
     auto callback = [&]()
-      {
-        ellipticAx(ellipticCoarse, ellipticCoarse->mesh->NlocalGatherElements, ellipticCoarse->mesh->o_localGatherElementList,
-                   ellipticCoarse->o_p, ellipticCoarse->o_Ap, pfloatString);
-      };
+                    {
+                      ellipticAx(ellipticCoarse,
+                                 ellipticCoarse->mesh->NlocalGatherElements,
+                                 ellipticCoarse->mesh->o_localGatherElementList,
+                                 ellipticCoarse->o_p,
+                                 ellipticCoarse->o_Ap,
+                                 pfloatString);
+                    };
     ellipticCoarse->oogs   = oogs::setup(ellipticCoarse->ogs, 1, 0, ogsPfloat, NULL, oogsMode);
     //ellipticCoarse->oogsAx = oogs::setup(ellipticCoarse->ogs, 1, 0, ogsPfloat, callback, oogsMode);
   } else {
diff --git a/src/libP/solvers/elliptic/src/ellipticOperator.c b/src/elliptic/ellipticOperator.cpp
similarity index 88%
rename from src/libP/solvers/elliptic/src/ellipticOperator.c
rename to src/elliptic/ellipticOperator.cpp
index 286e5078e..1826ba458 100644
--- a/src/libP/solvers/elliptic/src/ellipticOperator.c
+++ b/src/elliptic/ellipticOperator.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "elliptic.h"
-#include "ogsInterface.h"
+//#include "ogsInterface.h"
 #include <iostream>
 
 #include "omp.h"
@@ -50,21 +50,21 @@ void ellipticAx(elliptic_t* elliptic,
   {
     bool valid = true;
     valid &= continuous;
-    if(!strstr(precision, dfloatString)){
+    if(!strstr(precision, dfloatString)) {
       valid &= !elliptic->var_coeff;
       valid &= !elliptic->blockSolver;
-      if(!serial){
+      if(!serial) {
         valid &= mapType == 0;
         valid &= integrationType == 0;
       }
     }
-    if(!valid){
+    if(!valid) {
       printf("Encountered invalid configuration inside ellipticAx!\n");
       if(elliptic->var_coeff)
         printf("Precision level (%s) does not support variable coefficient\n", precision);
       if(elliptic->blockSolver)
         printf("Precision level (%s) does not support block solver\n", precision);
-      if(!serial){
+      if(!serial) {
         if(mapType != 0)
           printf("Precision level (%s) does not support mapType %d\n", precision, mapType);
         if(integrationType != 0)
@@ -77,36 +77,33 @@ void ellipticAx(elliptic_t* elliptic,
   if(serial) {
     if(continuous) {
       if(elliptic->var_coeff) {
-        if(elliptic->blockSolver){
+        if(elliptic->blockSolver) {
           occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo;
-          if(!elliptic->stressForm){
+          if(!elliptic->stressForm)
             elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors,
                                mesh->o_Dmatrices, mesh->o_Smatrices, elliptic->o_lambda,
                                o_q, o_Aq);
-          } else {
+          else
             elliptic->AxStressKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors,
-                               mesh->o_Dmatrices, mesh->o_Smatrices, elliptic->o_lambda,
-                               o_q, o_Aq);
-          }
-        }
-        else
+                                     mesh->o_Dmatrices, mesh->o_Smatrices, elliptic->o_lambda,
+                                     o_q, o_Aq);
+        }else {
           elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, mesh->o_ggeo, mesh->o_Dmatrices,
                              mesh->o_Smatrices, elliptic->o_lambda, o_q, o_Aq);
+        }
       }else{
         const dfloat lambda = elliptic->lambda[0];
-        if(elliptic->blockSolver){
+        if(elliptic->blockSolver) {
           occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo;
-          if(!elliptic->stressForm){
+          if(!elliptic->stressForm)
             elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors,
                                mesh->o_Dmatrices, mesh->o_Smatrices, elliptic->o_lambda,
                                o_q, o_Aq);
-          } else {
+          else
             elliptic->AxStressKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors,
-                               mesh->o_Dmatrices, mesh->o_Smatrices, elliptic->o_lambda,
-                               o_q, o_Aq);
-          }
-        }
-        else{
+                                     mesh->o_Dmatrices, mesh->o_Smatrices, elliptic->o_lambda,
+                                     o_q, o_Aq);
+        }else {
           occa::memory &o_ggeo = (!strstr(precision,dfloatString)) ? mesh->o_ggeoPfloat : mesh->o_ggeo;
           occa::memory &o_Dmatrices = (!strstr(precision,dfloatString)) ? mesh->o_DmatricesPfloat : mesh->o_Dmatrices;
           occa::memory &o_Smatrices = (!strstr(precision,dfloatString)) ? mesh->o_SmatricesPfloat : mesh->o_Smatrices;
@@ -129,7 +126,7 @@ void ellipticAx(elliptic_t* elliptic,
       if(integrationType == 0) { // GLL or non-hex
         if(mapType == 0) {
           if(elliptic->var_coeff) {
-            if(elliptic->blockSolver){
+            if(elliptic->blockSolver) {
               occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo;
               partialAxKernel(NelementsList,
                               elliptic->Ntotal,
@@ -141,8 +138,7 @@ void ellipticAx(elliptic_t* elliptic,
                               elliptic->o_lambda,
                               o_q,
                               o_Aq);
-            }
-            else
+            }else {
               partialAxKernel(NelementsList,
                               elliptic->Ntotal,
                               o_elementsList,
@@ -152,8 +148,9 @@ void ellipticAx(elliptic_t* elliptic,
                               elliptic->o_lambda,
                               o_q,
                               o_Aq);
+            }
           }else{
-            if(elliptic->blockSolver){
+            if(elliptic->blockSolver) {
               occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo;
               partialAxKernel(NelementsList,
                               elliptic->Ntotal,
@@ -165,11 +162,12 @@ void ellipticAx(elliptic_t* elliptic,
                               elliptic->o_lambda,
                               o_q,
                               o_Aq);
-            }
-            else{
+            }else {
               occa::memory &o_ggeo = (!strstr(precision,dfloatString)) ? mesh->o_ggeoPfloat : mesh->o_ggeo;
-              occa::memory &o_Dmatrices = (!strstr(precision,dfloatString)) ? mesh->o_DmatricesPfloat : mesh->o_Dmatrices;
-              occa::memory &o_Smatrices = (!strstr(precision,dfloatString)) ? mesh->o_SmatricesPfloat : mesh->o_Smatrices;
+              occa::memory &o_Dmatrices =
+                (!strstr(precision,dfloatString)) ? mesh->o_DmatricesPfloat : mesh->o_Dmatrices;
+              occa::memory &o_Smatrices =
+                (!strstr(precision,dfloatString)) ? mesh->o_SmatricesPfloat : mesh->o_Smatrices;
               partialAxKernel(NelementsList,
                               o_elementsList,
                               o_ggeo,
@@ -223,10 +221,11 @@ void ellipticOperator(elliptic_t* elliptic,
   mesh_t* mesh = elliptic->mesh;
   setupAide &options = elliptic->options;
   oogs_t* oogsAx = elliptic->oogsAx;
-  const char* ogsDataTypeString = (!strstr(precision, dfloatString)) ? 
-    options.compareArgs("ENABLE FLOATCOMMHALF GS SUPPORT","TRUE") ? ogsFloatCommHalf : ogsPfloat
+  const char* ogsDataTypeString = (!strstr(precision, dfloatString)) ?
+                                  options.compareArgs("ENABLE FLOATCOMMHALF GS SUPPORT",
+                                                      "TRUE") ? ogsFloatCommHalf : ogsPfloat
     :
-    ogsDfloat;
+                                  ogsDfloat;
   int serial = options.compareArgs("THREAD MODEL", "SERIAL");
   if(serial) {
     occa::memory o_dummy;
diff --git a/src/libP/solvers/elliptic/ellipticPrecon.h b/src/elliptic/ellipticPrecon.h
similarity index 99%
rename from src/libP/solvers/elliptic/ellipticPrecon.h
rename to src/elliptic/ellipticPrecon.h
index db7b7ae96..8b48852ad 100644
--- a/src/libP/solvers/elliptic/ellipticPrecon.h
+++ b/src/elliptic/ellipticPrecon.h
@@ -114,6 +114,7 @@ typedef struct
   parAlmond::solver_t* parAlmond;
 
   // block Jacobi precon
+  occa::memory o_invMM;
   occa::kernel blockJacobiKernel;
   occa::kernel partialblockJacobiKernel;
 
diff --git a/src/libP/solvers/elliptic/src/ellipticPreconditioner.c b/src/elliptic/ellipticPreconditioner.cpp
similarity index 99%
rename from src/libP/solvers/elliptic/src/ellipticPreconditioner.c
rename to src/elliptic/ellipticPreconditioner.cpp
index 998e1e62b..68d60a8d5 100644
--- a/src/libP/solvers/elliptic/src/ellipticPreconditioner.c
+++ b/src/elliptic/ellipticPreconditioner.cpp
@@ -40,13 +40,13 @@ void ellipticPreconditioner(elliptic_t* elliptic, occa::memory &o_r, occa::memor
       elliptic->dotMultiplyKernel(Nlocal, elliptic->Ntotal, o_r, precon->o_invDiagA, o_z);
     else
       elliptic->dotMultiplyKernel(Nlocal, o_r, precon->o_invDiagA, o_z);
-  }else if (options.compareArgs("PRECONDITIONER", "MULTIGRID"))  {
+  }else if (options.compareArgs("PRECONDITIONER", "MULTIGRID")) {
     timer::tic("preconditioner", 1);
     parAlmond::Precon(precon->parAlmond, o_z, o_r);
     //ogsGatherScatter(o_z, ogsDfloat, ogsAdd, elliptic->ogs);
     //elliptic->collocateKernel(mesh->Nelements*mesh->Np, elliptic->o_invDegree, o_z);
     timer::toc("preconditioner");
-  }else  {
+  }else {
     if(mesh->rank == 0) printf("ERRROR: Unknown preconditioner\n");
     MPI_Abort(mesh->comm, 1);
     //o_z.copyFrom(o_r);
diff --git a/src/libP/src/meshPrint3D.c b/src/elliptic/ellipticPreconditionerSetup.cpp
similarity index 55%
rename from src/libP/src/meshPrint3D.c
rename to src/elliptic/ellipticPreconditionerSetup.cpp
index 0b9748194..42ca7d9bb 100644
--- a/src/libP/src/meshPrint3D.c
+++ b/src/elliptic/ellipticPreconditionerSetup.cpp
@@ -24,37 +24,30 @@
 
  */
 
-#include <stdlib.h>
-#include <stdio.h>
-#include "mesh3D.h"
+#include "elliptic.h"
 
-void meshPrint3D(mesh3D* mesh)
+void ellipticPreconditionerSetup(elliptic_t* elliptic, ogs_t* ogs, occa::properties &kernelInfo)
 {
-  printf("EToV:\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    for(int v = 0; v < mesh->Nverts; ++v)
-      printf(hlongFormat " ", mesh->EToV[e * mesh->Nverts + v]);
-    printf("\n");
-  }
-
-  printf("EToE:\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      printf(dlongFormat " ",  mesh->EToE[e * mesh->Nfaces + f]);
-    printf("\n");
-  }
-
-  printf("EToB:\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      printf("%d ",  mesh->EToB[e * mesh->Nfaces + f]);
-    printf("\n");
-  }
-
-  printf("EToP:\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      printf("%d ",  mesh->EToP[e * mesh->Nfaces + f]);
-    printf("\n");
+  mesh_t* mesh = elliptic->mesh;
+  precon_t* precon = elliptic->precon;
+  setupAide &options = elliptic->options; // bind by reference; no need to copy the option table
+  // Dispatch on the "PRECONDITIONER" option; unsupported choices abort the whole MPI job.
+  if(options.compareArgs("PRECONDITIONER", "MULTIGRID")) {
+    ellipticMultiGridSetup(elliptic,precon);
+  } else if(options.compareArgs("PRECONDITIONER", "SEMFEM")) {
+    // ellipticSEMFEMSetup(elliptic,precon) is disabled, so this option is rejected.
+    // Abort the whole MPI job (as ellipticPreconditioner does) instead of exit()ing per rank.
+    if(mesh->rank == 0) { printf("ERROR: SEMFEM does not work right now.\n"); fflush(stdout); }
+    MPI_Abort(mesh->comm, 1);
+  } else if(options.compareArgs("PRECONDITIONER", "JACOBI")) {
+    dfloat* invDiagA = NULL; // host buffer filled in by ellipticBuildJacobi; freed below
+    ellipticBuildJacobi(elliptic,&invDiagA);
+    const dlong Nlocal =  mesh->Np * mesh->Nelements;
+    const dlong Ntotal = elliptic->blockSolver ? elliptic->Ntotal * elliptic->Nfields : Nlocal;
+    precon->o_invDiagA = mesh->device.malloc(Ntotal * sizeof(dfloat), invDiagA);
+    free(invDiagA);
+  } else {
+    if(mesh->rank == 0) { printf("ERROR: Unknown preconditioner!\n"); fflush(stdout); }
+    MPI_Abort(mesh->comm, 1);
   }
 }
diff --git a/src/libP/solvers/elliptic/src/ellipticResidualProjection.c b/src/elliptic/ellipticResidualProjection.cpp
similarity index 79%
rename from src/libP/solvers/elliptic/src/ellipticResidualProjection.c
rename to src/elliptic/ellipticResidualProjection.cpp
index 43f081ce6..97dbdafd8 100644
--- a/src/libP/solvers/elliptic/src/ellipticResidualProjection.c
+++ b/src/elliptic/ellipticResidualProjection.cpp
@@ -26,7 +26,7 @@
 #include "elliptic.h"
 #include "ellipticResidualProjection.h"
 #include <iostream>
-#include <timer.hpp>
+#include "timer.hpp"
 
 void ResidualProjection::matvec(occa::memory& o_Ax,
                                 const dlong Ax_offset,
@@ -37,35 +37,36 @@ void ResidualProjection::matvec(occa::memory& o_Ax,
   occa::memory o_Axtmp = o_Ax + fieldOffset * Ax_offset * sizeof(dfloat);
   matvecOperator(o_xtmp, o_Axtmp);
 }
+
 void ResidualProjection::updateProjectionSpace()
 {
   dlong m = numVecsProjection;
   if(m <= 0) return;
 
-  multiWeightedInnerProduct(o_xx, m, o_bb, m-1);
+  multiWeightedInnerProduct(o_xx, m, o_bb, m - 1);
   const dfloat norm_orig = alpha[m - 1];
   dfloat norm_new = norm_orig;
   const dfloat one = 1.0;
-  multiScaledAddwOffsetKernel(Nlocal, m, (m-1)*fieldOffset, fieldOffset, o_alpha, one, o_xx);
-  multiScaledAddwOffsetKernel(Nlocal, m, (m-1)*fieldOffset, fieldOffset, o_alpha, one, o_bb);
-  for(int k = 0; k < m - 1; ++k) {
+  multiScaledAddwOffsetKernel(Nlocal, m, (m - 1) * fieldOffset, fieldOffset, o_alpha, one, o_xx);
+  multiScaledAddwOffsetKernel(Nlocal, m, (m - 1) * fieldOffset, fieldOffset, o_alpha, one, o_bb);
+  for(int k = 0; k < m - 1; ++k)
     norm_new = norm_new - alpha[k] * alpha[k];
-  }
   norm_new = sqrt(norm_new);
   dfloat tol = 1e-7;
   const dfloat test = norm_new / norm_orig;
   if(test > tol) {
     const dfloat scale = 1.0 / norm_new;
-    scalarMultiplyKernel(Nlocal, (m-1) * fieldOffset, scale, o_xx);
-    scalarMultiplyKernel(Nlocal, (m-1) * fieldOffset, scale, o_bb);
+    scalarMultiplyKernel(Nlocal, (m - 1) * fieldOffset, scale, o_xx);
+    scalarMultiplyKernel(Nlocal, (m - 1) * fieldOffset, scale, o_bb);
   } else {
-    if(verbose && rank == 0){
+    if(verbose && rank == 0) {
       std::cout << "Detected rank deficiency: " << test << ".\n";
       std::cout << "Removing column : " << numVecsProjection << ".\n";
     }
     numVecsProjection--;
   }
 }
+
 void ResidualProjection::computePreProjection(occa::memory& o_r)
 {
   dfloat one = 1.0;
@@ -79,6 +80,7 @@ void ResidualProjection::computePreProjection(occa::memory& o_r)
   accumulateKernel(Nlocal, m, fieldOffset, o_alpha, o_bb, o_rtmp);
   scaledAddKernel(Nlocal, mone, o_rtmp, one, o_r);
 }
+
 void ResidualProjection::computePostProjection(occa::memory & o_x)
 {
   const dfloat one = 1.0;
@@ -95,7 +97,7 @@ void ResidualProjection::computePostProjection(occa::memory & o_x)
   } else {
     numVecsProjection++;
     // xx[m-1] = x
-    o_xx.copyFrom(o_x, Nlocal*sizeof(dfloat), (numVecsProjection - 1)*fieldOffset*sizeof(dfloat), 0);
+    o_xx.copyFrom(o_x, Nlocal * sizeof(dfloat), (numVecsProjection - 1) * fieldOffset * sizeof(dfloat), 0);
     // x = x + xbar
     scaledAddKernel(Nlocal, one, o_xbar, one, o_x);
   }
@@ -110,6 +112,7 @@ void ResidualProjection::computePostProjection(occa::memory & o_x)
     updateProjectionSpace();
   }
 }
+
 ResidualProjection::ResidualProjection(elliptic_t& elliptic,
                                        const dlong _maxNumVecsProjection,
                                        const dlong _numTimeSteps)
@@ -138,47 +141,52 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic,
   verbose = elliptic.options.compareArgs("VERBOSE","TRUE");
   alpha = (dfloat*) calloc(maxNumVecsProjection, sizeof(dfloat));
   work = (dfloat*) calloc(maxNumVecsProjection, sizeof(dfloat));
-  multiwork = (dfloat*) calloc(Nblock*maxNumVecsProjection, sizeof(dfloat));
+  multiwork = (dfloat*) calloc(Nblock * maxNumVecsProjection, sizeof(dfloat));
   o_alpha = elliptic.mesh->device.malloc(maxNumVecsProjection * sizeof(dfloat));
   o_xbar = elliptic.mesh->device.malloc(Nlocal * sizeof(dfloat));
   o_xx = elliptic.mesh->device.malloc(fieldOffset * maxNumVecsProjection * sizeof(dfloat));
   o_bb = elliptic.mesh->device.malloc(fieldOffset * maxNumVecsProjection * sizeof(dfloat));
 
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  const char* install_dir = getenv("NEKRS_INSTALL_DIR"); // getenv() yields NULL when unset
+  if(!install_dir) { std::cout << "ERROR: NEKRS_INSTALL_DIR is not set!" << std::endl; MPI_Abort(elliptic.mesh->comm, 1); }
+  const string oklpath = string(install_dir) + "/okl/elliptic/"; // directory of the elliptic OKL sources
+  string filename, kernelName;
+
   for (int r = 0; r < 2; r++) {
     if ((r == 0 && elliptic.mesh->rank == 0) || (r == 1 && elliptic.mesh->rank > 0)) {
       occa::properties properties;
       properties += elliptic.mesh->device.properties();
-      properties["defines/p_threadBlockSize"] = blockSize;
-      properties["defines/p_blockSize"] = blockSize;
+      properties["defines/p_threadBlockSize"] = BLOCKSIZE;
+      properties["defines/p_blockSize"] = BLOCKSIZE;
       properties["defines/dfloat"] = dfloatString;
       properties["defines/dlong"] = dlongString;
 
-      sprintf(fileName, DELLIPTIC "/okl/ellipticResidualProjection.okl");
-      scalarMultiplyKernel = elliptic.mesh->device.buildKernel(fileName,
+      filename = oklpath + "ellipticResidualProjection.okl";
+      scalarMultiplyKernel = elliptic.mesh->device.buildKernel(filename.c_str(),
                                                                "scalarMultiply",
                                                                properties);
-      multiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(fileName,
-                                                                 "multiScaledAddwOffset",
-                                                                 properties);
-      multiWeightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(fileName,
-                                                                      "multiWeightedInnerProduct2",
+      multiScaledAddwOffsetKernel = elliptic.mesh->device.buildKernel(filename.c_str(),
+                                                                      "multiScaledAddwOffset",
                                                                       properties);
-      accumulateKernel = elliptic.mesh->device.buildKernel(fileName, "accumulate", properties);
+      multiWeightedInnerProduct2Kernel = elliptic.mesh->device.buildKernel(filename.c_str(),
+                                                                           "multiWeightedInnerProduct2",
+                                                                           properties);
+      accumulateKernel = elliptic.mesh->device.buildKernel(filename.c_str(), "accumulate", properties);
     }
     MPI_Barrier(elliptic.mesh->comm);
   }
   scaledAddKernel = elliptic.scaledAddKernel;
   sumKernel = elliptic.mesh->sumKernel;
   matvecOperator = [&](occa::memory& o_x, occa::memory & o_Ax)
-  {
-    ellipticOperator(&elliptic, o_x, o_Ax, dfloatString);
-  };
+                   {
+                     ellipticOperator(&elliptic, o_x, o_Ax, dfloatString);
+                   };
   weightedNorm = [&](occa::memory& o_x)
-  {
-    return ellipticWeightedNorm2(&elliptic, o_invDegree, o_x);
-  };
+                 {
+                   return ellipticWeightedNorm2(&elliptic, o_invDegree, o_x);
+                 };
 }
+
 void ResidualProjection::pre(occa::memory& o_r)
 {
   ++timestep;
@@ -204,33 +212,34 @@ void ResidualProjection::pre(occa::memory& o_r)
               << postResidualNorm << ", "
               << ratio << "\n";
 }
+
 void ResidualProjection::post(occa::memory& o_x)
 {
   if(timestep < numTimeSteps)
     return;
   computePostProjection(o_x);
 }
+
 void ResidualProjection::multiWeightedInnerProduct(
-                                                occa::memory &o_a,
-                                                const dlong m,
-                                                occa::memory &o_b,
-                                                const dlong offset)
+  occa::memory &o_a,
+  const dlong m,
+  occa::memory &o_b,
+  const dlong offset)
 {
 #ifdef ELLIPTIC_ENABLE_TIMER
   timer::tic("dotp",1);
 #endif
-  multiWeightedInnerProduct2Kernel(Nlocal, fieldOffset, Nblock, m, offset*fieldOffset, o_invDegree, o_a, o_b, o_wrk);
+  multiWeightedInnerProduct2Kernel(Nlocal, fieldOffset, Nblock, m, offset * fieldOffset, o_invDegree, o_a, o_b, o_wrk);
 
-  o_wrk.copyTo(multiwork, sizeof(dfloat)*m*Nblock);
-  for(dlong k = 0 ; k < m; ++k){
+  o_wrk.copyTo(multiwork, sizeof(dfloat) * m * Nblock);
+  for(dlong k = 0; k < m; ++k) {
     dfloat accum = 0.0;
-    for(dlong n = 0; n < Nblock; ++n){
-      accum += multiwork[n+k*Nblock];
-    }
+    for(dlong n = 0; n < Nblock; ++n)
+      accum += multiwork[n + k * Nblock];
     alpha[k] = accum;
   }
   MPI_Allreduce(MPI_IN_PLACE, alpha, m, MPI_DFLOAT, MPI_SUM, comm);
-  o_alpha.copyFrom(alpha,sizeof(dfloat)*m);
+  o_alpha.copyFrom(alpha,sizeof(dfloat) * m);
 #ifdef ELLIPTIC_ENABLE_TIMER
   timer::toc("dotp");
 #endif
diff --git a/src/libP/solvers/elliptic/ellipticResidualProjection.h b/src/elliptic/ellipticResidualProjection.h
similarity index 89%
rename from src/libP/solvers/elliptic/ellipticResidualProjection.h
rename to src/elliptic/ellipticResidualProjection.h
index 67d276a86..625419e94 100644
--- a/src/libP/solvers/elliptic/ellipticResidualProjection.h
+++ b/src/elliptic/ellipticResidualProjection.h
@@ -26,12 +26,10 @@
 
 #ifndef ELLIPTIC_RESIDUAL_PROJECTION_H
 #define ELLIPTIC_RESIDUAL_PROJECTION_H
-#include <occa.hpp>
-#include <types.h>
 #include <vector>
 #include <sstream>
-#include <elliptic.h>
 #include <functional>
+#include "elliptic.h"
 
 class ResidualProjection final
 {
@@ -47,10 +45,10 @@ class ResidualProjection final
   void updateProjectionSpace();
   void matvec(occa::memory& o_Ax, const dlong Ax_offset, occa::memory& o_x, const dlong x_offset);
   void multiWeightedInnerProduct(
-                              occa::memory& o_a,
-                              const dlong m,
-                              occa::memory& o_b,
-                              const dlong offset);
+    occa::memory& o_a,
+    const dlong m,
+    occa::memory& o_b,
+    const dlong offset);
   const dlong maxNumVecsProjection;
   const dlong numTimeSteps;
   dlong timestep;
@@ -75,9 +73,9 @@ class ResidualProjection final
   occa::kernel scaledAddKernel;
   occa::kernel sumKernel;
 
-  dfloat * alpha;
-  dfloat * work;
-  dfloat * multiwork;
+  dfloat* alpha;
+  dfloat* work;
+  dfloat* multiwork;
   dfloat* tmp;
 
   dlong numVecsProjection;
@@ -92,7 +90,5 @@ class ResidualProjection final
 
   std::function<void(occa::memory&,occa::memory&)> matvecOperator;
   std::function<dfloat(occa::memory&)> weightedNorm;
-
-
 };
-#endif
\ No newline at end of file
+#endif
diff --git a/src/libP/solvers/elliptic/src/ellipticScaledAdd.c b/src/elliptic/ellipticScaledAdd.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/ellipticScaledAdd.c
rename to src/elliptic/ellipticScaledAdd.cpp
diff --git a/src/libP/solvers/elliptic/src/ellipticSolve.c b/src/elliptic/ellipticSolve.cpp
similarity index 92%
rename from src/libP/solvers/elliptic/src/ellipticSolve.c
rename to src/elliptic/ellipticSolve.cpp
index 372ddd74b..1cb888870 100644
--- a/src/libP/solvers/elliptic/src/ellipticSolve.c
+++ b/src/elliptic/ellipticSolve.cpp
@@ -25,9 +25,9 @@
  */
 
 #include "elliptic.h"
-#include <timer.hpp>
+#include "timer.hpp"
 
-int ellipticSolve(elliptic_t* elliptic, dfloat tol,
+int ellipticSolve(elliptic_t* elliptic,
                   occa::memory &o_r, occa::memory &o_x)
 {
   mesh_t* mesh = elliptic->mesh;
@@ -35,6 +35,7 @@ int ellipticSolve(elliptic_t* elliptic, dfloat tol,
 
   int Niter = 0;
   int maxIter = 1000;
+  dfloat tol = 1e-6; // fallback tolerance; options.getArgs("SOLVER TOLERANCE", tol) below may replace it
 
   options.getArgs("MAXIMUM ITERATIONS", maxIter);
   options.getArgs("SOLVER TOLERANCE", tol);
@@ -42,11 +43,10 @@ int ellipticSolve(elliptic_t* elliptic, dfloat tol,
   if(elliptic->var_coeff && options.compareArgs("PRECONDITIONER", "JACOBI"))
     ellipticUpdateJacobi(elliptic);
 
-  if(options.compareArgs("RESIDUAL PROJECTION","TRUE")) {
-    elliptic->o_x0.copyFrom(o_x, elliptic->Ntotal*sizeof(dfloat));
-  }
+  if(options.compareArgs("RESIDUAL PROJECTION","TRUE"))
+    elliptic->o_x0.copyFrom(o_x, elliptic->Ntotal * sizeof(dfloat));
 
-  // compute initial residual 
+  // compute initial residual
   ellipticOperator(elliptic, o_x, elliptic->o_Ap, dfloatString);
   ellipticScaledAdd(elliptic, -1.f, elliptic->o_Ap, 1.f, o_r);
   if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_r);
@@ -70,7 +70,7 @@ int ellipticSolve(elliptic_t* elliptic, dfloat tol,
       Niter = nbpcg (elliptic, o_r, o_x, tol, maxIter);
     else
       Niter = nbfpcg (elliptic, o_r, o_x, tol, maxIter);
-*/
+ */
   }
 
   if(options.compareArgs("RESIDUAL PROJECTION","TRUE")) {
diff --git a/src/libP/solvers/elliptic/src/ellipticSolveSetup.c b/src/elliptic/ellipticSolveSetup.cpp
similarity index 68%
rename from src/libP/solvers/elliptic/src/ellipticSolveSetup.c
rename to src/elliptic/ellipticSolveSetup.cpp
index dc7ebf2e4..78a60f977 100644
--- a/src/libP/solvers/elliptic/src/ellipticSolveSetup.c
+++ b/src/elliptic/ellipticSolveSetup.cpp
@@ -33,7 +33,7 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
   setupAide options = elliptic->options;
 
   const dlong Nlocal = mesh->Np * mesh->Nelements;
-  elliptic->resNormFactor = 1 / (elliptic->Nfields*mesh->volume);
+  elliptic->resNormFactor = 1 / (elliptic->Nfields * mesh->volume);
 
   const int serial = options.compareArgs("THREAD MODEL", "SERIAL");
 
@@ -47,7 +47,6 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
     exit(-1);
   }
 
-  // Sanity check for discretization type
   if (options.compareArgs("COEFFICIENT","VARIABLE") &&  elliptic->elementType != HEXAHEDRA &&
       !options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {
     if(mesh->rank == 0)
@@ -57,7 +56,6 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
     exit(-1);
   }
 
-  // Sanity check for preconditioner type
   if (options.compareArgs("COEFFICIENT","VARIABLE")) {
     if(options.compareArgs("PRECONDITIONER",
                            "MULTIGRID") &&
@@ -68,38 +66,12 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
       MPI_Finalize();
       exit(-1);
     }
-
-    if(!options.compareArgs("PRECONDITIONER",
-                            "MULTIGRID") && !options.compareArgs("PRECONDITIONER", "JACOBI")
-       && !options.compareArgs("PRECONDITIONER", "NONE")) {
-      if(mesh->rank == 0)
-        printf(
-          "ERROR: Varibale coefficient solver is implemented for multigrid/Jacobi/None preconditioners only\n");
-
-      MPI_Finalize();
-      exit(-1);
-    }
-  }
-
-  //sanity checking
-  if (options.compareArgs("BASIS","BERN") && elliptic->elementType != TRIANGLES) {
-    printf("ERROR: BERN basis is only available for triangular elements\n");
-    MPI_Finalize();
-    exit(-1);
-  }
-
-  if (options.compareArgs("PRECONDITIONER","MASSMATRIX") && elliptic->elementType != TRIANGLES
-      && elliptic->elementType != TETRAHEDRA ) {
-    printf(
-      "ERROR: MASSMATRIX preconditioner is only available for triangle and tetrhedra elements. Use JACOBI instead.\n");
-    MPI_Finalize();
-    exit(-1);
   }
 
-  dlong Nblock  = mymax(1,(Nlocal + blockSize - 1) / blockSize);
-  dlong Nblock2 = mymax(1,(Nblock + blockSize - 1) / blockSize);
+  dlong Nblock  = mymax(1,(Nlocal + BLOCKSIZE - 1) / BLOCKSIZE);
+  dlong Nblock2 = mymax(1,(Nblock + BLOCKSIZE - 1) / BLOCKSIZE);
 
-  dlong NthreadsUpdatePCG = 256;
+  dlong NthreadsUpdatePCG = BLOCKSIZE;
   dlong NblocksUpdatePCG = mymin((Nlocal + NthreadsUpdatePCG - 1) / NthreadsUpdatePCG, 160);
 
   elliptic->NthreadsUpdatePCG = NthreadsUpdatePCG;
@@ -373,7 +345,8 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
             int BCFlag = elliptic->BCType[bc + elliptic->NBCType * fld];
             int fid = mesh->faceNodes[n + f * mesh->Nfp];
             elliptic->mapB[fid + e * mesh->Np + fld * elliptic->Ntotal] = mymin(BCFlag,
-                                                  elliptic->mapB[fid + e *mesh->Np + fld*elliptic->Ntotal]);
+                                                                                elliptic->mapB[fid + e * mesh->Np +
+                                                                                               fld * elliptic->Ntotal]);
           }
         }
       }
@@ -413,7 +386,6 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
       elliptic->Nmasked * sizeof(dlong),
       elliptic->maskIds);
 
-
   if(elliptic->blockSolver) { // Create a gs handle independent from BC handler
     elliptic->ogs = ogsSetup(Nlocal, mesh->globalIds, mesh->comm, verbose, mesh->device);
     // Create copy of invDegree so that we can accelerate vector form of masking!!!!!!
@@ -444,32 +416,16 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
   //  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
   kernelInfo["defines/pfloat"] = pfloatString;
 
-
   // set kernel name suffix
-  char* suffix;
-
-  if(elliptic->elementType == TRIANGLES) {
-    if(elliptic->dim == 2)
-      suffix = strdup("Tri2D");
-    else
-      suffix = strdup("Tri3D");
-  }
-  if(elliptic->elementType == QUADRILATERALS) {
-    if(elliptic->dim == 2)
-      suffix = strdup("Quad2D");
-    else
-      suffix = strdup("Quad3D");
-  }
-  if(elliptic->elementType == TETRAHEDRA)
-    suffix = strdup("Tet3D");
+  string suffix;
   if(elliptic->elementType == HEXAHEDRA)
-    suffix = strdup("Hex3D");
+    suffix = "Hex3D";
 
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  string filename, kernelName;
 
   kernelInfo["defines/" "p_eNfields"] = elliptic->Nfields;
   kernelInfo["defines/p_Nalign"] = USE_OCCA_MEM_BYTE_ALIGN;
-  kernelInfo["defines/" "p_blockSize"] = blockSize;
+  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
 
   occa::properties pfloatKernelInfo = kernelInfo;
   pfloatKernelInfo["defines/dfloat"] = pfloatString;
@@ -478,243 +434,305 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
   occa::properties kernelInfoNoOKL = kernelInfo;
   if(serial) kernelInfoNoOKL["okl/enabled"] = false;
 
+  const char* installDirEnv = getenv("NEKRS_INSTALL_DIR"); // NULL when unset: never feed that to std::string
+  if(!installDirEnv) { printf("ERROR: NEKRS_INSTALL_DIR is not set\n"); MPI_Finalize(); exit(-1); }
+  string install_dir(installDirEnv);
   MPI_Barrier(mesh->comm);
-  double tStartLoadKernel = MPI_Wtime(); 
-  if(mesh->rank == 0)  printf("loading elliptic kernels ... "); fflush(stdout);
+  double tStartLoadKernel = MPI_Wtime();
+  if(mesh->rank == 0) printf("loading elliptic kernels ... ");
+  fflush(stdout);
 
   for (int r = 0; r < 2; r++) {
     if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
+      const string oklpath = install_dir + "/okl/core/";
+      string filename;
+
       //mesh kernels
+      filename = oklpath + "meshHaloExtract2D.okl";
       mesh->haloExtractKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/meshHaloExtract2D.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "meshHaloExtract2D",
                                  kernelInfo);
 
+      filename = oklpath + "addScalar.okl";
       mesh->addScalarKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/addScalar.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "addScalar",
                                  kernelInfo);
 
+      filename = oklpath + "mask.okl";
       mesh->maskKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/mask.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "mask",
                                  kernelInfo);
+
+      filename = oklpath + "mask.okl";
       mesh->maskPfloatKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/mask.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "mask",
                                  pfloatKernelInfo);
 
+      filename = oklpath + "sum.okl";
       mesh->sumKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/sum.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "sum",
                                  kernelInfo);
 
+      filename = oklpath + "fill.okl";
       elliptic->fillKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/fill.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "fill",
                                  kernelInfo);
 
+      filename = oklpath + "dotMultiplyAdd.okl";
       elliptic->dotMultiplyAddKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/dotMultiplyAdd.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "dotMultiplyAdd",
                                  kernelInfo);
 
+      filename = oklpath + "dotDivide.okl";
       elliptic->dotDivideKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/dotDivide.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "dotDivide",
                                  kernelInfo);
 
+      filename = oklpath + "dotDivide.okl";
       elliptic->scalarDivideKernel =
-        mesh->device.buildKernel(DHOLMES "/okl/dotDivide.okl",
+        mesh->device.buildKernel(filename.c_str(),
                                  "scalarDivide",
                                  kernelInfo);
 
       if(elliptic->blockSolver) {
+        filename = oklpath + "sum.okl"; // kernel name is passed to buildKernel, not part of the path
         elliptic->sumBlockKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/sum.okl", "sumBlock", kernelInfo);
+          mesh->device.buildKernel(filename.c_str(), "sumBlock", kernelInfo);
 
+        filename = oklpath + "sum.okl"; // same source file, different kernel entry point
         elliptic->sumBlockFieldKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/sum.okl", "sumBlockField", kernelInfo);
+          mesh->device.buildKernel(filename.c_str(), "sumBlockField", kernelInfo);
 
+        filename = oklpath + "addScalar.okl";
         elliptic->addScalarBlockFieldKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/addScalar.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "addBlockScalarField",
                                    kernelInfo);
 
+        filename = oklpath + "weightedInnerProduct1.okl";
         elliptic->weightedInnerProduct1Kernel =
-          mesh->device.buildKernel(DHOLMES "/okl/weightedInnerProduct1.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "weightedBlockInnerProduct1",
                                    kernelInfo);
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialWeightedInnerProduct2.c";
           elliptic->weightedInnerProduct2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialWeightedInnerProduct2.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedBlockInnerProduct2",
                                      kernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "weightedInnerProduct2.okl";
           elliptic->weightedInnerProduct2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/weightedInnerProduct2.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedBlockInnerProduct2",
                                      kernelInfo);
+        }
 
+        filename = oklpath + "innerProduct.okl";
         elliptic->innerProductKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/innerProduct.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "innerBlockProduct",
                                    kernelInfo);
 
+        filename = oklpath + "innerProduct.okl";
         elliptic->innerProductFieldKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/innerProduct.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "innerBlockProductField",
                                    kernelInfo);
 
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialWeightedNorm2.c";
           elliptic->weightedNorm2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialWeightedNorm2.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedBlockNorm2",
                                      kernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "weightedNorm2.okl";
           elliptic->weightedNorm2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/weightedNorm2.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedBlockNorm2",
                                      kernelInfo);
+        }
 
+        filename = oklpath + "norm2.okl";
         elliptic->norm2Kernel =
-          mesh->device.buildKernel(DHOLMES "/okl/norm2.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "normBlock2",
                                    kernelInfo);
+
+        filename = oklpath + "scaledAdd.okl";
         elliptic->scaledAddPfloatKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "scaledBlockAdd",
                                    pfloatKernelInfo);
 
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialScaledAdd.c"; // oklpath already ends with '/'
           elliptic->scaledAddKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialScaledAdd.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "scaledBlockAdd",
                                      kernelInfoNoOKL);
-        else{
+        } else {
+          filename = oklpath + "scaledAdd.okl";
           elliptic->scaledAddKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "scaledBlockAdd",
                                      kernelInfo);
         }
 
+        filename = oklpath + "dotMultiply.okl";
         elliptic->collocateKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "collocate",
                                    kernelInfo);
-          elliptic->dotMultiplyPfloatKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl",
-                                     "dotBlockMultiply",
-                                     pfloatKernelInfo);
+        filename = oklpath + "dotMultiply.okl";
+        elliptic->dotMultiplyPfloatKernel =
+          mesh->device.buildKernel(filename.c_str(),
+                                   "dotBlockMultiply",
+                                   pfloatKernelInfo);
 
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialDotMultiply.c";
           elliptic->dotMultiplyKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialDotMultiply.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "dotBlockMultiply",
                                      kernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "dotMultiply.okl";
           elliptic->dotMultiplyKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "dotBlockMultiply",
                                      kernelInfo);
+        }
 
+        filename = oklpath + "dotDivide.okl";
         elliptic->scalarDivideManyKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/dotDivide.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "scalarDivideMany",
                                    kernelInfo);
       }else{
+        filename = oklpath + "weightedInnerProduct1.okl";
         elliptic->weightedInnerProduct1Kernel =
-          mesh->device.buildKernel(DHOLMES "/okl/weightedInnerProduct1.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "weightedInnerProduct1",
                                    kernelInfo);
 
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialWeightedInnerProduct2.c";
           elliptic->weightedInnerProduct2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialWeightedInnerProduct2.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedInnerProduct2",
                                      kernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "weightedInnerProduct2.okl";
           elliptic->weightedInnerProduct2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/weightedInnerProduct2.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedInnerProduct2",
                                      kernelInfo);
+        }
 
+        filename = oklpath + "innerProduct.okl";
         elliptic->innerProductKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/innerProduct.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "innerProduct",
                                    kernelInfo);
 
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialWeightedNorm2.c";
           elliptic->weightedNorm2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialWeightedNorm2.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedNorm2",
                                      kernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "weightedNorm2.okl";
           elliptic->weightedNorm2Kernel =
-            mesh->device.buildKernel(DHOLMES "/okl/weightedNorm2.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "weightedNorm2",
                                      kernelInfo);
+        }
 
+        filename = oklpath + "norm2.okl";
         elliptic->norm2Kernel =
-          mesh->device.buildKernel(DHOLMES "/okl/norm2.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "norm2",
                                    kernelInfo);
-        elliptic->scaledAddPfloatKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl",
-                                   "scaledAdd",
-                                   pfloatKernelInfo);
+
+        filename = oklpath + "copyDfloatToPfloat.okl";
         elliptic->copyDfloatToPfloatKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/copyDfloatToPfloat.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "copyDfloatToPfloat",
                                    kernelInfo);
+
+        filename = oklpath + "copyPfloatToDfloat.okl";
         elliptic->copyPfloatToDPfloatKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/copyPfloatToDfloat.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "copyPfloatToDfloat",
                                    kernelInfo);
+
+        filename = oklpath + "scaledAdd.okl";
+        elliptic->scaledAddPfloatKernel =
+          mesh->device.buildKernel(filename.c_str(),
+                                   "scaledAdd",
+                                   pfloatKernelInfo);
+
         elliptic->updateSmoothedSolutionVecKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "updateSmoothedSolutionVec",
                                    pfloatKernelInfo);
         elliptic->updateChebyshevSolutionVecKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "updateChebyshevSolutionVec",
                                    pfloatKernelInfo);
 
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialScaledAdd.c";
           elliptic->scaledAddKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialScaledAdd.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "scaledAdd",
                                      kernelInfoNoOKL);
-        else{
+        } else {
+          filename = oklpath + "scaledAdd.okl";
           elliptic->scaledAddKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "scaledAdd",
                                      kernelInfo);
         }
 
+        filename = oklpath + "dotMultiply.okl";
         elliptic->collocateKernel =
-          mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl",
+          mesh->device.buildKernel(filename.c_str(),
                                    "collocate",
                                    kernelInfo);
-          elliptic->dotMultiplyKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl",
-                                     "dotMultiply",
-                                     kernelInfo);
-          elliptic->dotMultiplyPfloatKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl",
-                                     "dotMultiply",
-                                     pfloatKernelInfo);
+        elliptic->dotMultiplyKernel =
+          mesh->device.buildKernel(filename.c_str(),
+                                   "dotMultiply",
+                                   kernelInfo);
+        elliptic->dotMultiplyPfloatKernel =
+          mesh->device.buildKernel(filename.c_str(),
+                                   "dotMultiply",
+                                   pfloatKernelInfo);
 
-        if(serial)
+        if(serial) {
+          filename = oklpath + "serialDotMultiply.c";
           elliptic->dotMultiplyKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/serialDotMultiply.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "dotMultiply",
                                      kernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "dotMultiply.okl";
           elliptic->dotMultiplyKernel =
-            mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "dotMultiply",
                                      kernelInfo);
+        }
       }
     }
     MPI_Barrier(mesh->comm);
@@ -738,22 +756,22 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
   int maxNodes = mymax(mesh->Np, (mesh->Nfp * mesh->Nfaces));
   kernelInfo["defines/" "p_maxNodes"] = maxNodes;
 
-  int NblockV = mymax(1,maxNthreads / mesh->Np); // works for CUDA
+  int NblockV = mymax(1,BLOCKSIZE / mesh->Np);
   int NnodesV = 1; //hard coded for now
   kernelInfo["defines/" "p_NblockV"] = NblockV;
   kernelInfo["defines/" "p_NnodesV"] = NnodesV;
   kernelInfo["defines/" "p_NblockVFine"] = NblockV;
   kernelInfo["defines/" "p_NblockVCoarse"] = NblockV;
 
-  int NblockS = mymax(1,maxNthreads / maxNodes); // works for CUDA
+  int NblockS = mymax(1,BLOCKSIZE / maxNodes);
   kernelInfo["defines/" "p_NblockS"] = NblockS;
 
-  int NblockP = mymax(1,maxNthreads / (4 * mesh->Np)); // get close to maxNthreads threads
+  int NblockP = mymax(1,BLOCKSIZE / (4 * mesh->Np)); // get close to BLOCKSIZE threads
   kernelInfo["defines/" "p_NblockP"] = NblockP;
 
   int NblockG;
   if(mesh->Np <= 32) NblockG = ( 32 / mesh->Np );
-  else NblockG = maxNthreads / mesh->Np;
+  else NblockG = BLOCKSIZE / mesh->Np;
   kernelInfo["defines/" "p_NblockG"] = NblockG;
 
   kernelInfo["defines/" "p_halfC"] = (int)((mesh->cubNq + 1) / 2);
@@ -775,67 +793,64 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
 
   for (int r = 0; r < 2; r++) {
     if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
+      const string oklpath = install_dir + "/okl/elliptic/";
+      string filename;
 
       if(elliptic->var_coeff) {
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticBuildDiagonal%s.okl", suffix);
+        filename = oklpath + "ellipticBuildDiagonal" + suffix + ".okl";
         if(elliptic->blockSolver)
-          sprintf(kernelName, "ellipticBlockBuildDiagonal%s", suffix);
+          kernelName = "ellipticBlockBuildDiagonal" + suffix;
         else
-          sprintf(kernelName, "ellipticBuildDiagonal%s", suffix);
-        elliptic->updateDiagonalKernel = mesh->device.buildKernel(fileName,
-                                                                  kernelName,
+          kernelName = "ellipticBuildDiagonal" + suffix;
+        elliptic->updateDiagonalKernel = mesh->device.buildKernel(filename.c_str(),
+                                                                  kernelName.c_str(),
                                                                   dfloatKernelInfo);
       }
 
       if(elliptic->blockSolver) {
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticBlockAx%s.okl", suffix);
-        if(serial) sprintf(fileName,  DELLIPTIC "/okl/ellipticSerialAx%s.c", suffix);
-        if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA){
-          if(elliptic->stressForm){
-            sprintf(kernelName, "ellipticStressAxVar%s", suffix);
-          } else {
-            sprintf(kernelName, "ellipticBlockAxVar%s_N%d", suffix, elliptic->Nfields);
-          }
-        }
-        else{
-          if(elliptic->stressForm){
-            sprintf(kernelName, "ellipticStressAx%s", suffix);
-          } else {
-            sprintf(kernelName, "ellipticBlockAx%s_N%d", suffix, elliptic->Nfields);
-          }
+        filename =  oklpath + "ellipticBlockAx" + suffix + ".okl";
+        if(serial) filename = oklpath + "ellipticSerialAx" +  suffix + ".c";
+        if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA) {
+          if(elliptic->stressForm)
+            kernelName = "ellipticStressAxVar" + suffix;
+          else
+            kernelName = "ellipticBlockAxVar" + suffix + "_N" + std::to_string(elliptic->Nfields);
+        }else {
+          if(elliptic->stressForm)
+            kernelName = "ellipticStressAx" + suffix;
+          else // concatenate with '+': a ',' here would drop the suffix and field count from the name
+            kernelName = "ellipticBlockAx" + suffix + "_N" + std::to_string(elliptic->Nfields);
+        }
       }else{
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticAx%s.okl", suffix);
-        if(serial) sprintf(fileName,  DELLIPTIC "/okl/ellipticSerialAx%s.c", suffix);
+        filename = oklpath + "ellipticAx" + suffix + ".okl";
+        if(serial) filename = oklpath + "ellipticSerialAx" + suffix + ".c";
         if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA)
-          sprintf(kernelName, "ellipticAxVar%s", suffix);
+          kernelName = "ellipticAxVar" + suffix;
         else
-          sprintf(kernelName, "ellipticAx%s", suffix);
+          kernelName =  "ellipticAx" + suffix;
       }
-      elliptic->AxStressKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
+      elliptic->AxStressKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
       if(elliptic->blockSolver) {
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticBlockAx%s.okl", suffix);
-        if(serial) sprintf(fileName,  DELLIPTIC "/okl/ellipticSerialAx%s.c", suffix);
-        if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA){
-            sprintf(kernelName, "ellipticBlockAxVar%s_N%d", suffix, elliptic->Nfields);
-        }
-        else{
-            sprintf(kernelName, "ellipticBlockAx%s_N%d", suffix, elliptic->Nfields);
-        }
+        filename = oklpath + "ellipticBlockAx" + suffix + ".okl";
+        if(serial) filename = oklpath + "ellipticSerialAx" + suffix + ".c";
+        if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA)
+          kernelName = "ellipticBlockAxVar" + suffix + "_N" + std::to_string(elliptic->Nfields);
+        else
+          kernelName = "ellipticBlockAx" + suffix + "_N" + std::to_string(elliptic->Nfields);
       }else{
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticAx%s.okl", suffix);
-        if(serial) sprintf(fileName,  DELLIPTIC "/okl/ellipticSerialAx%s.c", suffix);
+        filename = oklpath + "ellipticAx" + suffix + ".okl";
+        if(serial) filename = oklpath + "ellipticSerialAx" + suffix + ".c";
         if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA)
-          sprintf(kernelName, "ellipticAxVar%s", suffix);
+          kernelName = "ellipticAxVar" + suffix;
         else
-          sprintf(kernelName, "ellipticAx%s", suffix);
+          kernelName = "ellipticAx" + suffix;
       }
       // Keep other kernel around
-      elliptic->AxKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
+      elliptic->AxKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
 
       if(!serial) {
         if(elliptic->elementType != HEXAHEDRA) {
-          sprintf(kernelName, "ellipticPartialAx%s", suffix);
+          kernelName = "ellipticPartialAx" + suffix;
         }else {
           if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")) {
             if(elliptic->var_coeff || elliptic->blockSolver) {
@@ -843,187 +858,142 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
                 "ERROR: TRILINEAR form is not implemented for varibale coefficient and block solver yet \n");
               exit(-1);
             }
-            sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix);
+            kernelName = "ellipticPartialAxTrilinear" + suffix;
           }else {
             if(elliptic->blockSolver) {
-              if(elliptic->var_coeff){
-                if(elliptic->stressForm){
-                  sprintf(kernelName, "ellipticStressPartialAxVar%s", suffix);
-                } else {
-                  sprintf(kernelName, "ellipticBlockPartialAxVar%s_N%d", suffix, elliptic->Nfields);
-                }
-              }
-              else{
-                if(elliptic->stressForm){
-                  sprintf(kernelName, "ellipticStessPartialAx%s", suffix);
-                } else {
-                  sprintf(kernelName, "ellipticBlockPartialAx%s_N%d", suffix, elliptic->Nfields);
-                }
+              if(elliptic->var_coeff) {
+                if(elliptic->stressForm)
+                  kernelName = "ellipticStressPartialAxVar" + suffix;
+                else
+                  kernelName = "ellipticBlockPartialAxVar" + suffix + "_N" + std::to_string(elliptic->Nfields);
+              }else {
+                if(elliptic->stressForm)
+                  kernelName = "ellipticStessPartialAx" + suffix;
+                else
+                  kernelName = "ellipticBlockPartialAx" + suffix + "_N" + std::to_string(elliptic->Nfields);
               }
             }else {
               if(elliptic->var_coeff)
-                sprintf(kernelName, "ellipticPartialAxVar%s", suffix);
+                kernelName = "ellipticPartialAxVar" + suffix;
               else
-                sprintf(kernelName, "ellipticPartialAx%s", suffix);
+                kernelName = "ellipticPartialAx" + suffix;
             }
           }
         }
-        elliptic->partialAxKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
-        elliptic->partialAxKernel2 = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
-      }
-
-/*
-      // only for Hex3D - cubature Ax
-      if(elliptic->elementType == HEXAHEDRA && !elliptic->var_coeff && !elliptic->blockSolver) {
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticCubatureAx%s.okl", suffix);
-
-        sprintf(kernelName, "ellipticCubaturePartialAx%s", suffix);
-        elliptic->partialCubatureAxKernel = mesh->device.buildKernel(fileName,
-                                                                     kernelName,
-                                                                     dfloatKernelInfo);
+        elliptic->partialAxKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
+        elliptic->partialAxKernel2 = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),AxKernelInfo);
       }
-*/
 
       // combined PCG update and r.r kernel
       if(elliptic->blockSolver) {
-        if(serial)
+        if(serial) {
+          filename = oklpath + "ellipticSerialUpdatePCG.c";
           elliptic->updatePCGKernel =
-            mesh->device.buildKernel(DELLIPTIC "/okl/ellipticSerialUpdatePCG.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "ellipticUpdatePCG", dfloatKernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "ellipticUpdatePCG.okl";
           elliptic->updatePCGKernel =
-            mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdatePCG.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "ellipticBlockUpdatePCG", dfloatKernelInfo);
+        }
 
+/*
+        filename = oklpath + "ellipticUpdateNBPCG.okl";
         elliptic->update1NBPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBPCG.okl",
                                    "ellipticBlockUpdate1NBPCG", dfloatKernelInfo);
 
         elliptic->update2NBPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBPCG.okl",
                                    "ellipticBlockUpdate2NBPCG", dfloatKernelInfo);
 
-        // combined update for Non-blocking flexible PCG
+        filename = oklpath + "ellipticUpdateNBFPCG.okl";
         elliptic->update0NBFPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBFPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBFPCG.okl",
                                    "ellipticBlockUpdate0NBFPCG", dfloatKernelInfo);
 
         elliptic->update1NBFPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBFPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBFPCG.okl",
                                    "ellipticBlockUpdate1NBFPCG", dfloatKernelInfo);
+ */
       }else{
-        if(serial)
+        if(serial) {
+          filename = oklpath + "ellipticSerialUpdatePCG.c";
           elliptic->updatePCGKernel =
-            mesh->device.buildKernel(DELLIPTIC "/okl/ellipticSerialUpdatePCG.c",
+            mesh->device.buildKernel(filename.c_str(),
                                      "ellipticUpdatePCG", dfloatKernelInfoNoOKL);
-        else
+        } else {
+          filename = oklpath + "ellipticUpdatePCG.okl";
           elliptic->updatePCGKernel =
-            mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdatePCG.okl",
+            mesh->device.buildKernel(filename.c_str(),
                                      "ellipticUpdatePCG", dfloatKernelInfo);
-
-        // combined update for Non-blocking PCG
+        }
+/*
+        filename = oklpath + "ellipticUpdateNBPCG.okl";
         elliptic->update1NBPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBPCG.okl",
                                    "ellipticUpdate1NBPCG", dfloatKernelInfo);
 
         elliptic->update2NBPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBPCG.okl",
                                    "ellipticUpdate2NBPCG", dfloatKernelInfo);
 
-        // combined update for Non-blocking flexible PCG
+        filename = oklpath + "ellipticUpdateNBFPCG.okl";
         elliptic->update0NBFPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBFPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBFPCG.okl",
                                    "ellipticUpdate0NBFPCG", dfloatKernelInfo);
 
         elliptic->update1NBFPCGKernel =
-          mesh->device.buildKernel(DELLIPTIC "/okl/ellipticUpdateNBFPCG.okl",
+          mesh->device.buildKernel(oklpath + "ellipticUpdateNBFPCG.okl",
                                    "ellipticUpdate1NBFPCG", dfloatKernelInfo);
+ */
       }
 
       if(!elliptic->blockSolver) {
-        // Not implemented for Quad3D !!!!!
-        if (options.compareArgs("BASIS","BERN")) {
-          sprintf(fileName, DELLIPTIC "/okl/ellipticGradientBB%s.okl", suffix);
-          sprintf(kernelName, "ellipticGradientBB%s", suffix);
-
-          elliptic->gradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+        if (options.compareArgs("BASIS","NODAL")) {
+          filename = oklpath + "ellipticGradient" + suffix + ".okl";
+          kernelName = "ellipticGradient" + suffix;
+          elliptic->gradientKernel = mesh->device.buildKernel(filename.c_str(),kernelName,kernelInfo);
 
-          sprintf(kernelName, "ellipticPartialGradientBB%s", suffix);
+          kernelName = "ellipticPartialGradient" + suffix;
           elliptic->partialGradientKernel =
-            mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+            mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
 /*
-          sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdgBB%s.okl", suffix);
-          sprintf(kernelName, "ellipticAxIpdgBB%s", suffix);
-          elliptic->ipdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-
-          sprintf(kernelName, "ellipticPartialAxIpdgBB%s", suffix);
-          elliptic->partialIpdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-*/
-        } else if (options.compareArgs("BASIS","NODAL")) {
-          sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix);
-          sprintf(kernelName, "ellipticGradient%s", suffix);
-
-          elliptic->gradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-
-          sprintf(kernelName, "ellipticPartialGradient%s", suffix);
-          elliptic->partialGradientKernel =
-            mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-/*
-          sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdg%s.okl", suffix);
+          sprintf(filename.c_str(), oklpath + "ellipticAxIpdg%s.okl", suffix);
           sprintf(kernelName, "ellipticAxIpdg%s", suffix);
-          elliptic->ipdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+          elliptic->ipdgKernel = mesh->device.buildKernel(filename.c_str(),kernelName,kernelInfo);
 
           sprintf(kernelName, "ellipticPartialAxIpdg%s", suffix);
-          elliptic->partialIpdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-*/
-        }
-
-        // Use the same kernel with quads for the following kenels
-        if(elliptic->dim == 3) {
-          if(elliptic->elementType == QUADRILATERALS)
-            suffix = strdup("Quad2D");
-          else if(elliptic->elementType == TRIANGLES)
-            suffix = strdup("Tri2D");
+          elliptic->partialIpdgKernel = mesh->device.buildKernel(filename.c_str(),kernelName,kernelInfo);
+ */
         }
 
-        sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix);
-        sprintf(kernelName, "ellipticPreconCoarsen%s", suffix);
-        elliptic->precon->coarsenKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+        filename = oklpath + "ellipticPreconCoarsen" + suffix + ".okl";
+        kernelName = "ellipticPreconCoarsen" + suffix;
+        elliptic->precon->coarsenKernel = mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
 
-        sprintf(fileName, DELLIPTIC "/okl/ellipticPreconProlongate%s.okl", suffix);
-        sprintf(kernelName, "ellipticPreconProlongate%s", suffix);
+        filename = oklpath + "ellipticPreconProlongate" + suffix + ".okl";
+        kernelName = "ellipticPreconProlongate" + suffix;
         elliptic->precon->prolongateKernel =
-          mesh->device.buildKernel(fileName,kernelName,kernelInfo);
+          mesh->device.buildKernel(filename.c_str(),kernelName.c_str(),kernelInfo);
 
-        sprintf(fileName, DELLIPTIC "/okl/ellipticBlockJacobiPrecon.okl");
-        sprintf(kernelName, "ellipticBlockJacobiPrecon");
-        elliptic->precon->blockJacobiKernel = mesh->device.buildKernel(fileName,
-                                                                       kernelName,
+        filename = oklpath + "ellipticBlockJacobiPrecon.okl";
+        kernelName = "ellipticBlockJacobiPrecon";
+        elliptic->precon->blockJacobiKernel = mesh->device.buildKernel(filename.c_str(),
+                                                                       kernelName.c_str(),
                                                                        kernelInfo);
 
-        sprintf(kernelName, "ellipticPartialBlockJacobiPrecon");
-        elliptic->precon->partialblockJacobiKernel = mesh->device.buildKernel(fileName,
-                                                                              kernelName,
+        kernelName = "ellipticPartialBlockJacobiPrecon";
+        elliptic->precon->partialblockJacobiKernel = mesh->device.buildKernel(filename.c_str(),
+                                                                              kernelName.c_str(),
                                                                               kernelInfo);
 
-        sprintf(fileName, DELLIPTIC "/okl/ellipticPatchSolver.okl");
-        sprintf(kernelName, "ellipticApproxBlockJacobiSolver");
-        elliptic->precon->approxBlockJacobiSolverKernel = mesh->device.buildKernel(fileName,
-                                                                                   kernelName,
+        filename = oklpath + "ellipticPatchSolver.okl";
+        kernelName = "ellipticApproxBlockJacobiSolver";
+        elliptic->precon->approxBlockJacobiSolverKernel = mesh->device.buildKernel(filename.c_str(),
+                                                                                   kernelName.c_str(),
                                                                                    kernelInfo);
-
-        if (   elliptic->elementType == TRIANGLES
-               || elliptic->elementType == TETRAHEDRA) {
-          elliptic->precon->SEMFEMInterpKernel =
-            mesh->device.buildKernel(DELLIPTIC "/okl/ellipticSEMFEMInterp.okl",
-                                     "ellipticSEMFEMInterp",
-                                     kernelInfo);
-
-          elliptic->precon->SEMFEMAnterpKernel =
-            mesh->device.buildKernel(DELLIPTIC "/okl/ellipticSEMFEMAnterp.okl",
-                                     "ellipticSEMFEMAnterp",
-                                     kernelInfo);
-        }
       }
     }
 
@@ -1031,7 +1001,8 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
   }
 
   MPI_Barrier(mesh->comm);
-  if(mesh->rank == 0)  printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); fflush(stdout);
+  if(mesh->rank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel);
+  fflush(stdout);
 
   if(elliptic->blockSolver) {
     elliptic->nullProjectBlockWeightGlobal = (dfloat*)calloc(elliptic->Nfields, sizeof(dfloat));
@@ -1059,7 +1030,6 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
       elliptic->nullProjectBlockWeightGlobal[fld] = 1.0 / nullProjectWeightGlobal;
     }
   }else{
-    // TW: WARNING C0 appropriate only
     mesh->sumKernel(mesh->Nelements * mesh->Np, elliptic->o_invDegree, elliptic->o_tmp);
     elliptic->o_tmp.copyTo(elliptic->tmp);
 
@@ -1084,10 +1054,10 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
   if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT;
   if(options.compareArgs("THREAD MODEL", "OPENMP")) oogsMode = OOGS_DEFAULT;
   auto callback = [&]() // hardwired to FP64 variable coeff
-    {
-      ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList,
-                 elliptic->o_p, elliptic->o_Ap, dfloatString);
-    };
+                  {
+                    ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList,
+                               elliptic->o_p, elliptic->o_Ap, dfloatString);
+                  };
   elliptic->oogs = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, NULL, oogsMode);
   elliptic->oogsAx = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, callback, oogsMode);
 
@@ -1099,39 +1069,11 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties kernelInfo)
 
   if(options.compareArgs("RESIDUAL PROJECTION","TRUE")) {
     dlong nVecsProject = 8;
-    try {
-      nVecsProject = static_cast < dlong > (std::stoi(options.getArgs(
-                                                        "RESIDUAL PROJECTION VECTORS")));
-    } catch(std::invalid_argument& e) {
-      if(elliptic->mesh->rank == 0) {
-        std::cout << "Encountered invalid argument when getting RESIDUAL PROJECTION VECTORS!\n";
-        std::cout << e.what();
-      }
-      exit(-1);
-    } catch (std::out_of_range& e) {
-      if(elliptic->mesh->rank == 0) {
-        std::cout << "Encountered out_of_range error when getting RESIDUAL PROJECTION VECTORS!\n";
-        std::cout << e.what();
-      }
-      exit(-1);
-    }
+    options.getArgs("RESIDUAL PROJECTION VECTORS", nVecsProject);
+
     dlong nStepsStart = 5;
-    try {
-      nStepsStart = static_cast < dlong > (std::stoi(options.getArgs("RESIDUAL PROJECTION START")));
-    } catch(std::invalid_argument& e) {
-      if(elliptic->mesh->rank == 0) {
-        std::cout << "Encountered invalid argument when getting RESIDUAL PROJECTION START!\n";
-        std::cout << e.what();
-      }
-      exit(-1);
-    } catch (std::out_of_range& e) {
-      if(elliptic->mesh->rank == 0) {
-        std::cout << "Encountered out_of_range error when getting RESIDUAL PROJECTION START!\n";
-        std::cout << e.what();
-      }
-      exit(-1);
-    }
-    elliptic->residualProjection = new ResidualProjection(* elliptic, nVecsProject, nStepsStart);
-  }
+    options.getArgs("RESIDUAL PROJECTION START", nStepsStart);
 
+    elliptic->residualProjection = new ResidualProjection(*elliptic, nVecsProject, nStepsStart);
+  }
 }
diff --git a/src/libP/solvers/elliptic/src/ellipticThinOas.c b/src/elliptic/ellipticThinOas.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/ellipticThinOas.c
rename to src/elliptic/ellipticThinOas.cpp
diff --git a/src/libP/solvers/elliptic/src/ellipticThinOasSetup.c b/src/elliptic/ellipticThinOasSetup.cpp
similarity index 99%
rename from src/libP/solvers/elliptic/src/ellipticThinOasSetup.c
rename to src/elliptic/ellipticThinOasSetup.cpp
index 94940a418..d216f44cc 100644
--- a/src/libP/solvers/elliptic/src/ellipticThinOasSetup.c
+++ b/src/elliptic/ellipticThinOasSetup.cpp
@@ -219,7 +219,6 @@ void ellipticThinOasSetup(elliptic_t* elliptic)
         }
       }
 
-
   // --------------------------------------------------------------------------------------------------------------
   // 4. construct diagonal scaling for fast diagonal inverse
   // --------------------------------------------------------------------------------------------------------------
diff --git a/src/libP/solvers/elliptic/src/ellipticUpdateNBFPCG.c b/src/elliptic/ellipticUpdateNBFPCG.cpp
similarity index 84%
rename from src/libP/solvers/elliptic/src/ellipticUpdateNBFPCG.c
rename to src/elliptic/ellipticUpdateNBFPCG.cpp
index 1a1186b2a..182de55bd 100644
--- a/src/libP/solvers/elliptic/src/ellipticUpdateNBFPCG.c
+++ b/src/elliptic/ellipticUpdateNBFPCG.cpp
@@ -130,37 +130,37 @@ void ellipticSerialUpdate0NBFPCG(const int Nq, const hlong Nelements, int useWei
 
   switch(Nq) {
   case  2: ellipticSerialUpdate0NBFPCGKernel <  2 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  3: ellipticSerialUpdate0NBFPCGKernel <  3 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  4: ellipticSerialUpdate0NBFPCGKernel <  4 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  5: ellipticSerialUpdate0NBFPCGKernel <  5 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  6: ellipticSerialUpdate0NBFPCGKernel <  6 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  7: ellipticSerialUpdate0NBFPCGKernel <  7 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  8: ellipticSerialUpdate0NBFPCGKernel <  8 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  9: ellipticSerialUpdate0NBFPCGKernel <  9 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case 10: ellipticSerialUpdate0NBFPCGKernel < 10 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case 11: ellipticSerialUpdate0NBFPCGKernel < 11 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case 12: ellipticSerialUpdate0NBFPCGKernel < 12 >
-    (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   }
 }
@@ -188,37 +188,37 @@ void ellipticBlockSerialUpdate0NBFPCG(const int Nfields,
 
   switch(Nq) {
   case  2: ellipticBlockSerialUpdate0NBFPCGKernel <  2 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  3: ellipticBlockSerialUpdate0NBFPCGKernel <  3 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  4: ellipticBlockSerialUpdate0NBFPCGKernel <  4 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  5: ellipticBlockSerialUpdate0NBFPCGKernel <  5 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  6: ellipticBlockSerialUpdate0NBFPCGKernel <  6 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  7: ellipticBlockSerialUpdate0NBFPCGKernel <  7 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  8: ellipticBlockSerialUpdate0NBFPCGKernel <  8 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case  9: ellipticBlockSerialUpdate0NBFPCGKernel <  9 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case 10: ellipticBlockSerialUpdate0NBFPCGKernel < 10 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case 11: ellipticBlockSerialUpdate0NBFPCGKernel < 11 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   case 12: ellipticBlockSerialUpdate0NBFPCGKernel < 12 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_u, cpu_r, cpu_w, localdots);
     break;
   }
 }
@@ -265,7 +265,6 @@ void ellipticNonBlockingUpdate0NBFPCG(elliptic_t* elliptic,
                                   o_r,
                                   o_w,
                                   localdots);
-
   }else {
     // (u.r)
     // (u.w)
@@ -417,48 +416,48 @@ void ellipticSerialUpdate1NBFPCG(const int Nq,
 
   switch(Nq) {
   case  2: ellipticSerialUpdate1NBFPCGKernel <  2 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case  3: ellipticSerialUpdate1NBFPCGKernel <  3 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case  4: ellipticSerialUpdate1NBFPCGKernel <  4 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case  5: ellipticSerialUpdate1NBFPCGKernel <  5 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case  6: ellipticSerialUpdate1NBFPCGKernel <  6 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case  7: ellipticSerialUpdate1NBFPCGKernel <  7 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case  8: ellipticSerialUpdate1NBFPCGKernel <  8 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case  9: ellipticSerialUpdate1NBFPCGKernel <  9 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case 10: ellipticSerialUpdate1NBFPCGKernel < 10 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case 11: ellipticSerialUpdate1NBFPCGKernel < 11 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   case 12: ellipticSerialUpdate1NBFPCGKernel < 12 >
-    (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
-     cpu_w, localdots);
+      (Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x, cpu_r, cpu_u,
+      cpu_w, localdots);
     break;
   }
 }
@@ -578,48 +577,48 @@ void ellipticBlockSerialUpdate1NBFPCG(const int Nfields,
 
   switch(Nq) {
   case  2: ellipticBlockSerialUpdate1NBFPCGKernel <  2 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case  3: ellipticBlockSerialUpdate1NBFPCGKernel <  3 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case  4: ellipticBlockSerialUpdate1NBFPCGKernel <  4 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case  5: ellipticBlockSerialUpdate1NBFPCGKernel <  5 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case  6: ellipticBlockSerialUpdate1NBFPCGKernel <  6 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case  7: ellipticBlockSerialUpdate1NBFPCGKernel <  7 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case  8: ellipticBlockSerialUpdate1NBFPCGKernel <  8 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case  9: ellipticBlockSerialUpdate1NBFPCGKernel <  9 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case 10: ellipticBlockSerialUpdate1NBFPCGKernel < 10 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case 11: ellipticBlockSerialUpdate1NBFPCGKernel < 11 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   case 12: ellipticBlockSerialUpdate1NBFPCGKernel < 12 >
-    (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
-     cpu_r, cpu_u, cpu_w, localdots);
+      (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_p, cpu_s, cpu_q, cpu_z, alpha, cpu_x,
+      cpu_r, cpu_u, cpu_w, localdots);
     break;
   }
 }
diff --git a/src/libP/solvers/elliptic/src/ellipticUpdateNBPCG.c b/src/elliptic/ellipticUpdateNBPCG.cpp
similarity index 82%
rename from src/libP/solvers/elliptic/src/ellipticUpdateNBPCG.c
rename to src/elliptic/ellipticUpdateNBPCG.cpp
index 0e0fb5179..49f1cf993 100644
--- a/src/libP/solvers/elliptic/src/ellipticUpdateNBPCG.c
+++ b/src/elliptic/ellipticUpdateNBPCG.cpp
@@ -145,37 +145,37 @@ dfloat ellipticSerialUpdate1NBPCG(const int Nq,
 
   switch(Nq) {
   case  2: pdots = ellipticSerialUpdate1NBPCGKernel <  2 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case  3: pdots = ellipticSerialUpdate1NBPCGKernel <  3 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case  4: pdots = ellipticSerialUpdate1NBPCGKernel <  4 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case  5: pdots = ellipticSerialUpdate1NBPCGKernel <  5 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case  6: pdots = ellipticSerialUpdate1NBPCGKernel <  6 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case  7: pdots = ellipticSerialUpdate1NBPCGKernel <  7 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case  8: pdots = ellipticSerialUpdate1NBPCGKernel <  8 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case  9: pdots = ellipticSerialUpdate1NBPCGKernel <  9 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case 10: pdots = ellipticSerialUpdate1NBPCGKernel < 10 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case 11: pdots = ellipticSerialUpdate1NBPCGKernel < 11 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   case 12: pdots = ellipticSerialUpdate1NBPCGKernel < 12 >
-                   (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
+                     (Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p, cpu_s);
     break;
   }
 
@@ -211,48 +211,48 @@ dfloat ellipticBlockSerialUpdate1NBPCG(const int Nfields,
 
   switch(Nq) {
   case  2: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  2 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case  3: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  3 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case  4: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  4 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case  5: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  5 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case  6: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  6 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case  7: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  7 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case  8: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  8 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case  9: pdots = ellipticBlockSerialUpdate1NBPCGKernel <  9 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case 10: pdots = ellipticBlockSerialUpdate1NBPCGKernel < 10 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case 11: pdots = ellipticBlockSerialUpdate1NBPCGKernel < 11 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   case 12: pdots = ellipticBlockSerialUpdate1NBPCGKernel < 12 >
-                   (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
-                    cpu_s);
+                     (Nfields, offset, Nelements, useWeight, cpu_invDegree, cpu_z, cpu_Z, beta, cpu_p,
+                     cpu_s);
     break;
   }
 
@@ -469,37 +469,37 @@ void ellipticSerialUpdate2NBPCG(const int Nq,
 
   switch(Nq) {
   case  2: ellipticSerialUpdate2NBPCGKernel <  2 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case  3: ellipticSerialUpdate2NBPCGKernel <  3 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case  4: ellipticSerialUpdate2NBPCGKernel <  4 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case  5: ellipticSerialUpdate2NBPCGKernel <  5 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case  6: ellipticSerialUpdate2NBPCGKernel <  6 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case  7: ellipticSerialUpdate2NBPCGKernel <  7 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case  8: ellipticSerialUpdate2NBPCGKernel <  8 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case  9: ellipticSerialUpdate2NBPCGKernel <  9 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case 10: ellipticSerialUpdate2NBPCGKernel < 10 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case 11: ellipticSerialUpdate2NBPCGKernel < 11 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   case 12: ellipticSerialUpdate2NBPCGKernel < 12 >
-    (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
+      (Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z, localdots);
     break;
   }
 }
@@ -533,48 +533,48 @@ void ellipticBlockSerialUpdate2NBPCG(const int Nfields,
 
   switch(Nq) {
   case  2: ellipticBlockSerialUpdate2NBPCGKernel <  2 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case  3: ellipticBlockSerialUpdate2NBPCGKernel <  3 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case  4: ellipticBlockSerialUpdate2NBPCGKernel <  4 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case  5: ellipticBlockSerialUpdate2NBPCGKernel <  5 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case  6: ellipticBlockSerialUpdate2NBPCGKernel <  6 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case  7: ellipticBlockSerialUpdate2NBPCGKernel <  7 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case  8: ellipticBlockSerialUpdate2NBPCGKernel <  8 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case  9: ellipticBlockSerialUpdate2NBPCGKernel <  9 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case 10: ellipticBlockSerialUpdate2NBPCGKernel < 10 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case 11: ellipticBlockSerialUpdate2NBPCGKernel < 11 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   case 12: ellipticBlockSerialUpdate2NBPCGKernel < 12 >
-    (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
-     localdots);
+      (Nfields, offset, Nelements, useWeight,  cpu_invDegree,cpu_s, cpu_S, alpha, cpu_r, cpu_z,
+      localdots);
     break;
   }
 }
@@ -598,7 +598,6 @@ void ellipticNonBlockingUpdate2NBPCG(elliptic_t* elliptic,
   if(options.compareArgs("FIXED ITERATION COUNT", "TRUE"))
     fixedIterationCountFlag = 1;
 
-
   mesh_t* mesh = elliptic->mesh;
   const dlong Nlocal = mesh->Np * mesh->Nelements;
 
@@ -621,7 +620,6 @@ void ellipticNonBlockingUpdate2NBPCG(elliptic_t* elliptic,
                                  elliptic->o_invDegree,
                                  o_s, o_S, alpha, o_r, o_z,
                                  localdots);
-
   }else {
     // r <= r - alpha*s
     // z <= z - alpha*S
diff --git a/src/libP/solvers/elliptic/src/ellipticUpdatePCG.c b/src/elliptic/ellipticUpdatePCG.cpp
similarity index 98%
rename from src/libP/solvers/elliptic/src/ellipticUpdatePCG.c
rename to src/elliptic/ellipticUpdatePCG.cpp
index d34bc7e10..d16199760 100644
--- a/src/libP/solvers/elliptic/src/ellipticUpdatePCG.c
+++ b/src/elliptic/ellipticUpdatePCG.cpp
@@ -58,7 +58,7 @@ dfloat ellipticUpdatePCG(elliptic_t* elliptic,
                               elliptic->o_tmpNormr);
 
 #ifdef ELLIPTIC_ENABLE_TIMER
-  timer::tic("dotp",1);
+    timer::tic("dotp",1);
 #endif
     elliptic->o_tmpNormr.copyTo(&rdotr1, sizeof(dfloat));
     dfloat globalrdotr1 = 0;
@@ -67,7 +67,7 @@ dfloat ellipticUpdatePCG(elliptic_t* elliptic,
     else
       globalrdotr1 = 1;
 #ifdef ELLIPTIC_ENABLE_TIMER
-  timer::toc("dotp");
+    timer::toc("dotp");
 #endif
 
     return globalrdotr1;
diff --git a/src/libP/solvers/elliptic/src/ellipticVectors.c b/src/elliptic/ellipticVectors.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/ellipticVectors.c
rename to src/elliptic/ellipticVectors.cpp
diff --git a/src/libP/solvers/elliptic/src/ellipticWeightedInnerProduct.c b/src/elliptic/ellipticWeightedInnerProduct.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/ellipticWeightedInnerProduct.c
rename to src/elliptic/ellipticWeightedInnerProduct.cpp
diff --git a/src/libP/solvers/elliptic/src/ellipticWeightedNorm2.c b/src/elliptic/ellipticWeightedNorm2.cpp
similarity index 99%
rename from src/libP/solvers/elliptic/src/ellipticWeightedNorm2.c
rename to src/elliptic/ellipticWeightedNorm2.cpp
index 798c76723..4d74c0371 100644
--- a/src/libP/solvers/elliptic/src/ellipticWeightedNorm2.c
+++ b/src/elliptic/ellipticWeightedNorm2.cpp
@@ -56,7 +56,7 @@ dfloat ellipticWeightedNorm2(elliptic_t* elliptic, occa::memory &o_w, occa::memo
       elliptic->weightedNorm2Kernel(Nlocal, elliptic->Ntotal, o_w, o_a, o_tmp);
     else
       elliptic->weightedNorm2Kernel(Nlocal, o_w, o_a, o_tmp);
-  }else  {
+  }else {
     elliptic->innerProductKernel(Nlocal, o_a, o_a, o_tmp);
   }
 
@@ -70,7 +70,7 @@ dfloat ellipticWeightedNorm2(elliptic_t* elliptic, occa::memory &o_w, occa::memo
       mesh->sumKernel(Nblock, o_tmp, o_tmp2);
       o_tmp2.copyTo(tmp);
       Nfinal = Nblock2;
-    }else  {
+    }else {
       o_tmp.copyTo(tmp);
       Nfinal = Nblock;
     }
@@ -78,7 +78,6 @@ dfloat ellipticWeightedNorm2(elliptic_t* elliptic, occa::memory &o_w, occa::memo
     wa2 = 0;
     for(dlong n = 0; n < Nfinal; ++n)
       wa2 += tmp[n];
-
   }
 
   dfloat globalwa2 = 0;
diff --git a/src/libP/solvers/elliptic/src/ellipticZeroMean.c b/src/elliptic/ellipticZeroMean.cpp
similarity index 97%
rename from src/libP/solvers/elliptic/src/ellipticZeroMean.c
rename to src/elliptic/ellipticZeroMean.cpp
index 0b8ca7a4e..c0051e8a4 100644
--- a/src/libP/solvers/elliptic/src/ellipticZeroMean.c
+++ b/src/elliptic/ellipticZeroMean.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "elliptic.h"
-#include "ogsInterface.h"
+//#include "ogsInterface.h"
 
 #define USE_WEIGHTED 1
 
@@ -80,7 +80,7 @@ void ellipticZeroMean(elliptic_t* elliptic, occa::memory &o_q)
 #endif
 
 #ifdef ELLIPTIC_ENABLE_TIMER
-  timer::tic("dotp",1);
+    timer::tic("dotp",1);
 #endif
     o_tmp.copyTo(tmp);
 
@@ -92,7 +92,7 @@ void ellipticZeroMean(elliptic_t* elliptic, occa::memory &o_q)
     // globalize reduction
     MPI_Allreduce(&qmeanLocal, &qmeanGlobal, 1, MPI_DFLOAT, MPI_SUM, mesh->comm);
 #ifdef ELLIPTIC_ENABLE_TIMER
-  timer::toc("dotp");
+    timer::toc("dotp");
 #endif
 
     // normalize
diff --git a/src/libP/parAlmond/src/SpMV.cpp b/src/elliptic/parAlmond/SpMV.cpp
similarity index 100%
rename from src/libP/parAlmond/src/SpMV.cpp
rename to src/elliptic/parAlmond/SpMV.cpp
diff --git a/src/libP/parAlmond/include/agmg.hpp b/src/elliptic/parAlmond/agmg.hpp
similarity index 100%
rename from src/libP/parAlmond/include/agmg.hpp
rename to src/elliptic/parAlmond/agmg.hpp
diff --git a/src/libP/parAlmond/src/agmgLevel.cpp b/src/elliptic/parAlmond/agmgLevel.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgLevel.cpp
rename to src/elliptic/parAlmond/agmgLevel.cpp
diff --git a/src/libP/parAlmond/src/agmgSetup/adjustPartition.cpp b/src/elliptic/parAlmond/agmgSetup/adjustPartition.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSetup/adjustPartition.cpp
rename to src/elliptic/parAlmond/agmgSetup/adjustPartition.cpp
diff --git a/src/libP/parAlmond/src/agmgSetup/agmgSetup.cpp b/src/elliptic/parAlmond/agmgSetup/agmgSetup.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSetup/agmgSetup.cpp
rename to src/elliptic/parAlmond/agmgSetup/agmgSetup.cpp
diff --git a/src/libP/parAlmond/src/agmgSetup/constructProlongation.cpp b/src/elliptic/parAlmond/agmgSetup/constructProlongation.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSetup/constructProlongation.cpp
rename to src/elliptic/parAlmond/agmgSetup/constructProlongation.cpp
diff --git a/src/libP/parAlmond/src/agmgSetup/formAggregates.cpp b/src/elliptic/parAlmond/agmgSetup/formAggregates.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSetup/formAggregates.cpp
rename to src/elliptic/parAlmond/agmgSetup/formAggregates.cpp
diff --git a/src/libP/parAlmond/src/agmgSetup/galerkinProd.cpp b/src/elliptic/parAlmond/agmgSetup/galerkinProd.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSetup/galerkinProd.cpp
rename to src/elliptic/parAlmond/agmgSetup/galerkinProd.cpp
diff --git a/src/libP/parAlmond/src/agmgSetup/strongGraph.cpp b/src/elliptic/parAlmond/agmgSetup/strongGraph.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSetup/strongGraph.cpp
rename to src/elliptic/parAlmond/agmgSetup/strongGraph.cpp
diff --git a/src/libP/parAlmond/src/agmgSetup/transpose.cpp b/src/elliptic/parAlmond/agmgSetup/transpose.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSetup/transpose.cpp
rename to src/elliptic/parAlmond/agmgSetup/transpose.cpp
diff --git a/src/libP/parAlmond/src/agmgSmoother.cpp b/src/elliptic/parAlmond/agmgSmoother.cpp
similarity index 100%
rename from src/libP/parAlmond/src/agmgSmoother.cpp
rename to src/elliptic/parAlmond/agmgSmoother.cpp
diff --git a/src/libP/parAlmond/include/coarse.hpp b/src/elliptic/parAlmond/coarse.hpp
similarity index 100%
rename from src/libP/parAlmond/include/coarse.hpp
rename to src/elliptic/parAlmond/coarse.hpp
diff --git a/src/libP/parAlmond/src/coarseSolver.cpp b/src/elliptic/parAlmond/coarseSolver.cpp
similarity index 97%
rename from src/libP/parAlmond/src/coarseSolver.cpp
rename to src/elliptic/parAlmond/coarseSolver.cpp
index 8f8f179fb..e883800ed 100644
--- a/src/libP/parAlmond/src/coarseSolver.cpp
+++ b/src/elliptic/parAlmond/coarseSolver.cpp
@@ -57,6 +57,9 @@ void coarseSolver::setup(parCSR *A) {
   MPI_Comm_rank(comm,&rank);
   MPI_Comm_size(comm,&size);
 
+   if(options.compareArgs("BUILD ONLY", "TRUE"))
+    return; // bail early as this will not get used
+
   if(options.compareArgs("PARALMOND SMOOTH COARSEST", "TRUE"))
     return; // bail early as this will not get used
 
@@ -150,13 +153,11 @@ void coarseSolver::setup(parCSR *A) {
 
   coarseCounts = (int*) calloc(size,sizeof(int));
 
-#if USE_NULL_PROJECTION==1
-    if(A->nullSpace){
-      if(rank==0) printf("Current null space handling not available for parAlmond!\n");
-      fflush(stdout);
-      exit(1);
-    }
-#endif
+  if(A->nullSpace){
+    if(rank==0) printf("Current null space handling not available for parAlmond!\n");
+    fflush(stdout);
+    exit(1);
+  }
 
   int sendNNZ = (int) (A->diag->nnz+A->offd->nnz);
   int *rows;
diff --git a/src/libP/parAlmond/hypre/hypre.c b/src/elliptic/parAlmond/crs_hypre.cpp
similarity index 100%
rename from src/libP/parAlmond/hypre/hypre.c
rename to src/elliptic/parAlmond/crs_hypre.cpp
diff --git a/src/libP/parAlmond/hypre/crs_hypre.h b/src/elliptic/parAlmond/crs_hypre.h
similarity index 100%
rename from src/libP/parAlmond/hypre/crs_hypre.h
rename to src/elliptic/parAlmond/crs_hypre.h
diff --git a/src/libP/parAlmond/include/defines.hpp b/src/elliptic/parAlmond/defines.hpp
similarity index 98%
rename from src/libP/parAlmond/include/defines.hpp
rename to src/elliptic/parAlmond/defines.hpp
index 2e0425798..53ce69da9 100644
--- a/src/libP/parAlmond/include/defines.hpp
+++ b/src/elliptic/parAlmond/defines.hpp
@@ -27,7 +27,6 @@ SOFTWARE.
 #ifndef PARALMOND_DEFINES_HPP
 #define PARALMOND_DEFINES_HPP
 
-#define BLOCKSIZE 512
 #define NBLOCKS 128
 
 #define MAX_LEVELS 100
diff --git a/src/libP/parAlmond/src/kernels.cpp b/src/elliptic/parAlmond/kernels.cpp
similarity index 57%
rename from src/libP/parAlmond/src/kernels.cpp
rename to src/elliptic/parAlmond/kernels.cpp
index 1ab89a5be..916508edb 100644
--- a/src/libP/parAlmond/src/kernels.cpp
+++ b/src/elliptic/parAlmond/kernels.cpp
@@ -88,6 +88,12 @@ void buildParAlmondKernels(MPI_Comm comm, occa::device device){
   kernelInfo["defines/" "p_BLOCKSIZE"]= BLOCKSIZE;
 
   if(device.mode()=="OpenCL"){
+    kernelInfo["compiler_flags"] += " -cl-std=CL2.0 ";
+    kernelInfo["compiler_flags"] += " -cl-strict-aliasing ";
+    kernelInfo["compiler_flags"] += " -cl-mad-enable ";
+    kernelInfo["compiler_flags"] += " -cl-no-signed-zeros ";
+    kernelInfo["compiler_flags"] += " -cl-unsafe-math-optimizations ";
+    kernelInfo["compiler_flags"] += " -cl-fast-relaxed-math ";
     //kernelInfo["compiler_flags"] += "-cl-opt-disable";
   }
 
@@ -99,36 +105,69 @@ void buildParAlmondKernels(MPI_Comm comm, occa::device device){
     kernelInfo["compiler_flags"] += " --fmad=true "; // compiler option for cuda
   }
 
+  // Locate the installed OKL sources. getenv() returns NULL when the variable
+  // is unset, and building a std::string from NULL is undefined behavior, so
+  // stop with a clear message instead of crashing later.
+  const char* install_dir_env = getenv("NEKRS_INSTALL_DIR");
+  if (install_dir_env == NULL) {
+    if (rank==0) printf("ERROR: NEKRS_INSTALL_DIR is not set!\n");
+    fflush(stdout);
+    MPI_Abort(comm, 1);
+  }
+  string install_dir;
+  install_dir.assign(install_dir_env);
+
   if (rank==0) printf("Compiling parALMOND Kernels...");fflush(stdout);
 
   for (int r=0;r<2;r++){
     if ((r==0 && rank==0) || (r==1 && rank>0)) {      
+      const string oklpath = install_dir + "/okl/parAlmond/";
+      string filename;
+
+      filename = oklpath + "SpMVcsr.okl";
+      SpMVcsrKernel1  = device.buildKernel(filename.c_str(),  "SpMVcsr1",  kernelInfo);
+      SpMVcsrKernel2  = device.buildKernel(filename.c_str(),  "SpMVcsr2",  kernelInfo);
+
+      filename = oklpath + "SpMVell.okl";
+      SpMVellKernel1  = device.buildKernel(filename.c_str(),  "SpMVell1",  kernelInfo);
+      SpMVellKernel2  = device.buildKernel(filename.c_str(),  "SpMVell2",  kernelInfo);
+
+      filename = oklpath + "SpMVmcsr.okl";
+      SpMVmcsrKernel1 = device.buildKernel(filename.c_str(), "SpMVmcsr1", kernelInfo);
+      SpMVmcsrKernel2 = device.buildKernel(filename.c_str(), "SpMVmcsr2", kernelInfo);
+
+      filename = oklpath + "vectorSet.okl";
+      vectorSetKernel = device.buildKernel(filename.c_str(), "vectorSet", kernelInfo);
+
+      filename = oklpath + "vectorScale.okl";
+      vectorScaleKernel = device.buildKernel(filename.c_str(), "vectorScale", kernelInfo);
+
+      filename = oklpath + "vectorAddScalar.okl";
+      vectorAddScalarKernel = device.buildKernel(filename.c_str(), "vectorAddScalar", kernelInfo);
+
+      filename = oklpath + "vectorAdd.okl";
+      vectorAddKernel1 = device.buildKernel(filename.c_str(), "vectorAdd1", kernelInfo);
+      vectorAddKernel2 = device.buildKernel(filename.c_str(), "vectorAdd2", kernelInfo);
+
+      filename = oklpath + "vectorDotStar.okl";
+      vectorDotStarKernel1 = device.buildKernel(filename.c_str(), "vectorDotStar1", kernelInfo);
+      vectorDotStarKernel2 = device.buildKernel(filename.c_str(), "vectorDotStar2", kernelInfo);
+
+      filename = oklpath + "vectorInnerProd.okl";
+      vectorInnerProdKernel = device.buildKernel(filename.c_str(), "vectorInnerProd", kernelInfo);
+
+      filename = oklpath + "vectorAddInnerProd.okl";
+      vectorAddInnerProdKernel = device.buildKernel(filename.c_str(), "vectorAddInnerProd", kernelInfo);
+      vectorAddWeightedInnerProdKernel = device.buildKernel(filename.c_str(), "vectorAddWeightedInnerProd", kernelInfo);
+
+      filename = oklpath + "kcycleCombinedOp.okl";
+      kcycleCombinedOp1Kernel = device.buildKernel(filename.c_str(), "kcycleCombinedOp1", kernelInfo);
+      kcycleCombinedOp2Kernel = device.buildKernel(filename.c_str(), "kcycleCombinedOp2", kernelInfo);
+      kcycleWeightedCombinedOp1Kernel = device.buildKernel(filename.c_str(), "kcycleWeightedCombinedOp1", kernelInfo);
+      kcycleWeightedCombinedOp2Kernel = device.buildKernel(filename.c_str(), "kcycleWeightedCombinedOp2", kernelInfo);
 
-      SpMVcsrKernel1  = device.buildKernel(DPARALMOND"/okl/SpMVcsr.okl",  "SpMVcsr1",  kernelInfo);
-      SpMVcsrKernel2  = device.buildKernel(DPARALMOND"/okl/SpMVcsr.okl",  "SpMVcsr2",  kernelInfo);
-      SpMVellKernel1  = device.buildKernel(DPARALMOND"/okl/SpMVell.okl",  "SpMVell1",  kernelInfo);
-      SpMVellKernel2  = device.buildKernel(DPARALMOND"/okl/SpMVell.okl",  "SpMVell2",  kernelInfo);
-      SpMVmcsrKernel1 = device.buildKernel(DPARALMOND"/okl/SpMVmcsr.okl", "SpMVmcsr1", kernelInfo);
-      SpMVmcsrKernel2 = device.buildKernel(DPARALMOND"/okl/SpMVmcsr.okl", "SpMVmcsr2", kernelInfo);
-
-      vectorSetKernel = device.buildKernel(DPARALMOND"/okl/vectorSet.okl", "vectorSet", kernelInfo);
-      vectorScaleKernel = device.buildKernel(DPARALMOND"/okl/vectorScale.okl", "vectorScale", kernelInfo);
-      vectorAddScalarKernel = device.buildKernel(DPARALMOND"/okl/vectorAddScalar.okl", "vectorAddScalar", kernelInfo);
-      vectorAddKernel1 = device.buildKernel(DPARALMOND"/okl/vectorAdd.okl", "vectorAdd1", kernelInfo);
-      vectorAddKernel2 = device.buildKernel(DPARALMOND"/okl/vectorAdd.okl", "vectorAdd2", kernelInfo);
-      vectorDotStarKernel1 = device.buildKernel(DPARALMOND"/okl/vectorDotStar.okl", "vectorDotStar1", kernelInfo);
-      vectorDotStarKernel2 = device.buildKernel(DPARALMOND"/okl/vectorDotStar.okl", "vectorDotStar2", kernelInfo);
-      vectorInnerProdKernel = device.buildKernel(DPARALMOND"/okl/vectorInnerProd.okl", "vectorInnerProd", kernelInfo);
-
-      vectorAddInnerProdKernel = device.buildKernel(DPARALMOND"/okl/vectorAddInnerProd.okl", "vectorAddInnerProd", kernelInfo);
-      vectorAddWeightedInnerProdKernel = device.buildKernel(DPARALMOND"/okl/vectorAddInnerProd.okl", "vectorAddWeightedInnerProd", kernelInfo);
-
-      kcycleCombinedOp1Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp1", kernelInfo);
-      kcycleCombinedOp2Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp2", kernelInfo);
-      kcycleWeightedCombinedOp1Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleWeightedCombinedOp1", kernelInfo);
-      kcycleWeightedCombinedOp2Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleWeightedCombinedOp2", kernelInfo);
-
-      haloExtractKernel = device.buildKernel(DPARALMOND"/okl/haloExtract.okl", "haloExtract", kernelInfo);
+      filename = oklpath + "haloExtract.okl";
+      haloExtractKernel = device.buildKernel(filename.c_str(), "haloExtract", kernelInfo);
     }
     MPI_Barrier(comm);
   }
diff --git a/src/libP/parAlmond/include/kernels.hpp b/src/elliptic/parAlmond/kernels.hpp
similarity index 100%
rename from src/libP/parAlmond/include/kernels.hpp
rename to src/elliptic/parAlmond/kernels.hpp
diff --git a/src/libP/parAlmond/src/level.cpp b/src/elliptic/parAlmond/level.cpp
similarity index 100%
rename from src/libP/parAlmond/src/level.cpp
rename to src/elliptic/parAlmond/level.cpp
diff --git a/src/libP/parAlmond/include/level.hpp b/src/elliptic/parAlmond/level.hpp
similarity index 100%
rename from src/libP/parAlmond/include/level.hpp
rename to src/elliptic/parAlmond/level.hpp
diff --git a/src/libP/parAlmond/src/matrix.cpp b/src/elliptic/parAlmond/matrix.cpp
similarity index 100%
rename from src/libP/parAlmond/src/matrix.cpp
rename to src/elliptic/parAlmond/matrix.cpp
diff --git a/src/libP/parAlmond/include/matrix.hpp b/src/elliptic/parAlmond/matrix.hpp
similarity index 100%
rename from src/libP/parAlmond/include/matrix.hpp
rename to src/elliptic/parAlmond/matrix.hpp
diff --git a/src/libP/parAlmond/src/multigrid.cpp b/src/elliptic/parAlmond/multigrid.cpp
similarity index 100%
rename from src/libP/parAlmond/src/multigrid.cpp
rename to src/elliptic/parAlmond/multigrid.cpp
diff --git a/src/libP/parAlmond/src/parAlmond.cpp b/src/elliptic/parAlmond/parAlmond.cpp
similarity index 100%
rename from src/libP/parAlmond/src/parAlmond.cpp
rename to src/elliptic/parAlmond/parAlmond.cpp
diff --git a/src/libP/parAlmond/parAlmond.hpp b/src/elliptic/parAlmond/parAlmond.hpp
similarity index 82%
rename from src/libP/parAlmond/parAlmond.hpp
rename to src/elliptic/parAlmond/parAlmond.hpp
index d80aca45e..903ed19ee 100644
--- a/src/libP/parAlmond/parAlmond.hpp
+++ b/src/elliptic/parAlmond/parAlmond.hpp
@@ -29,22 +29,18 @@ SOFTWARE.
 
 #include <math.h>
 #include <stdlib.h>
-#include <occa.hpp>
-
-#include "mpi.h"
-#include "types.h"
-#include "ogs.hpp"
-#include "setupAide.hpp"
-
-#include "include/defines.hpp"
-#include "include/utils.hpp"
-#include "include/kernels.hpp"
-#include "include/vector.hpp"
-#include "include/matrix.hpp"
-#include "include/level.hpp"
-#include "include/agmg.hpp"
-#include "include/coarse.hpp"
-#include "include/solver.hpp"
+
+#include "nrssys.hpp"
+
+#include "defines.hpp"
+#include "utils.hpp"
+#include "kernels.hpp"
+#include "vector.hpp"
+#include "matrix.hpp"
+#include "level.hpp"
+#include "agmg.hpp"
+#include "coarse.hpp"
+#include "solver.hpp"
 
 
 namespace parAlmond {
diff --git a/src/libP/parAlmond/src/pcg.cpp b/src/elliptic/parAlmond/pcg.cpp
similarity index 100%
rename from src/libP/parAlmond/src/pcg.cpp
rename to src/elliptic/parAlmond/pcg.cpp
diff --git a/src/libP/parAlmond/src/pgmres.cpp b/src/elliptic/parAlmond/pgmres.cpp
similarity index 100%
rename from src/libP/parAlmond/src/pgmres.cpp
rename to src/elliptic/parAlmond/pgmres.cpp
diff --git a/src/libP/parAlmond/src/solver.cpp b/src/elliptic/parAlmond/solver.cpp
similarity index 100%
rename from src/libP/parAlmond/src/solver.cpp
rename to src/elliptic/parAlmond/solver.cpp
diff --git a/src/libP/parAlmond/include/solver.hpp b/src/elliptic/parAlmond/solver.hpp
similarity index 100%
rename from src/libP/parAlmond/include/solver.hpp
rename to src/elliptic/parAlmond/solver.hpp
diff --git a/src/libP/parAlmond/src/timer.cpp b/src/elliptic/parAlmond/timer.cpp
similarity index 100%
rename from src/libP/parAlmond/src/timer.cpp
rename to src/elliptic/parAlmond/timer.cpp
diff --git a/src/libP/parAlmond/src/utils.cpp b/src/elliptic/parAlmond/utils.cpp
similarity index 100%
rename from src/libP/parAlmond/src/utils.cpp
rename to src/elliptic/parAlmond/utils.cpp
diff --git a/src/libP/parAlmond/include/utils.hpp b/src/elliptic/parAlmond/utils.hpp
similarity index 100%
rename from src/libP/parAlmond/include/utils.hpp
rename to src/elliptic/parAlmond/utils.hpp
diff --git a/src/libP/parAlmond/src/vector.cpp b/src/elliptic/parAlmond/vector.cpp
similarity index 100%
rename from src/libP/parAlmond/src/vector.cpp
rename to src/elliptic/parAlmond/vector.cpp
diff --git a/src/libP/parAlmond/include/vector.hpp b/src/elliptic/parAlmond/vector.hpp
similarity index 100%
rename from src/libP/parAlmond/include/vector.hpp
rename to src/elliptic/parAlmond/vector.hpp
diff --git a/src/io/io.hpp b/src/io/io.hpp
new file mode 100644
index 000000000..95f37265a
--- /dev/null
+++ b/src/io/io.hpp
@@ -0,0 +1,20 @@
+#if !defined(nekrs_io_hpp_)
+#define nekrs_io_hpp_
+
+#include "nrs.hpp"
+
+// Write nrs->o_U, nrs->o_P and, when nrs->Nscalar is non-zero,
+// nrs->cds->o_S at time t. Same as writeFld(nrs, t, 0).
+void writeFld(nrs_t *nrs, dfloat t);
+
+// As above, with FP64 handed on to nek_outfld()
+// (presumably selects double-precision output - confirm).
+void writeFld(nrs_t *nrs, dfloat t, int FP64);
+
+// Pass-through to nek_outfld(): every argument is forwarded unchanged,
+// so the caller chooses the suffix, the buffers and the scalar count.
+void writeFld(const char* suffix, dfloat t, int coords, int FP64,
+              occa::memory &o_u, occa::memory &o_p, occa::memory &o_s,
+              int NSfields);
+
+#endif
diff --git a/src/io/writeFld.cpp b/src/io/writeFld.cpp
new file mode 100644
index 000000000..39d651dcb
--- /dev/null
+++ b/src/io/writeFld.cpp
@@ -0,0 +1,32 @@
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
+
+// Thin pass-through: hands every argument to nek_outfld() unchanged.
+void writeFld(const char* suffix, dfloat t, int coords, int FP64,
+              occa::memory &o_u, occa::memory &o_p, occa::memory &o_s,
+              int NSfields)
+{
+  nek_outfld(suffix, t, coords, FP64, o_u, o_p, o_s, NSfields);
+}
+
+// Write nrs->o_U and nrs->o_P at time t, always with coords = 1 and a
+// three-blank suffix. When nrs->Nscalar is non-zero the scalar buffer
+// nrs->cds->o_S is written too; otherwise an empty occa::memory and a
+// scalar count of 0 are passed.
+void writeFld(nrs_t *nrs, dfloat t, int FP64)
+{
+  const int withCoords = 1;
+  const bool hasScalars = (nrs->Nscalar != 0);
+
+  occa::memory o_scalars;
+  if(hasScalars) o_scalars = nrs->cds->o_S;
+  const int nScalars = hasScalars ? nrs->Nscalar : 0;
+
+  writeFld("   ", t, withCoords, FP64, nrs->o_U, nrs->o_P, o_scalars, nScalars);
+}
+
+// Convenience overload: same as above with FP64 = 0.
+void writeFld(nrs_t *nrs, dfloat t)
+{
+  writeFld(nrs, t, 0);
+}
diff --git a/src/nekrs.cpp b/src/lib/nekrs.cpp
similarity index 57%
rename from src/nekrs.cpp
rename to src/lib/nekrs.cpp
index 1657eb910..52c85372d 100644
--- a/src/nekrs.cpp
+++ b/src/lib/nekrs.cpp
@@ -1,6 +1,6 @@
 #include "nrs.hpp"
 #include "meshSetup.hpp"
-#include "insSetup.hpp"
+#include "setup.hpp"
 #include "nekInterfaceAdapter.hpp"
 #include "udf.hpp"
 #include "parReader.hpp"
@@ -10,16 +10,23 @@
 static int rank, size;
 static MPI_Comm comm;
 static occa::device device;
-static ins_t* ins;
-static libParanumal::setupAide options;
+static nrs_t* nrs;
+static setupAide options;
 static int ioStep;
 
 static void setOccaVars(string dir);
-static void setOUDF(libParanumal::setupAide &options);
-static void dryRun(libParanumal::setupAide &options, int npTarget);
+static void setOUDF(setupAide &options);
+static void dryRun(setupAide &options, int npTarget);
 
 namespace nekrs
 {
+const double startTime(void) // "START TIME" option; 0 when getArgs leaves the default untouched
+{
+  double t0 = 0;
+  nrs->options.getArgs("START TIME", t0);
+  return t0;
+}
+
 void setup(MPI_Comm comm_in, int buildOnly, int sizeTarget,
            int ciMode, string cacheDir, string _setupFile,
            string _backend, string _deviceID)
@@ -56,9 +63,6 @@ void setup(MPI_Comm comm_in, int buildOnly, int sizeTarget,
   timer::init(comm, device, 0);
 
   if (buildOnly) {
-    int rank, size;
-    MPI_Comm_rank(comm, &rank);
-    MPI_Comm_size(comm, &size);
     dryRun(options, sizeTarget);
     return;
   }
@@ -80,71 +84,39 @@ void setup(MPI_Comm comm_in, int buildOnly, int sizeTarget,
 
   if(udf.setup0) udf.setup0(comm, options);
 
-  int nscal;
-  options.getArgs("NUMBER OF SCALARS", nscal);
-
-  // jit compile nek
-  int N;
-  string casename;
-  options.getArgs("CASENAME", casename);
-  options.getArgs("POLYNOMIAL DEGREE", N);
-  if(rank == 0) buildNekInterface(casename.c_str(), mymax(1,nscal), N, size);
-  MPI_Barrier(comm);
-
-  // init nek
-  nek_setup(comm, options, &ins);
-  nek_setic();
-  nek_userchk();
+  nrs = nrsSetup(comm, device, options, buildOnly);
 
-  // init solver
-  ins = insSetup(comm, device, options, buildOnly);
-
-  // set initial condition
-  int readRestartFile;
-  options.getArgs("RESTART FROM FILE", readRestartFile);
-  if(readRestartFile) nek_copyRestart();
-  if(udf.setup) udf.setup(ins);
-
-/*
-  if(options.compareArgs("VARIABLEPROPERTIES", "TRUE")) {
-    if(!udf.properties) {
-      if (rank ==
-          0) cout << "ERROR: variableProperties requires assigned udf.properties pointer" << "!\n";
-      EXIT(1);
-    }
-  }
-*/
-  ins->o_U.copyFrom(ins->U);
-  ins->o_P.copyFrom(ins->P);
-  ins->o_prop.copyFrom(ins->prop);
-  if(ins->Nscalar) {
-    ins->cds->o_S.copyFrom(ins->cds->S);
-    ins->cds->o_prop.copyFrom(ins->cds->prop);
+  nrs->o_U.copyFrom(nrs->U);
+  nrs->o_P.copyFrom(nrs->P);
+  nrs->o_prop.copyFrom(nrs->prop);
+  if(nrs->Nscalar) {
+    nrs->cds->o_S.copyFrom(nrs->cds->S);
+    nrs->cds->o_prop.copyFrom(nrs->cds->prop);
   }
 
   if(udf.properties) {
-    occa::memory o_S = ins->o_wrk0;
-    occa::memory o_SProp = ins->o_wrk0;
-    if(ins->Nscalar) {
-      o_S = ins->cds->o_S;
-      o_SProp = ins->cds->o_prop;
+    occa::memory o_S = nrs->o_wrk0;
+    occa::memory o_SProp = nrs->o_wrk0;
+    if(nrs->Nscalar) {
+      o_S = nrs->cds->o_S;
+      o_SProp = nrs->cds->o_prop;
     }
-    udf.properties(ins, ins->startTime, ins->o_U, o_S,
-                   ins->o_prop, o_SProp);
-    ins->o_prop.copyTo(ins->prop);
-    if(ins->Nscalar) ins->cds->o_prop.copyTo(ins->cds->prop);
+    udf.properties(nrs, startTime(), nrs->o_U, o_S,
+                   nrs->o_prop, o_SProp);
+    nrs->o_prop.copyTo(nrs->prop);
+    if(nrs->Nscalar) nrs->cds->o_prop.copyTo(nrs->cds->prop);
   }
 
-  if(udf.executeStep) udf.executeStep(ins, ins->startTime, 0);
-  nek_ocopyFrom(ins->startTime, 0);
+  if(udf.executeStep) udf.executeStep(nrs, startTime(), 0);
+  nek_ocopyFrom(startTime(), 0);
 
   timer::toc("setup");
   const double setupTime = timer::query("setup", "DEVICE:MAX");
   if(rank == 0) {
     cout << "\nsettings:\n" << endl << options << endl;
-    size_t dMB = ins->mesh->device.memoryAllocated() / 1e6;
+    size_t dMB = nrs->mesh->device.memoryAllocated() / 1e6;
     cout << "device memory allocation: " << dMB << " MB" << endl;
-    cout << "initialization took " <<  setupTime << " seconds" << endl;
+    cout << "initialization took " <<  setupTime << " s" << endl;
   }
   fflush(stdout);
 
@@ -154,7 +126,7 @@ void setup(MPI_Comm comm_in, int buildOnly, int sizeTarget,
 
 void runStep(double time, double dt, int tstep)
 {
-  runStep(ins, time, dt, tstep);
+  runStep(nrs, time, dt, tstep);
 }
 
 void copyToNek(double time, int tstep)
@@ -167,13 +139,13 @@ void udfExecuteStep(double time, int tstep, int isOutputStep)
   timer::tic("udfExecuteStep", 1);
   if (isOutputStep) {
     nek_ifoutfld(1);
-    ins->isOutputStep = 1;
+    nrs->isOutputStep = 1;
   }
 
-  if (udf.executeStep) udf.executeStep(ins, time, tstep);
+  if (udf.executeStep) udf.executeStep(nrs, time, tstep);
 
   nek_ifoutfld(0);
-  ins->isOutputStep = 0;
+  nrs->isOutputStep = 0;
   timer::toc("udfExecuteStep");
 }
 
@@ -182,34 +154,57 @@ void nekUserchk(void)
   nek_userchk();
 }
 
-void nekOutfld(void)
+const double dt(void) // returns the first entry of nrs->dt
 {
-  nek_outfld();
+  // TODO: adjust dt for target CFL
+  return nrs->dt[0];
 }
 
-const double dt(void)
+const double writeInterval(void) // "SOLUTION OUTPUT INTERVAL" option; 0 when getArgs leaves the default untouched
 {
-  return ins->dt;
+  double interval = 0;
+  nrs->options.getArgs("SOLUTION OUTPUT INTERVAL", interval);
+  return interval;
 }
 
-const int outputStep(void)
+const int writeControlRunTime(void) // result of comparing "SOLUTION OUTPUT CONTROL" against "RUNTIME"
 {
-  return ins->outputStep;
+  return nrs->options.compareArgs("SOLUTION OUTPUT CONTROL", "RUNTIME");
 }
 
-const int NtimeSteps(void)
+void outfld(double time, double outputTime) // NOTE(review): outputTime is currently unused
 {
-  return ins->NtimeSteps;
+  writeFld(nrs, time, 0); // third argument is FP64 = 0
 }
 
-const double startTime(void)
+const double endTime(void) // "END TIME" option; -1 when getArgs leaves the default untouched
+{
+  double tEnd = -1;
+  nrs->options.getArgs("END TIME", tEnd);
+  return tEnd;
+}
+
+const int numSteps(void) // "NUMBER TIMESTEPS" option; -1 when getArgs leaves the default untouched
 {
-  return ins->startTime;
+  int nSteps = -1;
+  nrs->options.getArgs("NUMBER TIMESTEPS", nSteps);
+  return nSteps;
 }
 
-const double finalTime(void)
+const int lastStep(double time, int tstep, double elapsedTime) // updates and returns nrs->lastStep
 {
-  return ins->finalTime;
+  if(!nrs->options.getArgs("STOP AT ELAPSED TIME").empty()) {
+    double maxElapsedTime; // presumably minutes, given the 60x scaling below - confirm
+    nrs->options.getArgs("STOP AT ELAPSED TIME", maxElapsedTime);
+    if(elapsedTime > 60.0*maxElapsedTime) nrs->lastStep = 1; // flag is left untouched otherwise
+  } else if (endTime() > 0) {
+    const double eps = 1e-12; // absolute tolerance for the next step landing on END TIME
+    nrs->lastStep = fabs((time+nrs->dt[0]) - endTime()) < eps || (time+nrs->dt[0]) > endTime();
+  } else {
+    nrs->lastStep = (tstep+1 > numSteps()); // END TIME not positive: fall back to the step count
+  }
+
+  return nrs->lastStep;
 }
 
 void* nekPtr(const char* id)
@@ -217,50 +212,47 @@ void* nekPtr(const char* id)
   return nek_ptr(id);
 }
 
+void* nrsPtr(void) // hands out the file-local nrs_t* as an untyped pointer
+{
+  return nrs;
+}
+
+
+
 void printRuntimeStatistics()
 {
   timer::printRunStat();
 }
 } // namespace
 
-static void dryRun(libParanumal::setupAide &options, int npTarget)
+static void dryRun(setupAide &options, int npTarget)
 {
   if (rank == 0)
     cout << "performing dry-run for "
          << npTarget
          << " MPI ranks ...\n" << endl;
 
-  string udfFile;
-  options.getArgs("UDF FILE", udfFile);
+  options.setArgs("NP TARGET", std::to_string(npTarget));
 
   // jit compile udf
+  string udfFile;
+  options.getArgs("UDF FILE", udfFile);
   if (!udfFile.empty()) {
     if(rank == 0) udfBuild(udfFile.c_str());
     MPI_Barrier(comm);
-    *(void**)(&udf.loadKernels) = udfLoadFunction("UDF_LoadKernels",1);
+    *(void**)(&udf.loadKernels) = udfLoadFunction("UDF_LoadKernels",0);
     *(void**)(&udf.setup0) = udfLoadFunction("UDF_Setup0",0);
   }
 
   if(udf.setup0) udf.setup0(comm, options);
 
-  int N;
-  string casename;
-  options.getArgs("CASENAME", casename);
-  options.getArgs("POLYNOMIAL DEGREE", N);
-
-  // jit compile nek
-  int nscal;
-  options.getArgs("NUMBER OF SCALARS", nscal);
-  if (rank == 0) buildNekInterface(casename.c_str(), nscal, N, npTarget);
-  MPI_Barrier(comm);
-
   // init solver
-  ins = insSetup(comm, device, options, 1);
+  nrs = nrsSetup(comm, device, options, 1);
 
   if (rank == 0) cout << "\nBuild successful." << endl;
 }
 
-static void setOUDF(libParanumal::setupAide &options)
+static void setOUDF(setupAide &options)
 {
   std::string oklFile;
   options.getArgs("UDF OKL FILE",oklFile);
@@ -295,30 +287,34 @@ static void setOUDF(libParanumal::setupAide &options)
 
     out << buffer.str();
 
-    out << "// automatically added \n"
-        << "void insFlowField3D(bcData *bc){}\n"
-        << "void insPressureNeumannConditions3D(bcData *bc){}\n";
-
     std::size_t found;
-    found = buffer.str().find("void insVelocityDirichletConditions");
+    found = buffer.str().find("void nrsVelocityDirichletConditions");
+    if (found == std::string::npos) found = buffer.str().find("void insVelocityDirichletConditions");
+    if (found == std::string::npos) found = buffer.str().find("void velocityDirichletConditions");
     if (found == std::string::npos)
-      out << "void insVelocityDirichletConditions3D(bcData *bc){}\n";
+      out << "void velocityDirichletConditions(bcData *bc){}\n";
 
-    found = buffer.str().find("void insVelocityNeumannConditions");
+    found = buffer.str().find("void nrsVelocityNeumannConditions");
+    if (found == std::string::npos) found = buffer.str().find("void insVelocityNeumannConditions");
+    if (found == std::string::npos) found = buffer.str().find("void velocityNeumannConditions");
     if (found == std::string::npos)
-      out << "void insVelocityNeumannConditions3D(bcData *bc){}\n";
+      out << "void velocityNeumannConditions(bcData *bc){}\n";
 
-    found = buffer.str().find("void insPressureDirichletConditions");
+    found = buffer.str().find("void nrsPressureDirichletConditions");
+    if (found == std::string::npos) found = buffer.str().find("void insPressureDirichletConditions");
+    if (found == std::string::npos) found = buffer.str().find("void pressureDirichletConditions");
     if (found == std::string::npos)
-      out << "void insPressureDirichletConditions3D(bcData *bc){}\n";
+      out << "void pressureDirichletConditions(bcData *bc){}\n";
 
     found = buffer.str().find("void cdsNeumannConditions");
+    if (found == std::string::npos) found = buffer.str().find("void scalarNeumannConditions");
     if (found == std::string::npos)
-      out << "void cdsNeumannConditions3D(bcData *bc){}\n";
+      out << "void scalarNeumannConditions(bcData *bc){}\n";
 
     found = buffer.str().find("void cdsDirichletConditions");
+    if (found == std::string::npos) found = buffer.str().find("void scalarDirichletConditions");
     if (found == std::string::npos)
-      out << "void cdsDirichletConditions3D(bcData *bc){}\n";
+      out << "void scalarDirichletConditions(bcData *bc){}\n";
 
     out <<
       "@kernel void __dummy__(int N) {"
diff --git a/src/nekrs.hpp b/src/lib/nekrs.hpp
similarity index 69%
rename from src/nekrs.hpp
rename to src/lib/nekrs.hpp
index c3c2ad93f..3b88c9e2a 100644
--- a/src/nekrs.hpp
+++ b/src/lib/nekrs.hpp
@@ -13,16 +13,18 @@ void setup(MPI_Comm comm, int buildOnly, int sizeTarget,
 void runStep(double time, double dt, int tstep);
 void copyToNek(double time, int tstep);
 void udfExecuteStep(double time, int tstep, int isOutputStep);
-void nekOutfld(void);
+void outfld(double time, double outputTime);
 void nekUserchk(void);
 void printRuntimeStatistics(void);
-
+const double writeInterval(void);
 const double dt(void);
-const int outputStep(void);
-const int NtimeSteps(void);
 const double startTime(void);
-const double finalTime(void);
+const double endTime(void);
+const int numSteps(void);
+const int lastStep(double time, int tstep, double elapsedTime);
+const int writeControlRunTime(void);
 
+void* nrsPtr(void);
 void* nekPtr(const char* id);
 }
 
diff --git a/src/libP/LICENSE b/src/libP/LICENSE
deleted file mode 100644
index d53237660..000000000
--- a/src/libP/LICENSE
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
diff --git a/src/libP/include/mesh.h b/src/libP/include/mesh.h
deleted file mode 100644
index 6b0001aff..000000000
--- a/src/libP/include/mesh.h
+++ /dev/null
@@ -1,653 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef MESH_H
-#define MESH_H 1
-
-#include <unistd.h>
-
-#include "mpi.h"
-#include <math.h>
-#include <stdlib.h>
-#include <occa.hpp>
-
-#include "types.h"
-#include "ogs.hpp"
-
-#include "timer.h"
-
-#include "setupAide.hpp"
-
-#define TRIANGLES 3
-#define QUADRILATERALS 4
-#define TETRAHEDRA 6
-#define HEXAHEDRA 12
-
-extern "C" { // Start C linkage
-typedef struct {
-
-  MPI_Comm comm;
-  int rank, size; // MPI rank and size (process count)
-  
-  int dim;
-  int Nverts, Nfaces, NfaceVertices;
-
-  hlong Nnodes;
-  dfloat *EX; // coordinates of vertices for each element
-  dfloat *EY;
-  dfloat *EZ;
-
-  dlong Nelements;
-  hlong *EToV; // element-to-vertex connectivity
-  dlong *EToE; // element-to-element connectivity
-  int   *EToF; // element-to-(local)face connectivity
-  int   *EToP; // element-to-partition/process connectivity
-  int   *EToB; // element-to-boundary condition type
-
-  hlong *elementInfo; //type of element
-
-  // boundary faces
-  hlong NboundaryFaces; // number of boundary faces
-  hlong *boundaryInfo; // list of boundary faces (type, vertex-1, vertex-2, vertex-3)
-
-  // MPI halo exchange info
-  dlong  totalHaloPairs;  // number of elements to be sent in halo exchange
-  dlong *haloElementList; // sorted list of elements to be sent in halo exchange
-  int *NhaloPairs;      // number of elements worth of data to send/recv
-  int  NhaloMessages;     // number of messages to send
-
-  dlong *haloGetNodeIds; // volume node ids of outgoing halo nodes
-  dlong *haloPutNodeIds; // volume node ids of incoming halo nodes
-
-  void *haloSendRequests;
-  void *haloRecvRequests;
-
-  dlong NinternalElements; // number of elements that can update without halo exchange
-  dlong NnotInternalElements; // number of elements that cannot update without halo exchange
-
-  // CG gather-scatter info
-  hlong *globalIds;
-  hlong *maskedGlobalIds;
-  void *gsh, *hostGsh; // gslib struct pointer
-  ogs_t *ogs; //occa gs pointer
-
-  // list of elements that are needed for global gather-scatter
-  dlong NglobalGatherElements;
-  dlong *globalGatherElementList;
-  occa::memory o_globalGatherElementList;
-
-  // list of elements that are not needed for global gather-scatter
-  dlong NlocalGatherElements;
-  dlong *localGatherElementList;
-  occa::memory o_localGatherElementList;
-
-  //list of fair pairs
-  dlong NfacePairs;
-  dlong *EToFPairs;
-  dlong *FPairsToE;
-  int *FPairsToF;
-
-  // NBN: streams / command queues
-  occa::stream stream0, stream1;
-
-  // volumeGeometricFactors;
-  dlong Nvgeo;
-  dfloat *vgeo;
-
-  // second order volume geometric factors
-  dlong Nggeo;
-  dfloat *ggeo;
-
-  // volume node info
-  int N, Np;
-  dfloat *r, *s, *t;    // coordinates of local nodes
-  dfloat *Dr, *Ds, *Dt; // collocation differentiation matrices
-  dfloat *Dmatrices;
-  dfloat *MM, *invMM;           // reference mass matrix
-  dfloat *LMM, *invLMM;
-  dfloat *Srr,*Srs, *Srt; //element stiffness matrices
-  dfloat *Ssr,*Sss, *Sst;
-  dfloat *Str,*Sts, *Stt;
-  dfloat *Smatrices;
-  int maxNnzPerRow;
-  dfloat *x, *y, *z;    // coordinates of physical nodes
-  
-  dfloat sphereRadius;  // for Quad3D 
-  
-  dfloat volume;
-
-  // indices of vertex nodes
-  int *vertexNodes;
-
-  // quad specific quantity
-  int Nq, NqP, NpP;
-  
-  dfloat *D; // 1D differentiation matrix (for tensor-product)
-  dfloat *DW; // weak 1D differentiation matrix (for tensor-product)
-  dfloat *gllz; // 1D GLL quadrature nodes
-  dfloat *gllw; // 1D GLL quadrature weights
-
-  int gjNq;
-  dfloat *gjr,*gjw; // 1D nodes and weights for Gauss Jacobi quadature
-  dfloat *gjI,*gjD; // 1D GLL to Gauss node interpolation and differentiation matrices
-  dfloat *gjD2;     // 1D GJ to GJ node differentiation
-
-  // transform to/from eigenmodes of 1D laplacian (with built in weighting)
-  dfloat *oasForward;
-  dfloat *oasBack;
-  dfloat *oasDiagOp;
-
-  // transform to/from eigenmode of IPDG 1D laplacian
-  dfloat *oasForwardDg;
-  dfloat *oasBackDg;
-  dfloat *oasDiagOpDg;
-
-  //rotated node ids
-  int *rmapP;
-
-  //reference patch inverse (for OAS precon)
-  dfloat *invAP;
-
-  // face node info
-  int Nfp;        // number of nodes per face
-  int *faceNodes; // list of element reference interpolation nodes on element faces
-  dlong *vmapM;     // list of volume nodes that are face nodes
-  dlong *vmapP;     // list of volume nodes that are paired with face nodes
-  dlong *mapP;     // list of surface nodes that are paired with -ve surface  nodes
-  int *faceVertices; // list of mesh vertices on each face
-
-  dfloat *LIFT; // lift matrix
-  dfloat *FMM;  // Face Mass Matrix
-  dfloat *sMT; // surface mass (MM*LIFT)^T
-
-  dlong   Nsgeo;
-  dfloat *sgeo;
-
-  // field info for PDE solver
-  int Nfields;
-  dfloat *q;    // solution data array
-  dfloat *fQM, *fQP; //solution trace arrays
-  dfloat *rhsq, *rhsq2, *rhsq3; // right hand side data array
-  dfloat *resq; // residual data array (for LSERK time-stepping)
-
-  dfloat Lambda2; // square of penalty paramater used in constructing q^*
-
-  // cubature
-  int cubNp, cubNfp, cubNq;
-  dfloat *cubr, *cubs, *cubt, *cubw; // coordinates and weights of local cubature nodes
-  dfloat *cubx, *cuby, *cubz;    // coordinates of physical nodes
-  dfloat *cubInterp; // interpolate from W&B to cubature nodes
-  dfloat *cubProject; // projection matrix from cubature nodes to W&B nodes
-  dfloat *cubD;       // 1D differentiation matrix
-  dfloat *cubDiffInterp;     // 1D weak differentiation matrix
-  dfloat *cubDW;     // 1D weak differentiation matrix
-  dfloat *cubDrW;    // 'r' weak differentiation matrix
-  dfloat *cubDsW;    // 's' weak differentiation matrix
-  dfloat *cubDtW;    // 't' weak differentiation matrix
-  dfloat *cubDWmatrices;
-
-  dfloat *cubvgeo;  //volume geometric data at cubature points
-  dfloat *cubsgeo;  //surface geometric data at cubature points
-  dfloat *cubggeo;  //second type volume geometric data at cubature points
-  
-  // c2 at cubature points (for wadg)
-  dfloat *c2;
-
-  //source injection
-  dfloat *sourceq;
-  dfloat sourceX0, sourceY0, sourceZ0, sourceT0, sourceC2, sourceFreq;
-  int sourceNelements;
-  dlong *MRABsourceNelements;
-  dlong *sourceElements;
-
-  // surface integration node info
-  int    intNfp;    // number of integration nodes on each face
-  dfloat *intInterp; // interp from surface node to integration nodes
-  dfloat *intLIFT;   // lift from surface integration nodes to W&B volume nodes
-  dfloat *intx, *inty, *intz; // coordinates of suface integration nodes
-
-  // Bernstein-Bezier info
-  dfloat *VB, *invVB; // Bernstein Vandermonde matrices
-  dfloat *BBMM;
-  dfloat *invVB1D, *invVB2D;
-  int *D0ids, *D1ids, *D2ids, *D3ids; // Bernstein deriv matrix indices
-  dfloat *Dvals; // Bernstein deriv matrix values
-  int *D0Tids, *D1Tids, *D2Tids, *D3Tids; // Bernstein transpose deriv matrix indices
-  dfloat *DTvals; // Bernstein transpose deriv matrix values
-  dfloat *VBq, *PBq; // cubature interpolation/projection matrices
-  int *L0ids; // L0 matrix ids
-  dfloat *L0vals; // L0 values (L0 tridiagonal in 2D)
-  int *ELids; // lift reduction matrix indices
-  dfloat *ELvals; // lift reduction matrix values
-  int max_EL_nnz; // max number of non-zeros per row of EL
-  int *BBRaiseids; //Bernstein elevate matrix indices
-  dfloat *BBRaiseVals; //Bernstein elevate matrix values
-  dfloat *BBLower; //Berstein projection matrix.
-
-  //degree raising and lowering interpolation matrices
-  dfloat *interpRaise;
-  dfloat *interpLower;
-
-  //sparse basis info
-  dfloat *sparseV, *invSparseV;
-  dfloat *sparseMM;
-  int* FaceModes;
-  int SparseNnzPerRow;
-  int SparseNnzPerRowNonPadded;
-  int *sparseStackedNZ;
-  dfloat *sparseSrrT;
-  dfloat *sparseSrsT;
-  dfloat *sparseSssT;
-  int *Ind;
-
-  dlong *mmapM, *mmapP; 
-  int   *mmapS;
-  dfloat *mapSgn;
-
-  // time stepping info
-  dfloat dt; // time step
-  dfloat startTime ; // Start Time
-  dfloat finalTime; // final time to run acoustics to
-  int   NtimeSteps;// number of time steps
-  int   errorStep; // number of steps between error calculations
-  int   Nrk;
-  dfloat rka[5], rkb[5], rkc[6]; // AK: deprecated
-
-  // MRAB,SAAB coefficients
-  dfloat mrab[3], mrabb[3], saab[3], saabexp; // AK: deprecated 
-  int MRABNlevels;
-  int *MRABlevel;
-  dlong *MRABNelements, *MRABNhaloElements;
-  dlong **MRABelementIds, **MRABhaloIds;
-  int *MRABshiftIndex;
-
-  dlong *MRABpmlNelements, *MRABpmlNhaloElements;
-  dlong **MRABpmlElementIds, **MRABpmlIds;
-  dlong **MRABpmlHaloElementIds, **MRABpmlHaloIds;
-
-  dlong pmlNelements, nonPmlNelements;
-  dlong *nonPmlElementIds, *pmlElementIds, *pmlIds;  
-  int shiftIndex;
-
-  dfloat dtfactor; //Delete later for script run 
-  dfloat maxErrorBoltzmann;
-
-  dfloat *errtmp;
-  dfloat rkC[7], rkA[7*7], rkE[7];
-
-  occa::memory o_rkq, o_rkrhsq, o_rkerr; // deprecated, AK.
-  occa::memory o_errtmp;
-  occa::memory o_rkA, o_rkE;
-
-  // ploting info for generating field vtu
-  int    plotNverts;    // number of vertices for each plot element
-  int    plotNp;        // number of plot nodes per element
-  int    plotNelements; // number of "plot elements" per element
-  int   *plotEToV;      // triangulation of plot nodes
-  dfloat *plotR, *plotS, *plotT; // coordinates of plot nodes in reference element
-  dfloat *plotInterp;    // warp & blend to plot node interpolation matrix
-
-  int *contourEToV;
-  dfloat *contourVX, *contourVY, *contourVZ;
-  dfloat *contourInterp, *contourInterp1, *contourFilter; 
-
-  //SEMFEM data
-  int NpFEM, NelFEM;
-  int *FEMEToV;
-  dfloat *rFEM, *sFEM, *tFEM;
-  dfloat *SEMFEMInterp;
-
-  occa::memory o_SEMFEMInterp;
-  occa::memory o_SEMFEMAnterp;
-
-  // Boltzmann specific stuff
-  dfloat RT, sqrtRT, tauInv, Ma, Re; // Deprecated: AK
-
-  // pml stuff
-  int    pmlNfields;
-  //  dlong    pmlNelements; // deprecated
-  dlong   *pmlElementList; // deprecated
-
-  int Ntscale; // Will be removed, for time accuracy test
-  
-  dfloat *invTau; // deprecated in Boltzmann
-
-
-  // Probe Data
-  int probeN, probeNTotal; 
-  dfloat *probeR, *probeS, *probeT;
-  // dfloat *probeX, *probeY, *probeZ;  
-  dlong *probeElementIds, *probeIds;  
-  dfloat *probeI; 
-
-  // occa stuff
-  occa::device device;
-
-  occa::stream defaultStream;
-  occa::stream dataStream;
-  occa::stream computeStream;
-
-  occa::memory o_q, o_rhsq, o_resq, o_fQM, o_fQP;
-
-  occa::memory o_Dr, o_Ds, o_Dt, o_LIFT, o_MM, o_invMM, o_MMPfloat;
-  occa::memory o_DrT, o_DsT, o_DtT, o_LIFTT;
-  occa::memory o_LMM, o_invLMM;
-  occa::memory o_Dmatrices;
-  occa::memory o_DmatricesPfloat;
-  occa::memory o_FMMT;
-  occa::memory o_sMT;
-
-  occa::memory o_D; // tensor product differentiation matrix (for Hexes)
-  occa::memory o_DW; // tensor product differentiation matrix (for Hexes)
-  occa::memory o_SrrT, o_SrsT, o_SrtT; //element stiffness matrices
-  occa::memory o_SsrT, o_SssT, o_SstT;
-  occa::memory o_Srr, o_Srs, o_Srt, o_Sss, o_Sst, o_Stt; // for char4-based kernels
-  occa::memory o_Smatrices;
-  occa::memory o_SmatricesPfloat;
-  occa::memory o_IndT, o_IndTchar;
-  occa::memory o_India, o_Indja;
-  occa::memory o_StrT, o_StsT, o_SttT;
-  occa::memory o_Ind; // for sparse index storage
-
-  occa::memory o_vgeo, o_sgeo;
-  occa::memory o_vmapM, o_vmapP, o_mapP;
-
-  occa::memory o_rmapP;
-
-  occa::memory o_EToE, o_EToF, o_EToB, o_x, o_y, o_z;
-
-  occa::memory o_EToFPairs, o_FPairsToE, o_FPairsToF;
-
-  // cubature (for wadg)
-  occa::memory o_intLIFTT, o_intInterpT, o_intx, o_inty, o_intz;
-  occa::memory o_cubDWT, o_cubD;
-  occa::memory o_cubDrWT, o_cubDsWT, o_cubDtWT, o_cubDiffInterpT;
-  occa::memory o_cubDWmatrices;
-  occa::memory o_cubInterpT, o_cubProjectT;
-  occa::memory o_invMc; // for comparison: inverses of weighted mass matrices
-
-  occa::memory o_cubvgeo, o_cubsgeo, o_cubggeo;
-
-  occa::memory o_c2;
-
-  //MRAB element lists
-  occa::memory *o_MRABelementIds;
-  occa::memory *o_MRABhaloIds;
-  occa::memory *o_MRABpmlElementIds;
-  occa::memory *o_MRABpmlIds;
-  occa::memory *o_MRABpmlHaloElementIds;
-  occa::memory *o_MRABpmlHaloIds;
-
-
-  // DG halo exchange info
-  occa::memory o_haloElementList;
-  occa::memory o_haloBuffer;
-  occa::memory o_haloGetNodeIds;
-  occa::memory o_haloPutNodeIds;
-  
-  occa::memory o_internalElementIds;
-  occa::memory o_notInternalElementIds;
-
-  // Bernstein-Bezier occa arrays
-  occa::memory o_BBMM;
-  occa::memory o_D0ids, o_D1ids, o_D2ids, o_D3ids, o_Dvals; // Bernstein deriv matrix indices
-  occa::memory o_packedDids; // char4 packed increments (D1ids-D0ids)
-
-  occa::memory o_invVB1DT, o_invVB2DT;
-  occa::memory o_VBq, o_PBq; // cubature interpolation/projection matrices
-  occa::memory o_L0ids, o_L0vals, o_ELids, o_ELvals;
-
-  /* sparse basis occa arrays */
-  occa::memory o_sparseStackedNZ;
-  occa::memory o_sparseSrrT;
-  occa::memory o_sparseSrsT;
-  occa::memory o_sparseSssT;
-  occa::memory o_mapSgn;
-
-  // pml vars
-  occa::memory o_sigmax, o_sigmay, o_sigmaz; // AK: deprecated
-
-
-  occa::memory o_pmlElementIds;
-  occa::memory o_nonPmlElementIds;
-  occa::memory o_pmlIds;
-
-  occa::memory o_pmlElementList;
-  
-  occa::memory o_ggeo; // second order geometric factors
-  occa::memory o_ggeoPfloat; // second order geometric factors
-  occa::memory o_projectL2; // local weights for projection.
-
-  occa::kernel volumeKernel;
-  occa::kernel surfaceKernel;
-  occa::kernel updateKernel;
-  occa::kernel traceUpdateKernel;
-  occa::kernel haloExtractKernel;
-  occa::kernel partialSurfaceKernel;
-  occa::kernel haloGetKernel;
-  occa::kernel haloPutKernel;
-
-  // Just for test will be deleted after temporal testsAK
-  occa::kernel RKupdateKernel;
-  occa::kernel RKpmlUpdateKernel;
-
-
-  occa::kernel gatherKernel;
-  occa::kernel scatterKernel;
-  occa::kernel gatherScatterKernel;
-
-  occa::kernel getKernel;
-  occa::kernel putKernel;
-
-  occa::kernel sumKernel;
-  occa::kernel addScalarKernel;
-
-  occa::kernel AxKernel;
-  occa::kernel innerProductKernel;
-  occa::kernel weightedInnerProduct1Kernel;
-  occa::kernel weightedInnerProduct2Kernel;
-  occa::kernel scaledAddKernel;
-  occa::kernel dotMultiplyKernel;
-  occa::kernel dotDivideKernel;
-
-  occa::kernel gradientKernel;
-  occa::kernel ipdgKernel;
-
-  occa::kernel maskKernel;
-  occa::kernel maskPfloatKernel;
-
-  // Boltzmann Specific Kernels
-  occa::kernel relaxationKernel;
-  occa::kernel pmlRelaxationKernel;
-  
-}mesh_t;
-
-// serial sort
-void mysort(hlong *data, int N, const char *order);
-
-// sort entries in an array in parallel
-void parallelSort(int size, int rank, MPI_Comm comm,
-		  int N, void *vv, size_t sz,
-		  int (*compare)(const void *, const void *),
-		  void (*match)(void *, void *)
-		  );
-
-#define mymax(a,b) (((a)>(b))?(a):(b))
-#define mymin(a,b) (((a)<(b))?(a):(b))
-
-/* hash function */
-unsigned int hash(const unsigned int value) ;
-
-/* dimension independent mesh operations */
-void meshConnect(mesh_t *mesh);
-
-/* build parallel face connectivity */
-void meshParallelConnect(mesh_t *mesh);
-
-/* build global connectivity in parallel */
-void meshParallelConnectNodes(mesh_t *mesh, int isTmesh, int nrsBuildOnly);
-
-void meshHaloSetup(mesh_t *mesh);
-
-/* extract whole elements for the halo exchange */
-void meshHaloExtract(mesh_t *mesh, size_t Nbytes, void *sourceBuffer, void *haloBuffer);
-
-void meshHaloExchange(mesh_t *mesh,
-    size_t Nbytes,         // message size per element
-    void *sourceBuffer,
-    void *sendBuffer,    // temporary buffer
-    void *recvBuffer);
-
-void meshHaloExchangeStart(mesh_t *mesh,
-    size_t Nbytes,       // message size per element
-    void *sendBuffer,    // temporary buffer
-    void *recvBuffer);
-
-
-void meshHaloExchangeFinish(mesh_t *mesh);
-
-void meshHaloExchangeBlocking(mesh_t *mesh,
-			     size_t Nbytes,       // message size per element
-			     void *sendBuffer,    // temporary buffer
-			      void *recvBuffer);
-
-// print out parallel partition i
-void meshPartitionStatistics(mesh_t *mesh);
-
-// build element-boundary connectivity
-void meshConnectBoundary(mesh_t *mesh);
-
-void meshParallelGatherScatterSetup(mesh_t *mesh,
-                                      dlong N,
-                                      hlong *globalIds,
-                                      MPI_Comm &comm,
-                                      int verbose);
-
-// generic mesh setup
-mesh_t *meshSetup(char *filename, int N, setupAide &options);
-void meshFree(mesh_t*);
-
-void occaTimerTic(occa::device device,std::string name);
-void occaTimerToc(occa::device device,std::string name);
-
-extern "C"
-{
-  void * xxtSetup(uint num_local_rows,
-      void* row_ids,
-      uint nnz,
-      void*   A_i,
-      void*   A_j,
-      void* A_vals,
-      int null_space,
-      const char* inttype,
-      const char* floattype);
-
-  void xxtSolve(void* x,
-      void* A,
-      void* rhs);
-
-  void xxtFree(void* A) ;
-}
-
-extern "C"
-{
-  void dgesv_ ( int     *N, int     *NRHS, double  *A,
-                int     *LDA,
-                int     *IPIV, 
-                double  *B,
-                int     *LDB,
-                int     *INFO );
-
-  // void dgemm_(const char *TRANSA, const char *TRANSB, const int *M, 
-  //             const int *N, const int *K, double *ALPHA, double *A, const int *LDA, double *B, 
-  //             const int *LDB, double *BETA, double *C, const int *LDC);
-
-   void dgemm_ (char *, char *, int *, int *, int *,
-         const dfloat *, const dfloat * __restrict, int *,
-         const dfloat * __restrict, int *,
-         const dfloat *, dfloat * __restrict, int *);
-
-  void sgesv_(int *N, int *NRHS,float  *A, int *LDA, int *IPIV, float  *B, int *LDB,int *INFO);
-
-  void dgetrf_(int* M, int *N, double* A, int* lda, int* IPIV, int* INFO);
-  void dgetri_(int* N, double* A, int* lda, int* IPIV, double* WORK, int* lwork, int* INFO);
-  void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI,
-              double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO );
-  
-  double dlange_(char *NORM, int *M, int *N, double *A, int *LDA, double *WORK);
-  void dgecon_(char *NORM, int *N, double *A, int *LDA, double *ANORM,
-                double *RCOND, double *WORK, int *IWORK, int *INFO );
-}
-
-void readDfloatArray(FILE *fp, const char *label, dfloat **A, int *Nrows, int* Ncols);
-void readIntArray   (FILE *fp, const char *label, int **A   , int *Nrows, int* Ncols);
-
-void meshApplyElementMatrix(mesh_t *mesh, dfloat *A, dfloat *q, dfloat *Aq);
-void meshApplyVectorElementMatrix(mesh_t *mesh, int Nfield, const dlong offset, dfloat *A, dfloat *q, dfloat *Aq);
-
-void meshRecursiveSpectralBisectionPartition(mesh_t *mesh);
-
-void matrixInverse(int N, dfloat *A);
-dfloat matrixConditionNumber(int N, dfloat *A);
-
-#if 0
-void *occaHostMallocPinned(occa::device &device, size_t size, void *source, occa::memory &mem);
-#else
-void *occaHostMallocPinned(occa::device &device, size_t size, void *source, occa::memory &mem, occa::memory &h_mem);
-#endif
-
-void matrixRightSolve(int NrowsA, int NcolsA, dfloat *A, int NrowsB, int NcolsB, dfloat *B, dfloat *C);
-void matrixEig(int N, dfloat *A, dfloat *VR, dfloat *WR, dfloat *WI);
-void matrixTranspose(const int M, const int N,
-                     const dfloat  *A, const int LDA,
-                           dfloat *AT, const int LDAT);
-
-// 1D mesh basis functions
-void Nodes1D(int _N, dfloat *_r);
-void EquispacedNodes1D(int _N, dfloat *_r);
-void OrthonormalBasis1D(dfloat a, int i, dfloat *P);
-void GradOrthonormalBasis1D(dfloat a, int i, dfloat *Pr);
-void Vandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *V);
-void GradVandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *Vr);
-void MassMatrix1D(int _Np, dfloat *V, dfloat *_MM);
-void Dmatrix1D(int _N, int NpointsIn, dfloat *_rIn,
-                               int NpointsOut, dfloat *_rOut, dfloat *_Dr);
-void DWmatrix1D(int _N, dfloat *_D, dfloat *_DT);
-
-void InterpolationMatrix1D(int _N,
-                               int NpointsIn, dfloat *rIn,
-                               int NpointsOut, dfloat *rOut,
-                               dfloat *I);
-void DegreeRaiseMatrix1D(int Nc, int Nf, dfloat *P);
-void CubatureWeakDmatrix1D(int _Nq, int _cubNq,
-                                     dfloat *_cubProject, dfloat *_cubD, dfloat *_cubPDT);
-dfloat JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N);
-dfloat GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N);
-void JacobiGLL(int _N, dfloat *_x, dfloat *_w = nullptr);
-void JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w);
-} // end C Linkage
-#endif
-
diff --git a/src/libP/include/mesh2D.h b/src/libP/include/mesh2D.h
deleted file mode 100644
index 0a47ed49f..000000000
--- a/src/libP/include/mesh2D.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef MESH2D_H 
-#define MESH2D_H 1
-
-#include "mesh.h"
-
-extern "C" { // Begin C Linkage
-// will eventually rename mesh2D to mesh_t in src
-#define mesh2D mesh_t
-
-mesh2D* meshReaderTri2D(char *fileName);
-mesh2D* meshReaderQuad2D(char *fileName);
-
-// mesh readers
-mesh2D* meshParallelReaderTri2D(char *fileName);
-mesh2D* meshParallelReaderQuad2D(char *fileName);
-
-// build connectivity in serial
-void meshConnect2D(mesh2D *mesh);
-
-// build element-boundary connectivity
-void meshConnectBoundary2D(mesh2D *mesh);
-
-// build connectivity in parallel
-void meshParallelConnect2D(mesh2D *mesh);
-
-// build global connectivity in parallel
-void meshParallelConnectNodesQuad2D(mesh2D *mesh);
-
-// create global number of nodes
-void meshNumberNodes2D(mesh2D *mesh);
-
-// repartition elements in parallel
-void meshGeometricPartition2D(mesh2D *mesh);
-
-// print out mesh 
-void meshPrint2D(mesh2D *mesh);
-
-// print out mesh in parallel from the root process
-void meshParallelPrint2D(mesh2D *mesh);
-
-// print out mesh partition in parallel
-void meshVTU2D(mesh2D *mesh, char *fileName);
-
-// print out solution at plot nodes 
-void meshPlotVTU2D(mesh2D *mesh, char *fileNameBase, int fld);
-
-// compute geometric factors for local to physical map 
-void meshGeometricFactorsTri2D(mesh2D *mesh);
-void meshGeometricFactorsQuad2D(mesh2D *mesh);
-
-void meshSurfaceGeometricFactorsTri2D(mesh2D *mesh);
-void meshSurfaceGeometricFactorsQuad2D(mesh2D *mesh);
-
-void meshPhysicalNodesTri2D(mesh2D *mesh);
-void meshPhysicalNodesQuad2D(mesh2D *mesh);
-
-void meshLoadReferenceNodesTri2D(mesh2D *mesh, int N);
-void meshLoadReferenceNodesQuad2D(mesh2D *mesh, int N);
-
-void meshGradientTri2D(mesh2D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy);
-void meshGradientQuad2D(mesh2D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy);
-
-// print out parallel partition i
-void meshPartitionStatistics2D(mesh2D *mesh);
-
-// functions that call OCCA kernels
-void occaTest(mesh2D *mesh);
-
-// 
-void occaOptimizeGradientTri2D(mesh2D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy);
-void occaOptimizeGradientQuad2D(mesh2D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy);
-
-// serial face-node to face-node connection
-void meshConnectFaceNodes2D(mesh2D *mesh);
-
-// serial face-mode to face-mode connection
-void meshConnectFaceModes2D(mesh2D *mesh, int *faceModes, dfloat *V);
-
-// halo connectivity information
-void meshHaloSetup2D(mesh2D *mesh);
-
-// perform complete halo exchange
-void meshHaloExchange2D(mesh2D *mesh,
-			size_t Nbytes, // number of bytes per element
-			void *sourceBuffer, 
-			void *sendBuffer, 
-			void *recvBuffer);
-
-// start halo exchange
-void meshHaloExchangeStart2D(mesh2D *mesh,
-			     size_t Nbytes,       // message size per element
-			     void *sendBuffer,    // outgoing halo
-			     void *recvBuffer);   // incoming halo
-
-// finish halo exchange
-void meshHaloExchangeFinish2D(mesh2D *mesh);
-
-// extract halo data from sourceBuffer and save to sendBuffer
-void meshHaloExtract2D(mesh2D *mesh, size_t Nbytes, void *sourceBuffer, void *sendBuffer);
-
-// build list of nodes on each face of the reference element
-void meshBuildFaceNodesTri2D(mesh2D *mesh);
-void meshBuildFaceNodesQuad2D(mesh2D *mesh);
-
-mesh2D *meshSetupTri2D(char *filename, int N);
-mesh2D *meshSetupQuad2D(char *filename, int N);
-
-// set up OCCA device and copy generic element info to device
-void meshOccaSetup2D(mesh2D *mesh, setupAide &newOptions, occa::properties &kernelInfo);
-
-// void meshMRABSetup2D(mesh2D *mesh, dfloat *EToDT, int maxLevels); 
-dfloat meshMRABSetup2D(mesh2D *mesh, dfloat *EToDT, int maxLevels, dfloat finalTime); 
-
-
-//MRAB weighted mesh partitioning
-void meshMRABWeightedPartition2D(mesh2D *mesh, dfloat *weights,
-                                      int numLevels, int *levels);
-
-
-// Setup probe information
-// Probe Setup : AK
-void meshProbeSetup2D(mesh2D *mesh, dfloat *pX, dfloat *pY);
-void meshVandermonde2D(int N, int sizeR, dfloat *r, dfloat *s, dfloat *V);
-dfloat meshSimplex2D(dfloat a, dfloat b, int i, int j);
-dfloat meshJacobiP(dfloat a, dfloat alpha, dfloat beta, int N);
-dfloat meshFactorial(int n);
-
-
-#define norm2(a,b) ( sqrt((a)*(a)+(b)*(b)) )
-
-
-/* offsets for geometric factors */
-#define RXID 0  
-#define RYID 1  
-#define SXID 2  
-#define SYID 3  
-#define  JID 4
-#define JWID 5
-#define IJWID 6
-
-
-/* offsets for second order geometric factors */
-#define G00ID 0  
-#define G01ID 1  
-#define G11ID 2
-#define GWJID 3
-
-/* offsets for nx, ny, sJ, 1/J */
-#define NXID 0  
-#define NYID 1  
-#define SJID 2  
-#define IJID 3  
-#define IHID 4
-#define WSJID 5
-#define WIJID 6
-
-
-mesh2D *meshSetupBoxQuad2D(int N, setupAide &options);
-void meshConnectPeriodicFaceNodes2D(mesh2D *mesh, dfloat xper, dfloat yper);
-
-} // end C Linkage
-#endif
-
diff --git a/src/libP/include/mesh3D.h b/src/libP/include/mesh3D.h
deleted file mode 100644
index a37c15225..000000000
--- a/src/libP/include/mesh3D.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef MESH3D_H 
-#define MESH3D_H 1
-
-// generic mesh structure 
-#include "mesh.h"
-
-extern "C" { // Begin C Linkage
-#define mesh3D mesh_t
-
-// mesh readers
-mesh3D* meshParallelReaderTri3D(char *fileName);
-mesh3D* meshParallelReaderQuad3D(char *fileName);
-mesh3D* meshParallelReaderTet3D(char *fileName);
-mesh3D* meshParallelReaderHex3D(char *fileName);
-
-// build connectivity in serial
-void meshConnect3D(mesh3D *mesh);
-
-// build element-boundary connectivity
-void meshConnectBoundary3D(mesh3D *mesh);
-
-// build connectivity in parallel
-void meshParallelConnect3D(mesh3D *mesh);
-
-// repartition elements in parallel
-void meshGeometricPartition3D(mesh3D *mesh);
-
-// print out mesh 
-void meshPrint3D(mesh3D *mesh);
-
-// print out mesh in parallel from the root process
-void meshParallelPrint3D(mesh3D *mesh);
-
-// print out mesh partition in parallel
-void meshVTU3D(mesh3D *mesh, char *fileName);
-
-// print out mesh field
-void meshPlotVTU3D(mesh3D *mesh, char *fileNameBase, int fld);
-void meshPlotContour3D(mesh_t *mesh, char *fname, dfloat *u, int Nlevels, dfloat *levels);
-void meshPlotAdaptiveContour3D(mesh_t *mesh, char *fname, dfloat *u, int Nlevels, dfloat *levels, dfloat tol);
-
-// compute geometric factors for local to physical map
-void meshGeometricFactorsTri3D(mesh3D *mesh);
-void meshGeometricFactorsQuad3D(mesh3D *mesh);
-void meshGeometricFactorsTet3D(mesh3D *mesh);
-void meshGeometricFactorsHex3D(mesh3D *mesh);
-
-void meshSurfaceGeometricFactorsTri3D(mesh3D *mesh);
-void meshSurfaceGeometricFactorsQuad3D(mesh3D *mesh);
-void meshSurfaceGeometricFactorsTet3D(mesh3D *mesh);
-void meshSurfaceGeometricFactorsHex3D(mesh3D *mesh);
-
-void meshPhysicalNodesTri3D(mesh3D *mesh);
-void meshPhysicalNodesQuad3D(mesh3D *mesh);
-void meshPhysicalNodesTet3D(mesh3D *mesh);
-void meshPhysicalNodesHex3D(mesh3D *mesh, int nrsBuildOnly);
-
-void meshLoadReferenceNodesTet3D(mesh3D *mesh, int N);
-void meshLoadReferenceNodesHex3D(mesh3D *mesh, int N, int cubN);
-
-void meshGradientTet3D(mesh3D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy, dfloat *dqdz);
-void meshGradientHex3D(mesh3D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy, dfloat *dqdz);
-
-// print out parallel partition i
-void meshPartitionStatistics3D(mesh3D *mesh);
-
-// default occa set up
-void meshOccaSetup3D(mesh3D *mesh, setupAide &newOptions, occa::properties &kernelInfo);
-void meshOccaSetupQuad3D(mesh_t *mesh, setupAide &newOptions, occa::properties &kernelInfo);
-void meshOccaSetupTri3D(mesh_t *mesh, setupAide &newOptions, occa::properties &kernelInfo);
-
-void meshOccaPopulateDevice3D(mesh3D *mesh, setupAide &newOptions, occa::properties &kernelInfo);
-void meshOccaCloneDevice(mesh_t *donorMesh, mesh_t *mesh);
-
-// functions that call OCCA kernels
-void occaTest3D(mesh3D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy, dfloat *dqdz);
-
-// 
-void occaOptimizeGradientTet3D(mesh3D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy, dfloat *dqdz);
-void occaOptimizeGradientHex3D(mesh3D *mesh, dfloat *q, dfloat *dqdx, dfloat *dqdy, dfloat *dqdz);
-
-// serial face-node to face-node connection
-void meshConnectFaceNodes3D(mesh3D *mesh);
-
-//
-mesh3D *meshSetupTri3D(char *filename, int N, dfloat sphereRadius);
-mesh3D *meshSetupQuad3D(char *filename, int N, dfloat sphereRadius);
-mesh3D *meshSetupTet3D(char *filename, int N);
-mesh3D *meshSetupHex3D(char *filename, int N);
-
-void meshParallelConnectNodesHex3D(mesh3D *mesh);
-
-// halo connectivity information
-void meshHaloSetup3D(mesh3D *mesh);
-
-// perform halo exchange
-void meshHaloExchange3D(mesh3D *mesh,
-			size_t Nbytes,  // number of bytes per element
-			void *sourceBuffer, 
-			void *sendBuffer, 
-			void *recvBuffer);
-
-void meshHaloExchangeStart3D(mesh3D *mesh,
-			     size_t Nbytes,       // message size per element
-			     void *sendBuffer,    // temporary buffer
-			     void *recvBuffer);
-
-void meshHaloExchangeFinish3D(mesh3D *mesh);
-
-// build list of nodes on each face of the reference element
-void meshBuildFaceNodes3D(mesh3D *mesh);
-void meshBuildFaceNodesHex3D(mesh3D *mesh);
-
-
-
-dfloat meshMRABSetup3D(mesh3D *mesh, dfloat *EToDT, int maxLevels, dfloat finalTime); 
-
-//MRAB weighted mesh partitioning
-void meshMRABWeightedPartition3D(mesh3D *mesh, dfloat *weights,
-                                      int numLevels, int *levels);
-
-void interpolateHex3D(dfloat *Inter, dfloat *x, int N, dfloat *Ix, int M);
-
-#define norm3(a,b,c) ( sqrt((a)*(a)+(b)*(b)+(c)*(c)) )
-
-/* offsets for geometric factors */
-#define RXID 0  
-#define RYID 1  
-#define SXID 2  
-#define SYID 3
-#define  JID 4
-#define JWID 5
-#define IJWID 6
-#define RZID 7
-#define SZID 8  
-#define TXID 9  
-#define TYID 10  
-#define TZID 11  
-
-
-
-/* offsets for second order geometric factors */
-#define G00ID 0  
-#define G01ID 1  
-#define G11ID 2
-#define GWJID 3  
-#define G12ID 4
-#define G02ID 5
-#define G22ID 6  
-
-
-/* offsets for nx, ny, sJ, 1/J */
-#define NXID 0  
-#define NYID 1  
-#define SJID 2
-#define IJID 3
-#define IHID 4
-#define WSJID 5
-#define WIJID 6
-#define NZID 7
-#define STXID 8
-#define STYID 9  
-#define STZID 10 
-#define SBXID 11 
-#define SBYID 12 
-#define SBZID 13
-#define SURXID 14
-#define SURYID 15
-#define SURZID 16
-//
-//offsets for boltzmann PML variables
-#define QXID1 0  
-#define QXID2 1  
-#define QXID3 2
-#define QXID4 3  
-#define QXID5 4  
-#define QXID6 5  
-#define QXID8 6 
-//
-#define QYID1 7  
-#define QYID2 8  
-#define QYID3 9
-#define QYID4 10  
-#define QYID5 11  
-#define QYID7 12  
-#define QYID9 13 
-//
-#define QZID1 14  
-#define QZID2 15  
-#define QZID3 16
-#define QZID4 17  
-#define QZID6 18  
-#define QZID7 19  
-#define QZID10  20   
-
-mesh3D *meshSetupBoxHex3D(int N, setupAide &options);
-void meshConnectPeriodicFaceNodes3D(mesh3D *mesh, dfloat xper, dfloat yper, dfloat zper);
-
-// Mesh generation
-void NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t);
-void FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes);
-
-} // end C Linkage
-#endif
-
diff --git a/src/libP/include/timer.h b/src/libP/include/timer.h
deleted file mode 100644
index 1e757fd37..000000000
--- a/src/libP/include/timer.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef OCCA_TIMER_HEADER
-#define OCCA_TIMER_HEADER
-
-#include "occa.hpp"
-
-#include <iostream>
-#include <fstream>
-#include <assert.h>
-#include <vector>
-#include <stack>
-#include <map>
-#include <iomanip>
-#include <utility>
-#include <algorithm>
-
-namespace occa {
-
-  double currentTime();
-  
-  class timerTraits{
-  public:
-    double timeTaken;
-    double selfTime;
-    int    numCalls;
-    double flopCount;
-    double bandWidthCount;
-    int treeDepth;
-    std::vector<std::string> childs;
-
-    timerTraits();
-  };
-
-  class timer{
-
-    bool profileKernels;
-    bool profileApplication;
-    bool deviceInitialized;
-
-    occa::device occaHandle;
-
-  public:
-
-    timer();
-
-    void initTimer(const occa::device &deviceHandle);
-
-    // NBN: allow toggle from menu
-    inline void setKernelProfiling(bool b) { profileKernels = b; }
-    inline void setApplicationProfiling(bool b) { profileApplication = b; }
-
-    std::stack<std::string> keyStack;
-    std::stack<double> timeStack;
-
-    std::map<std::stack<std::string>, timerTraits> times;
-
-    void tic(std::string key);
-
-    double toc(std::string key);
-
-    double toc(std::string key, double flops);
-
-    double toc(std::string key, occa::kernel &kernel);
-
-    double toc(std::string key, occa::kernel &kernel, double flops);
-
-    double toc(std::string key, double flops, double bw);
-
-    double toc(std::string key, occa::kernel &kernel, double flops, double bw);
-
-    double print_recursively(std::vector<std::string> &childs,
-                             double parentTime,
-                             double overallTime);
-
-    // struct myclass {
-    //   bool operator() (std::pair<std::string, timerTraits> &a,
-    // 		     std::pair<std::string, timerTraits> &b){
-    //     return(a.second.selfTime > b.second.selfTime);
-    //   }
-    // } compareSelfTimes;
-
-
-    void printTimer();
-  };
-
-
-  extern timer globalTimer;
-
-  extern double dataTransferred;
-
-  void initTimer(const occa::device &deviceHandle);
-
-  void tic(std::string key);
-
-  double toc(std::string key);
-
-  double toc(std::string key, occa::kernel &kernel);
-
-  double toc(std::string key, double fp);
-
-  double toc(std::string key, occa::kernel &kernel, double fp);
-
-  double toc(std::string key, double fp, double bw);
-
-  double toc(std::string key, occa::kernel &kernel, double fp, double bw);
-
-  void printTimer();
-}
-
-extern "C" { // Start C Linkage
-void occaTimerTic(occa::device device,std::string name);
-void occaTimerToc(occa::device device,std::string name);
-} // End C Linkage
-
-
-#endif
-
diff --git a/src/libP/include/trace.hpp b/src/libP/include/trace.hpp
deleted file mode 100644
index 2fd80a5f6..000000000
--- a/src/libP/include/trace.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef __TRACE
-#define __TRACE
-
-// using backtrace
-// http://www.gnu.org/software/libc/manual/html_node/Backtraces.html
-// demangling output:
-// http://gcc.gnu.org/onlinedocs/libstdc++/manual/ext_demangling.html
-// spec: http://www.ib.cnea.gov.ar/~oop/biblio/libstdc++/namespaceabi.html
-
-
-#include <iostream>
-#include <execinfo.h>
-#include <stdlib.h>
-#include <cstring>
-#include <cxxabi.h>
-
-using std::ostream;
-//#include <headers2d.hpp>
-// GNU specialization. Not portable.
-ostream &trace(ostream &stream, int stack_size);
-
-#endif
diff --git a/src/libP/include/types.h b/src/libP/include/types.h
deleted file mode 100644
index 5db1f67e7..000000000
--- a/src/libP/include/types.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-
-//float data type
-#if 0
-#define DFLOAT_SINGLE
-#define dfloat float
-#define MPI_DFLOAT MPI_FLOAT
-#define dfloatFormat "%f"
-#define dfloatString "float"
-#else
-#define DFLOAT_DOUBLE
-#define dfloat double
-#define MPI_DFLOAT MPI_DOUBLE
-#define dfloatFormat "%lf"
-#define dfloatString "double"
-#endif
-
-//smoother float data type
-#if 1
-#define pfloat float
-#define MPI_PFLOAT MPI_FLOAT
-#define pfloatFormat "%f"
-#define pfloatString "float"
-#else
-#define pfloat double
-#define MPI_PFLOAT MPI_DOUBLE
-#define pfloatFormat "%lf"
-#define pfloatString "double"
-#endif
-
-//host index data type
-#if 0
-#define hlong int
-#define MPI_HLONG MPI_INT
-#define hlongFormat "%d"
-#define hlongString "int"
-#else
-#define hlong long long int
-#define MPI_HLONG MPI_LONG_LONG_INT
-#define hlongFormat "%lld"
-#define hlongString "long long int"
-#endif
-
-//device index data type
-#if 1
-#define dlong int
-#define MPI_DLONG MPI_INT
-#define dlongFormat "%d"
-#define dlongString "int"
-#else
-#define dlong long long int
-#define MPI_DLONG MPI_LONG_LONG_INT
-#define dlongFormat "%lld"
-#define dlongString "long long int"
-#endif
diff --git a/src/libP/parAlmond/hypre/install b/src/libP/parAlmond/hypre/install
deleted file mode 100755
index 3b5ddfd9d..000000000
--- a/src/libP/parAlmond/hypre/install
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-set -xe
-
-VER=2.18.2
-
-if [ "$1" == "clean" ]; then
-  rm -rf hypre lib obj include *.tgz *.o 2>/dev/null
-  exit 0
-fi
-
-if [ ! -f ./lib/libHYPRE.a ]; then
-  rm -rf hypre 2>/dev/null
-
-  HYPRE_TAR=hypre.tgz
-  if [ ! -f ${HYPRE_TAR} ]; then
-    wget --no-check-certificate -O ${HYPRE_TAR} https://github.com/hypre-space/hypre/archive/v${VER}.tar.gz 
-  fi
-
-  mkdir -p hypre
-  tar -zxf ${HYPRE_TAR} -C ./hypre --strip-components=1
-
-  cd hypre
-  mkdir -p build
-  cd build
-
-  set -x
-  cmake \
-  -DHYPRE_MIXEDINT=ON \
-  -DHYPRE_SINGLE=OFF \
-  -DHYPRE_INSTALL_PREFIX=`pwd`/../.. \
-  -DBUILD_SHARED_LIBS=OFF \
-  -DHYPRE_USING_HYPRE_BLAS=OFF \
-  -DHYPRE_USING_HYPRE_LAPACK=OFF \
-  -DHYPRE_USING_OPENMP=OFF \
-  -DMPI_C_COMPILER=`which $CC` \
-  -DHYPRE_USING_FEI=OFF \
-  ../src
-  set +x
-  cd ../..
-
-fi
-
-cd hypre/build
-make -j4 install
diff --git a/src/libP/solvers/elliptic/data/cavity2D.h b/src/libP/solvers/elliptic/data/cavity2D.h
deleted file mode 100644
index b95e31685..000000000
--- a/src/libP/solvers/elliptic/data/cavity2D.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-/* Dirichlet boundary condition   */
-#define ellipticDirichletCondition2D(t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = occaCos(OCCA_PI * x) * occaCos(OCCA_PI * y);   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-  }
-
-/* Neumann boundary condition   */
-#define ellipticNeumannCondition2D(t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = uM;    \
-    uxB = -OCCA_PI * occaSin(OCCA_PI * x) * occaCos(OCCA_PI * y);   \
-    uyB = -OCCA_PI * occaCos(OCCA_PI * x) * occaSin(OCCA_PI * y);   \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticBlockBoundary2D.h b/src/libP/solvers/elliptic/data/ellipticBlockBoundary2D.h
deleted file mode 100644
index 685369f13..000000000
--- a/src/libP/solvers/elliptic/data/ellipticBlockBoundary2D.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticBoundaryConditions2D(bc,vid,t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {                 \
-    if     (bc == 1) ellipticDirichletCondition2D(t,vid, x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB) \
-    else if(bc == 2) ellipticNeumannCondition2D(t,vid,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-    else           ellipticNeumannCondition2D(t,vid,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  }
-
-/*-----------------------------------------------------------------------------------------------*/
-/* Homogeneuous Boundary conditions used in ellipticAx.
-   /*-----------------------------------------------------------------------------------------------*/
-
-/* Homogeneous Dirichlet boundary condition   */
-#define ellipticHomogeneousDirichlet2D(uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = 0.f;   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-  }
-
-/* Homogeneous Neumann boundary condition   */
-#define ellipticHomogeneousNeumann2D(uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB = uM;     \
-    uxB = 0.f;   \
-    uyB = 0.f;   \
-  }
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticHomogeneousBC2D(bc,uM,uxM,uyM,uB,uxB,uyB)  \
-  {                 \
-    if     (bc == 1) ellipticHomogeneousDirichlet2D(uM,uxM,uyM,uB,uxB,uyB) \
-    else if(bc == 2) ellipticHomogeneousNeumann2D(uM,uxM,uyM,uB,uxB,uyB)  \
-    else           ellipticHomogeneousNeumann2D(uM,uxM,uyM,uB,uxB,uyB)  \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticBlockBoundary3D.h b/src/libP/solvers/elliptic/data/ellipticBlockBoundary3D.h
deleted file mode 100644
index e2759c31e..000000000
--- a/src/libP/solvers/elliptic/data/ellipticBlockBoundary3D.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticBoundaryConditions3D(bc,vid,t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {                 \
-    if     (bc == 1) ellipticDirichletCondition3D(t,vid,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB, \
-                                                  uzB) \
-    else if(bc == 2) ellipticNeumannCondition3D(t,vid,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-    else           ellipticNeumannCondition3D(t,vid, x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  }
-
-/*-----------------------------------------------------------------------------------------------*/
-/* Homogeneuous Boundary conditions used in ellipticAx.
-   /*-----------------------------------------------------------------------------------------------*/
-
-/* Homogeneous Dirichlet boundary condition   */
-#define ellipticHomogeneousDirichlet3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = 0.f;   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-    uzB = uzM;   \
-  }
-
-/* Homogeneous Neumann boundary condition   */
-#define ellipticHomogeneousNeumann3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB = uM;     \
-    uxB = 0.f;   \
-    uyB = 0.f;   \
-    uzB = 0.f;   \
-  }
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticHomogeneousBC3D(bc,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {                 \
-    if     (bc == 1) ellipticHomogeneousDirichlet3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB) \
-    else if(bc == 2) ellipticHomogeneousNeumann3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-    else           ellipticHomogeneousNeumann3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticBlockSine3D.h b/src/libP/solvers/elliptic/data/ellipticBlockSine3D.h
deleted file mode 100644
index e571cfcfb..000000000
--- a/src/libP/solvers/elliptic/data/ellipticBlockSine3D.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-#define VARIABLE 1
-#define PI 3.14159265358979323846
-
-#if VARIABLE
-#define ellipticCoefficient3D(vid,x, y, z, lambda_0, lambda_1)    \
-  {                                         \
-    if(vid == 0) { \
-      lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-      lambda_1 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-    } \
-    if(vid == 1) { \
-      lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-      lambda_1 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-    } \
-    if(vid == 2) { \
-      lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-      lambda_1 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-    } \
-  }
-
-/* forcing function   */
-#define ellipticForcing3D(vid,x, y, z, lambda, f)  \
-  {                                         \
-    if(vid == 0) { \
-      dfloat sxy  = sin(PI * x) * sin(PI * y); \
-      dfloat sxz  = sin(PI * x) * sin(PI * z); \
-      dfloat syz  = sin(PI * y) * sin(PI * z); \
-      dfloat sxyz  = sin(PI * x) * sin(PI * y) * sin(PI * z); \
-      f = sxyz * (1.0 + 3.0 * PI * PI + 1.0 * sxyz + 6.0 * PI * PI * sxyz) - PI * PI * \
-          (sxy * sxy + sxz * sxz + syz * syz); \
-    } \
-    if(vid == 1) { \
-      dfloat sxy  = sin(PI * x) * sin(PI * y); \
-      dfloat sxz  = sin(PI * x) * sin(PI * z); \
-      dfloat syz  = sin(PI * y) * sin(PI * z); \
-      dfloat sxyz  = sin(PI * x) * sin(PI * y) * sin(PI * z); \
-      f = sxyz * (1.0 + 3.0 * PI * PI + 1.0 * sxyz + 6.0 * PI * PI * sxyz) - PI * PI * \
-          (sxy * sxy + sxz * sxz + syz * syz); \
-    } \
-    if(vid == 2) { \
-      dfloat sxy  = sin(PI * x) * sin(PI * y); \
-      dfloat sxz  = sin(PI * x) * sin(PI * z); \
-      dfloat syz  = sin(PI * y) * sin(PI * z); \
-      dfloat sxyz  = sin(PI * x) * sin(PI * y) * sin(PI * z); \
-      f = sxyz * (1.0 + 3.0 * PI * PI + 1.0 * sxyz + 6.0 * PI * PI * sxyz) - PI * PI * \
-          (sxy * sxy + sxz * sxz + syz * syz); \
-    } \
-  }
-
-/* Dirichlet boundary condition   */
-#define ellipticDirichletCondition3D(t,vid,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    if(vid == 0) {   \
-      uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uxB = uxM;   \
-      uyB = uyM;   \
-      uzB = uzM;   \
-    } \
-    if(vid == 1) { \
-      uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uxB = uxM;   \
-      uyB = uyM;   \
-      uzB = uzM;   \
-    } \
-    if(vid == 2) { \
-      uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uxB = uxM;   \
-      uyB = uyM;   \
-      uzB = uzM;   \
-    } \
-  }
-
-/* Neumann boundary condition   */
-#define ellipticNeumannCondition3D(t, vid, x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    if(vid == 0) {   \
-      uB  = uM;    \
-      dfloat lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0; \
-      uxB = -lambda_0 * PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uyB = -lambda_0 * PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-      uzB = -lambda_0 * PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-    } \
-    if(vid == 1) {   \
-      uB  = uM;    \
-      dfloat lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0; \
-      uxB = -lambda_0 * PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uyB = -lambda_0 * PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-      uzB = -lambda_0 * PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-    } \
-    if(vid == 2) {   \
-      uB  = uM;    \
-      dfloat lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0; \
-      uxB = -lambda_0 * PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uyB = -lambda_0 * PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-      uzB = -lambda_0 * PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-    } \
-  }
-#else // if VARIABLE
-/* forcing function   */
-#define ellipticForcing3D(vid,x, y, z, lambda, f)  \
-  {                                         \
-    if(vid == 0) { \
-      f  = (3 * PI * PI + lambda) * sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-    } \
-    if(vid == 1) { \
-      f  = (3 * PI * PI + lambda) * sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-    } \
-    if(vid == 2) { \
-      f  = (3 * PI * PI + lambda) * sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-    } \
-  }
-
-/* Dirichlet boundary condition   */
-#define ellipticDirichletCondition3D(t,vid,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    if(vid == 0) {   \
-      uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uxB = uxM;   \
-      uyB = uyM;   \
-      uzB = uzM;   \
-    } \
-    if(vid == 1) { \
-      uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uxB = uxM;   \
-      uyB = uyM;   \
-      uzB = uzM;   \
-    } \
-    if(vid == 2) { \
-      uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uxB = uxM;   \
-      uyB = uyM;   \
-      uzB = uzM;   \
-    } \
-  }
-
-/* Neumann boundary condition   */
-#define ellipticNeumannCondition3D(t, vid, x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    if(vid == 0) {   \
-      uB  = uM;    \
-      uxB = -PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uyB = -PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-      uzB = -PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-    } \
-    if(vid == 1) {   \
-      uB  = uM;    \
-      uxB = -PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uyB = -PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-      uzB = -PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-    } \
-    if(vid == 2) {   \
-      uB  = uM;    \
-      uxB = -PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-      uyB = -PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-      uzB = -PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-    } \
-  }
-
-#endif
diff --git a/src/libP/solvers/elliptic/data/ellipticBoundary2D.h b/src/libP/solvers/elliptic/data/ellipticBoundary2D.h
deleted file mode 100644
index c1e20d2cc..000000000
--- a/src/libP/solvers/elliptic/data/ellipticBoundary2D.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticBoundaryConditions2D(bc,t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {                 \
-    if     (bc == 1) ellipticDirichletCondition2D(t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB) \
-    else if(bc == 2) ellipticNeumannCondition2D(t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-    else           ellipticNeumannCondition2D(t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  }
-
-/*-----------------------------------------------------------------------------------------------*/
-/* Homogeneuous Boundary conditions used in ellipticAx.
-   /*-----------------------------------------------------------------------------------------------*/
-
-/* Homogeneous Dirichlet boundary condition   */
-#define ellipticHomogeneousDirichlet2D(uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = 0.f;   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-  }
-
-/* Homogeneous Neumann boundary condition   */
-#define ellipticHomogeneousNeumann2D(uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB = uM;     \
-    uxB = 0.f;   \
-    uyB = 0.f;   \
-  }
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticHomogeneousBC2D(bc,uM,uxM,uyM,uB,uxB,uyB)  \
-  {                 \
-    if     (bc == 1) ellipticHomogeneousDirichlet2D(uM,uxM,uyM,uB,uxB,uyB) \
-    else if(bc == 2) ellipticHomogeneousNeumann2D(uM,uxM,uyM,uB,uxB,uyB)  \
-    else           ellipticHomogeneousNeumann2D(uM,uxM,uyM,uB,uxB,uyB)  \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticBoundary3D.h b/src/libP/solvers/elliptic/data/ellipticBoundary3D.h
deleted file mode 100644
index c69e4aaf3..000000000
--- a/src/libP/solvers/elliptic/data/ellipticBoundary3D.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticBoundaryConditions3D(bc,t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {                 \
-    if     (bc == 1) ellipticDirichletCondition3D(t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB) \
-    else if(bc == 2) ellipticNeumannCondition3D(t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-    else           ellipticNeumannCondition3D(t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  }
-
-/*-----------------------------------------------------------------------------------------------*/
-/* Homogeneuous Boundary conditions used in ellipticAx.
-   /*-----------------------------------------------------------------------------------------------*/
-
-/* Homogeneous Dirichlet boundary condition   */
-#define ellipticHomogeneousDirichlet3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = 0.f;   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-    uzB = uzM;   \
-  }
-
-/* Homogeneous Neumann boundary condition   */
-#define ellipticHomogeneousNeumann3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB = uM;     \
-    uxB = 0.f;   \
-    uyB = 0.f;   \
-    uzB = 0.f;   \
-  }
-
-/* Dirichlet 1, Neumann 2, Robin 3 (defaulted to Neumann for now) */
-#define ellipticHomogeneousBC3D(bc,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {                 \
-    if     (bc == 1) ellipticHomogeneousDirichlet3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB) \
-    else if(bc == 2) ellipticHomogeneousNeumann3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-    else           ellipticHomogeneousNeumann3D(uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticHomogeneous2D.h b/src/libP/solvers/elliptic/data/ellipticHomogeneous2D.h
deleted file mode 100644
index 4bf2ec65c..000000000
--- a/src/libP/solvers/elliptic/data/ellipticHomogeneous2D.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-/* Homogeneous Dirichlet boundary condition   */
-#define ellipticDirichletCondition2D(t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = 0.f;   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-  }
-
-/* Homogeneous Neumann boundary condition   */
-#define ellipticNeumannCondition2D(t,x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = uM;    \
-    uxB = 0.f;   \
-    uyB = 0.f;   \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticHomogeneous3D.h b/src/libP/solvers/elliptic/data/ellipticHomogeneous3D.h
deleted file mode 100644
index 859b4fc19..000000000
--- a/src/libP/solvers/elliptic/data/ellipticHomogeneous3D.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-/* Homogeneous Dirichlet boundary condition   */
-#define ellipticDirichletCondition3D(t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = 0.f;   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-    uzB = uzM;   \
-  }
-
-/* Homogeneous Neumann boundary condition   */
-#define ellipticNeumannCondition3D(t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = uM;    \
-    uxB = 0.f;   \
-    uyB = 0.f;   \
-    uzB = 0.f;   \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticSine2D.h b/src/libP/solvers/elliptic/data/ellipticSine2D.h
deleted file mode 100644
index ca56b9c6e..000000000
--- a/src/libP/solvers/elliptic/data/ellipticSine2D.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#define PI 3.14159265358979323846
-
-/* forcing function   */
-#define ellipticForcing2D(x, y, lambda, f)  \
-  {                                         \
-    f  = (2 * PI * PI + lambda) * sin(PI * x) * sin(PI * y);   \
-  }
-
-/* Dirichlet boundary condition   */
-#define ellipticDirichletCondition2D(x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = sin(PI * x) * sin(PI * y);   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-  }
-
-/* Neumann boundary condition   */
-#define ellipticNeumannCondition2D(x,y,nx,ny,uM,uxM,uyM,uB,uxB,uyB)  \
-  {              \
-    uB  = uM;    \
-    uxB = -PI * cos(PI * x) * sin(PI * y);   \
-    uyB = -PI * sin(PI * x) * cos(PI * y);   \
-  }
diff --git a/src/libP/solvers/elliptic/data/ellipticSine3D.h b/src/libP/solvers/elliptic/data/ellipticSine3D.h
deleted file mode 100644
index dd4fc0090..000000000
--- a/src/libP/solvers/elliptic/data/ellipticSine3D.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#define VARIABLE 1
-#define PI 3.14159265358979323846
-
-#if VARIABLE
-
-#define ellipticCoefficient3D(x, y, z, lambda_0, lambda_1)    \
-  {                                         \
-    lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-    lambda_1 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0;            \
-  }
-
-/* forcing function   */
-#define ellipticForcing3D(x, y, z, lambda, f)  \
-  {                                         \
-    dfloat sxy  = sin(PI * x) * sin(PI * y); \
-    dfloat sxz  = sin(PI * x) * sin(PI * z); \
-    dfloat syz  = sin(PI * y) * sin(PI * z); \
-    dfloat sxyz  = sin(PI * x) * sin(PI * y) * sin(PI * z); \
-    f = sxyz * (1.0 + 3.0 * PI * PI + 1.0 * sxyz + 6.0 * PI * PI * sxyz) - PI * PI * \
-        (sxy * sxy + sxz * sxz + syz * syz); \
-  }
-
-/* Dirichlet boundary condition   */
-#define ellipticDirichletCondition3D(t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-    uzB = uzM;   \
-  }
-
-/* Neumann boundary condition   */
-#define ellipticNeumannCondition3D(t, x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = uM;    \
-    dfloat lambda_0 = sin(PI * x) * sin(PI * y) * sin(PI * z) + 1.0; \
-    uxB = -lambda_0 * PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-    uyB = -lambda_0 * PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-    uzB = -lambda_0 * PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-  }
-
-#else // if VARIABLE
-
-/* forcing function   */
-#define ellipticForcing3D(x, y, z, lambda, f)  \
-  {                                         \
-    f  = (3 * PI * PI + lambda) * sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-  }
-
-/* Dirichlet boundary condition   */
-#define ellipticDirichletCondition3D(t,x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = sin(PI * x) * sin(PI * y) * sin(PI * z);   \
-    uxB = uxM;   \
-    uyB = uyM;   \
-    uzB = uzM;   \
-  }
-
-/* Neumann boundary condition   */
-#define ellipticNeumannCondition3D(t, x,y,z,nx,ny,nz,uM,uxM,uyM,uzM,uB,uxB,uyB,uzB)  \
-  {              \
-    uB  = uM;    \
-    uxB = -PI * cos(PI * x) * sin(PI * y) * sin(PI * z);   \
-    uyB = -PI * sin(PI * x) * cos(PI * y) * sin(PI * z);   \
-    uzB = -PI * sin(PI * x) * sin(PI * y) * cos(PI * z);   \
-  }
-
-#endif
diff --git a/src/libP/solvers/elliptic/okl/ellipticAddBCHex3D.okl b/src/libP/solvers/elliptic/okl/ellipticAddBCHex3D.okl
deleted file mode 100644
index 97de9abbd..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAddBCHex3D.okl
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticAddBCHex3D(const dlong Nelements,
-                                const dfloat t,
-                                @restrict const dfloat*  x,
-                                @restrict const dfloat*  y,
-                                @restrict const dfloat*  z,
-                                @restrict const int*  mapB,
-                                @restrict dfloat*  q)
-{
-  for(dlong e = 0; e < Nelements; e++; @outer(0))
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const dlong id = n + e * p_Np;
-      const int bc = mapB[n + e * p_Np];
-
-      dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-
-      if(bc == 1) {
-        ellipticBoundaryConditions3D(bc,
-                                     t,
-                                     x[id],
-                                     y[id],
-                                     z[id],
-                                     nx,
-                                     ny,
-                                     nz,
-                                     0.f,
-                                     0.f,
-                                     0.f,
-                                     0.f,
-                                     uP,
-                                     dudxP,
-                                     dudyP,
-                                     dudzP);
-        q[id] = uP;
-      }
-    }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAddBCQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticAddBCQuad2D.okl
deleted file mode 100644
index 540860887..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAddBCQuad2D.okl
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticAddBCQuad2D(const int Nelements,
-                                 const dfloat t,
-                                 @restrict const dfloat*  x,
-                                 @restrict const dfloat*  y,
-                                 @restrict const dfloat*  z,
-                                 @restrict const int*  mapB,
-                                 @restrict dfloat*  q)
-{
-  for(int e = 0; e < Nelements; e++; @outer(0))
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const int id = n + e * p_Np;
-      const int bc = mapB[n + e * p_Np];
-
-      dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-      if(bc == 1) {
-        ellipticBoundaryConditions2D(bc, t, x[id], y[id], 0.f, 0.f, \
-                                     0.f, 0.f, 0.f,                \
-                                     uP, dudxP, dudyP);
-        q[id] = uP;
-      }
-    }
-}
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/okl/ellipticAddBCTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticAddBCTet3D.okl
deleted file mode 100644
index 66a547437..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAddBCTet3D.okl
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticAddBCTet3D(const int Nelements,
-                                const dfloat t,
-                                @restrict const dfloat*  x,
-                                @restrict const dfloat*  y,
-                                @restrict const dfloat*  z,
-                                @restrict const int*  mapB,
-                                @restrict dfloat*  q)
-{
-  for(int e = 0; e < Nelements; e++; @outer(0))
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const int id = n + e * p_Np;
-      const int bc = mapB[n + e * p_Np];
-
-      dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-
-      if(bc == 1) {
-        ellipticBoundaryConditions3D(bc, t, x[id], y[id], z[id], nx, ny, nz, \
-                                     0.f, 0.f, 0.f, 0.f,               \
-                                     uP, dudxP, dudyP, dudzP);
-        q[id] = uP;
-      }
-    }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAddBCTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticAddBCTri2D.okl
deleted file mode 100644
index b8530f5d9..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAddBCTri2D.okl
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticAddBCTri2D(const int Nelements,
-                                const dfloat t,
-                                @restrict const dfloat*  x,
-                                @restrict const dfloat*  y,
-                                @restrict const dfloat*  z,
-                                @restrict const int*  mapB,
-                                @restrict dfloat*  q)
-{
-  for(int e = 0; e < Nelements; e++; @outer(0))
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const int id = n + e * p_Np;
-      const int bc = mapB[n + e * p_Np];
-
-      dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-      if(bc == 1) {
-        ellipticBoundaryConditions2D(bc, t, x[id], y[id], 0.f, 0.f, \
-                                     0.f, 0.f, 0.f,                \
-                                     uP, dudxP, dudyP);
-        q[id] = uP;
-      }
-    }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxIpdgBBTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticAxIpdgBBTri2D.okl
deleted file mode 100644
index 6dce5fb41..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxIpdgBBTri2D.okl
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-// nx,ny,nz,sJ,invJ - need WsJ
-
-@kernel void ellipticAxIpdgBBTri2D(const int Nelements,
-                                   @restrict const int*  vmapM,
-                                   @restrict const int*  vmapP,
-                                   const dfloat lambda,
-                                   const dfloat tau,
-                                   @restrict const dfloat*  vgeo,
-                                   @restrict const dfloat*  sgeo,
-                                   @restrict const int*  EToB,
-                                   @restrict const int*  D1ids,
-                                   @restrict const int*  D2ids,
-                                   @restrict const int*  D3ids,
-                                   @restrict const dfloat*  Dvals,
-                                   @restrict const dfloat*  L0vals,
-                                   @restrict const int*  ELids,
-                                   @restrict const dfloat*  ELvals,
-                                   @restrict const dfloat*  MM,
-                                   @restrict const dfloat4*  gradq,
-                                   @restrict dfloat*  Aq)
-{
-  for(int e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Np];
-    @shared dfloat s_dqdy[p_Np];
-    @shared dfloat s_lapq[p_Np];
-    @shared dfloat s_nxdq[p_NfacesNfp];
-    @shared dfloat s_nydq[p_NfacesNfp];
-    @shared dfloat s_lapflux[p_NfacesNfp];
-    @shared dfloat s_nxdq_copy[p_NfacesNfp];
-    @shared dfloat s_nydq_copy[p_NfacesNfp];
-    @shared dfloat s_lapflux_copy[p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Np];
-    @shared dfloat s_Lnydq[p_Np];
-    @exclusive int idM;
-    @exclusive dfloat nx, ny, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        // assume that this stores (qx, qy, qz, q) as dfloat4
-        const dfloat4 gradqn = gradq[e * p_Np + n];
-
-        s_dqdx[n] = gradqn.x;
-        s_dqdy[n] = gradqn.y;
-        s_lapq[n] = lambda * gradqn.w;
-      }
-
-      if(n < p_NfacesNfp) {
-        const int id  = n + e * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-        const int idP = vmapP[id];
-        // find face that owns this node
-        const int face = n / p_Nfp;
-
-        const dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
-        dfloat4 gradqP = gradq[idP];
-
-        // load surface geofactors for this face
-        const int sid = p_Nsgeo * (e * p_Nfaces + face);
-        nx   = sgeo[sid + p_NXID];
-        ny   = sgeo[sid + p_NYID];
-        sJ   = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        const int bc = EToB[face + p_Nfaces * e];
-        if(bc > 0) {
-          ellipticHomogeneousBC2D(bc, gradqM.w, gradqM.x, gradqM.y, gradqP.w, gradqP.x, gradqP.y);
-          gradqP = 2 * gradqP - gradqM;
-        }
-
-        const dfloat dq = gradqP.w - gradqM.w;
-        const dfloat sc = 0.5 * invJ * sJ;
-
-        s_nxdq[n] = sc * nx * dq;
-        s_nydq[n] = sc * ny * dq;
-
-        s_lapflux[n] = sc * (-nx * (gradqP.x - gradqM.x)
-                             - ny * (gradqP.y - gradqM.y)
-                             - tau * hinv * dq);
-      }
-    }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_Nfp * p_Nfaces) {
-        const int id = n % p_Nfp;
-
-        dfloat tmpnxdq = L0vals[id + p_Nfp] * s_nxdq[n];
-        dfloat tmpnydq = L0vals[id + p_Nfp] * s_nydq[n];
-
-        if (id > 0) {
-          tmpnxdq += L0vals[id] * s_nxdq[n - 1]; // add previous term
-          tmpnydq += L0vals[id] * s_nydq[n - 1]; // add previous term
-        }
-        if (id < p_Nfp - 1) {
-          tmpnxdq += L0vals[id + 2 * p_Nfp] * s_nxdq[n + 1];// add next term
-          tmpnydq += L0vals[id + 2 * p_Nfp] * s_nydq[n + 1];// add next term
-        }
-        s_nxdq_copy[n] = tmpnxdq;
-        s_nydq_copy[n] = tmpnydq;
-      }
-
-    @barrier("local");
-
-    //lift reduction
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const int gid = e * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-
-        dfloat Lnxdq = 0;
-        dfloat Lnydq = 0;
-
-#pragma unroll p_max_EL_nnz
-        for (int m = 0; m < p_max_EL_nnz; ++m) {
-          const int iid = n + m * p_Np;
-          const dfloat ELval = ELvals[iid];
-          const int ELid = ELids[iid];
-          Lnxdq += ELval * s_nxdq_copy[ELid];
-          Lnydq += ELval * s_nydq_copy[ELid];
-        }
-
-        const dfloat dqdx = s_dqdx[n] + Lnxdq;
-        const dfloat dqdy = s_dqdy[n] + Lnydq;
-        s_dqdx[n] = drdx * dqdx + drdy * dqdy; // abuse of notation
-        s_dqdy[n] = dsdx * dqdx + dsdy * dqdy;
-
-        s_Lnxdq[n] = Lnxdq;
-        s_Lnydq[n] = Lnydq;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        const int id = idM % p_Np;
-        s_lapflux[n] += sJ * invJ * (nx * s_Lnxdq[id] + ny * s_Lnydq[id]);
-      }
-
-      if(n < p_Np) {
-        const int D1i1 = D1ids[n];
-        const int D2i1 = D2ids[n];
-        const int D3i1 = D3ids[n];
-        const dfloat Dval1 = Dvals[n];
-
-        const int D1i2 = D1ids[n + p_Np];
-        const int D2i2 = D2ids[n + p_Np];
-        const int D3i2 = D3ids[n + p_Np];
-        const dfloat Dval2 = Dvals[n + p_Np];
-
-        const int D1i3 = D1ids[n + 2 * p_Np];
-        const int D2i3 = D2ids[n + 2 * p_Np];
-        const int D3i3 = D3ids[n + 2 * p_Np];
-        const dfloat Dval3 = Dvals[n + 2 * p_Np];
-
-        const dfloat lapr = .5f * (Dval1 * (s_dqdx[D2i1] - s_dqdx[D1i1]) +
-                                   Dval2 * (s_dqdx[D2i2] - s_dqdx[D1i2]) +
-                                   Dval3 * (s_dqdx[D2i3] - s_dqdx[D1i3]));
-        const dfloat laps = .5f * (Dval1 * (s_dqdy[D3i1] - s_dqdy[D1i1]) +
-                                   Dval2 * (s_dqdy[D3i2] - s_dqdy[D1i2]) +
-                                   Dval3 * (s_dqdy[D3i3] - s_dqdy[D1i3]));
-        s_lapq[n] -= lapr + laps;
-      }
-    }
-
-    @barrier("local");
-
-    // lift remaining surface terms
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_Nfp * p_Nfaces) {
-        const int id = n % p_Nfp;
-
-        dfloat tmplapflux = L0vals[id + p_Nfp] * s_lapflux[n];
-        if (id > 0) tmplapflux += L0vals[id] * s_lapflux[n - 1];   // add previous term
-        if (id < p_Nfp - 1) tmplapflux += L0vals[id + 2 * p_Nfp] * s_lapflux[n + 1]; // add next term
-
-        s_lapflux_copy[n] = tmplapflux;
-      }
-
-    @barrier("local");
-
-    //lift reduction
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lap = 0;
-
-#pragma unroll p_max_EL_nnz
-        for (int m = 0; m < p_max_EL_nnz; ++m) {
-          const int iid = n + m * p_Np;
-          const dfloat ELval = ELvals[iid];
-          const int ELid = ELids[iid];
-          lap += ELval * s_lapflux_copy[ELid];
-        }
-
-        s_lapq[n] += lap;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[e * p_Nvgeo + p_JID];
-
-        dfloat Mlapq = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapq += MM[n + i * p_Np] * s_lapq[i];
-
-        Aq[n + e * p_Np] = J * Mlapq;
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialAxIpdgBBTri2D(const int Nelements,
-                                          @restrict const int*  elementList,
-                                          @restrict const int*  vmapM,
-                                          @restrict const int*  vmapP,
-                                          const dfloat lambda,
-                                          const dfloat tau,
-                                          @restrict const dfloat*  vgeo,
-                                          @restrict const dfloat*  sgeo,
-                                          @restrict const int*  EToB,
-                                          @restrict const int*  D1ids,
-                                          @restrict const int*  D2ids,
-                                          @restrict const int*  D3ids,
-                                          @restrict const dfloat*  Dvals,
-                                          @restrict const dfloat*  L0vals,
-                                          @restrict const int*  ELids,
-                                          @restrict const dfloat*  ELvals,
-                                          @restrict const dfloat*  MM,
-                                          @restrict const dfloat4*  gradq,
-                                          @restrict dfloat*  Aq)
-{
-  for(int e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Np];
-    @shared dfloat s_dqdy[p_Np];
-    @shared dfloat s_lapq[p_Np];
-    @shared dfloat s_nxdq[p_NfacesNfp];
-    @shared dfloat s_nydq[p_NfacesNfp];
-    @shared dfloat s_lapflux[p_NfacesNfp];
-    @shared dfloat s_nxdq_copy[p_NfacesNfp];
-    @shared dfloat s_nydq_copy[p_NfacesNfp];
-    @shared dfloat s_lapflux_copy[p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Np];
-    @shared dfloat s_Lnydq[p_Np];
-    @exclusive int element;
-    @exclusive int idM;
-    @exclusive dfloat nx, ny, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      element = elementList[e];
-      if(n < p_Np) {
-        // assume that this stores (qx, qy, qz, q) as dfloat4
-        const dfloat4 gradqn = gradq[element * p_Np + n];
-
-        s_dqdx[n] = gradqn.x;
-        s_dqdy[n] = gradqn.y;
-        s_lapq[n] = lambda * gradqn.w;
-      }
-
-      if(n < p_NfacesNfp) {
-        const int id  = n + element * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-        const int idP = vmapP[id];
-        // find face that owns this node
-        const int face = n / p_Nfp;
-
-        const dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
-        dfloat4 gradqP = gradq[idP];
-
-        // load surface geofactors for this face
-        const int sid = p_Nsgeo * (element * p_Nfaces + face);
-        nx   = sgeo[sid + p_NXID];
-        ny   = sgeo[sid + p_NYID];
-        sJ   = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        const int bc = EToB[face + p_Nfaces * element];
-        if(bc > 0) {
-          ellipticHomogeneousBC2D(bc, gradqM.w, gradqM.x, gradqM.y, gradqP.w, gradqP.x, gradqP.y);
-          gradqP = 2 * gradqP - gradqM;
-        }
-
-        const dfloat dq = gradqP.w - gradqM.w;
-        const dfloat sc = 0.5 * invJ * sJ;
-
-        s_nxdq[n] = sc * nx * dq;
-        s_nydq[n] = sc * ny * dq;
-
-        s_lapflux[n] = sc * (-nx * (gradqP.x - gradqM.x)
-                             - ny * (gradqP.y - gradqM.y)
-                             - tau * hinv * dq);
-      }
-    }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_Nfp * p_Nfaces) {
-        const int id = n % p_Nfp;
-
-        dfloat tmpnxdq = L0vals[id + p_Nfp] * s_nxdq[n];
-        dfloat tmpnydq = L0vals[id + p_Nfp] * s_nydq[n];
-
-        if (id > 0) {
-          tmpnxdq += L0vals[id] * s_nxdq[n - 1]; // add previous term
-          tmpnydq += L0vals[id] * s_nydq[n - 1]; // add previous term
-        }
-        if (id < p_Nfp - 1) {
-          tmpnxdq += L0vals[id + 2 * p_Nfp] * s_nxdq[n + 1];// add next term
-          tmpnydq += L0vals[id + 2 * p_Nfp] * s_nydq[n + 1];// add next term
-        }
-        s_nxdq_copy[n] = tmpnxdq;
-        s_nydq_copy[n] = tmpnydq;
-      }
-
-    @barrier("local");
-
-    //lift reduction
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const int gid = element * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-
-        dfloat Lnxdq = 0;
-        dfloat Lnydq = 0;
-
-#pragma unroll p_max_EL_nnz
-        for (int m = 0; m < p_max_EL_nnz; ++m) {
-          const int iid = n + m * p_Np;
-          const dfloat ELval = ELvals[iid];
-          const int ELid = ELids[iid];
-          Lnxdq += ELval * s_nxdq_copy[ELid];
-          Lnydq += ELval * s_nydq_copy[ELid];
-        }
-
-        const dfloat dqdx = s_dqdx[n] + Lnxdq;
-        const dfloat dqdy = s_dqdy[n] + Lnydq;
-        s_dqdx[n] = drdx * dqdx + drdy * dqdy; // abuse of notation
-        s_dqdy[n] = dsdx * dqdx + dsdy * dqdy;
-
-        s_Lnxdq[n] = Lnxdq;
-        s_Lnydq[n] = Lnydq;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        const int id = idM % p_Np;
-        s_lapflux[n] += sJ * invJ * (nx * s_Lnxdq[id] + ny * s_Lnydq[id]);
-      }
-
-      if(n < p_Np) {
-        const int D1i1 = D1ids[n];
-        const int D2i1 = D2ids[n];
-        const int D3i1 = D3ids[n];
-        const dfloat Dval1 = Dvals[n];
-
-        const int D1i2 = D1ids[n + p_Np];
-        const int D2i2 = D2ids[n + p_Np];
-        const int D3i2 = D3ids[n + p_Np];
-        const dfloat Dval2 = Dvals[n + p_Np];
-
-        const int D1i3 = D1ids[n + 2 * p_Np];
-        const int D2i3 = D2ids[n + 2 * p_Np];
-        const int D3i3 = D3ids[n + 2 * p_Np];
-        const dfloat Dval3 = Dvals[n + 2 * p_Np];
-
-        const dfloat lapr = .5f * (Dval1 * (s_dqdx[D2i1] - s_dqdx[D1i1]) +
-                                   Dval2 * (s_dqdx[D2i2] - s_dqdx[D1i2]) +
-                                   Dval3 * (s_dqdx[D2i3] - s_dqdx[D1i3]));
-        const dfloat laps = .5f * (Dval1 * (s_dqdy[D3i1] - s_dqdy[D1i1]) +
-                                   Dval2 * (s_dqdy[D3i2] - s_dqdy[D1i2]) +
-                                   Dval3 * (s_dqdy[D3i3] - s_dqdy[D1i3]));
-
-        s_lapq[n] -= (lapr + laps);
-      }
-    }
-
-    @barrier("local");
-
-    // lift remaining surface terms
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_Nfp * p_Nfaces) {
-        const int id = n % p_Nfp;
-
-        dfloat tmplapflux = L0vals[id + p_Nfp] * s_lapflux[n];
-        if (id > 0) tmplapflux += L0vals[id] * s_lapflux[n - 1];   // add previous term
-        if (id < p_Nfp - 1) tmplapflux += L0vals[id + 2 * p_Nfp] * s_lapflux[n + 1]; // add next term
-
-        s_lapflux_copy[n] = tmplapflux;
-      }
-
-    @barrier("local");
-
-    //lift reduction
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lap = 0;
-
-#pragma unroll p_max_EL_nnz
-        for (int m = 0; m < p_max_EL_nnz; ++m) {
-          const int iid = n + m * p_Np;
-          const dfloat ELval = ELvals[iid];
-          const int ELid = ELids[iid];
-          lap += ELval * s_lapflux_copy[ELid];
-        }
-
-        s_lapq[n] += lap;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[element * p_Nvgeo + p_JID];
-
-        dfloat Mlapq = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapq += MM[n + i * p_Np] * s_lapq[i];
-
-        Aq[n + element * p_Np] = J * Mlapq;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxIpdgHex3D.okl b/src/libP/solvers/elliptic/okl/ellipticAxIpdgHex3D.okl
deleted file mode 100644
index c303daaca..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxIpdgHex3D.okl
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// increments gradients of pressure with pseudo-gradients at faces and
-// increments rhs with pseudo-gradient/penalty terms at faces (need to double check scaling with tau)
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-
-// nx,ny,nz,sJ,invJ - need WsJ
-#define surfaceTerms(emap, \
-                     sk, \
-                     face, \
-                     m, \
-                     i, \
-                     j, \
-                     tau, \
-                     sgeo, \
-                     vmapM, \
-                     vmapP, \
-                     EToB, \
-                     gradq, \
-                     s_dqdx, \
-                     s_dqdy, \
-                     s_dqdz, \
-                     s_Aq) \
-  {                                                                       \
-    const dlong idM = vmapM[sk];                                       \
-    const dlong idP = vmapP[sk];                                       \
-    const dfloat nx  = sgeo[sk * p_Nsgeo + p_NXID];                         \
-    const dfloat ny  = sgeo[sk * p_Nsgeo + p_NYID];                         \
-    const dfloat nz  = sgeo[sk * p_Nsgeo + p_NZID];                         \
-    const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];                        \
-    const dfloat hinv = sgeo[sk * p_Nsgeo + p_IHID];                         \
-    const dfloat4 gradqM = gradq[idM];                                  \
-    dfloat4 gradqP = gradq[idP];                                        \
-    const int bc = EToB[face + p_Nfaces * emap];                            \
-    if(bc > 0) {                                                          \
-      ellipticHomogeneousBC3D(bc, \
-                              gradqM.w, \
-                              gradqM.x, \
-                              gradqM.y, \
-                              gradqM.z, \
-                              gradqP.w, \
-                              gradqP.x, \
-                              gradqP.y, \
-                              gradqP.z); \
-      gradqP.x = 2.f * gradqP.x - gradqM.x;                               \
-      gradqP.y = 2.f * gradqP.y - gradqM.y;                               \
-      gradqP.z = 2.f * gradqP.z - gradqM.z;                               \
-      gradqP.w = 2.f * gradqP.w - gradqM.w;                               \
-    }                                                                   \
-    const dfloat dq = gradqP.w - gradqM.w;                              \
-    s_dqdx[m][j][i] = 0.5f * WsJ * nx * dq;                                   \
-    s_dqdy[m][j][i] = 0.5f * WsJ * ny * dq;                                   \
-    s_dqdz[m][j][i] = 0.5f * WsJ * nz * dq;                                   \
-    s_Aq[m][j][i] = -0.5f * WsJ * \
-                    (nx * (gradqP.x + gradqM.x) +  ny * (gradqP.y + gradqM.y) + nz * \
-                     (gradqP.z + gradqM.z) +  tau * \
-                     hinv * dq); \
-  }
-
-// (grad phi, grad q) + ([phi], n.{grad q}) + ({grad phi}, n[u]) + (tau[phi],[u])
-
-@kernel void ellipticAxIpdgHex3D(const dlong Nelements,
-                                 @restrict const dlong*  vmapM,
-                                 @restrict const dlong*  vmapP,
-                                 const dfloat lambda,
-                                 const dfloat tau,
-                                 @restrict const dfloat*  vgeo,
-                                 @restrict const dfloat*  sgeo,
-                                 @restrict const int*  EToB,
-                                 @restrict const dfloat*  const D,
-                                 @restrict const dfloat*  LIFTT,
-                                 @restrict const dfloat*  MM,
-                                 @restrict const dfloat4*  gradq,
-                                 @restrict dfloat*  Aq)
-{
-#if 0
-  // assume the following are precomputed:
-  // p, px, py at SEM nodes
-  // +/- traces of p, px, py at SEM surface nodes
-
-  0 <= i,j,k,m <= N AND 0 <= e < Nelements
-
-    (phix, px) _e
-  + (phiy, py)_e
-  + (phix, nx* (p + -p -) / 2)_de
-  + (phiy, ny* (p + -p -) / 2)_de
-  - (phi -, nx* (px + +px -) / 2)_de
-  - (phi -, ny* (py + +py -) / 2)_de
-  - (phi -, tau* (p + -p -) / 2) _de
-
-  // here w is one component of the product TP quadrature weights
-    (phir, rx* (px + Fsc * nx * dp) + ry* (py + Fsc * ny * dp) + rz* (pz + Fsc * nz * dp)) )_e
-  +   (phir, sx* (px + Fsc * nx * dp) + sy * (py + Fsc * ny * dp) + sz * (pz + Fsc * nz * dp)) )_e
-  +   (phir, tx* (px + Fsc* nx* dp) + ty* (py + Fsc* ny* dp) + tz* (pz + Fsc* nz* dp)) ) _e
-    (phi -,
-    Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) / 2) _e
-
-  px = > px + Fsc * nx * dp (i.e.add pseudo - gradient at end points
-                             py = > py + Fsc * ny * dp
-                                  pz = > pz + Fsc * nz * dp
-                                       Fsc = delta * (Js / J) * (1 / w)
-                                             dp = (p + -p -) / 2;
-
-                             // simplify
-                             (phir, rx * px + ry * py + rz * pz) ) _e
-       +   (phir, sx* px + sy * py + sz * pz) )_e
-       +   (phir, tx* px + ty * py + tz * pz) )_e
-       +   (phi -,
-            Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) /
-            2)_e
-
-#endif
-
-  for(dlong e = 0; e < Nelements; ++e; @outer(0))
-  {
-    @shared dfloat s_dqdx[2][p_Nq][p_Nq];
-    @shared dfloat s_dqdy[2][p_Nq][p_Nq];
-    @shared dfloat s_dqdz[2][p_Nq][p_Nq];
-    @shared dfloat s_Aq[2][p_Nq][p_Nq];
-
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dfloat r_dqdx[p_Nq], r_dqdy[p_Nq], r_dqdz[p_Nq], r_Aq[p_Nq];
-    @exclusive dfloat r_dqdt;
-
-    @exclusive dlong emap;
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        emap = e;
-
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          // assume that this stores (p, px, py, pz) as dfloat4
-          const dfloat4 gradqn = gradq[e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i];
-          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID * p_Np];
-
-          r_dqdx[k] = JW * gradqn.x;
-          r_dqdy[k] = JW * gradqn.y;
-          r_dqdz[k] = JW * gradqn.z;
-          r_Aq[k] = JW * lambda * gradqn.w;
-        }
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-    }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + j * p_Nq + i;
-        const dlong sk5 = e * p_Nfp * p_Nfaces + 5 * p_Nfp + j * p_Nq + i;
-
-        //        surfaceTerms(emap,sk0,0,0,i,j);
-        surfaceTerms(emap,sk0, 0, 0, i, j, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-
-        //        surfaceTerms(emap,sk5,5,1,i,j);
-        surfaceTerms(emap,sk5, 5, 1, i, j, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-      }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //face 0
-        r_dqdx[0] += s_dqdx[0][j][i];
-        r_dqdy[0] += s_dqdy[0][j][i];
-        r_dqdz[0] += s_dqdz[0][j][i];
-        r_Aq  [0] += s_Aq  [0][j][i];
-
-        //face 5
-        r_dqdx[p_Nq - 1] += s_dqdx[1][j][i];
-        r_dqdy[p_Nq - 1] += s_dqdy[1][j][i];
-        r_dqdz[p_Nq - 1] += s_dqdz[1][j][i];
-        r_Aq  [p_Nq - 1] += s_Aq  [1][j][i];
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + k * p_Nq + i;
-        const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + k * p_Nq + i;
-
-        //        surfaceTerms(emap,sk1,1,0,i,k);
-        surfaceTerms(emap,sk1, 1, 0, i, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-
-        //        surfaceTerms(emap,sk3,3,1,i,k);
-        surfaceTerms(emap,sk3, 3, 1, i, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (j == 0) {//face 1
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[0][k][i];
-            r_dqdy[k] += s_dqdy[0][k][i];
-            r_dqdz[k] += s_dqdz[0][k][i];
-            r_Aq  [k] += s_Aq  [0][k][i];
-          }
-        }
-        if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[1][k][i];
-            r_dqdy[k] += s_dqdy[1][k][i];
-            r_dqdz[k] += s_dqdz[1][k][i];
-            r_Aq  [k] += s_Aq  [1][k][i];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-        const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + k * p_Nq + j;
-        const dlong sk4 = e * p_Nfp * p_Nfaces + 4 * p_Nfp + k * p_Nq + j;
-
-        //        surfaceTerms(emap,sk2,2,0,j,k);
-        surfaceTerms(emap,sk2, 2, 0, j, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-
-        //surfaceTerms(emap,sk4,4,1,j,k);
-        surfaceTerms(emap,sk4, 4, 1, j, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-      }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[0][k][j];
-            r_dqdy[k] += s_dqdy[0][k][j];
-            r_dqdz[k] += s_dqdz[0][k][j];
-            r_Aq  [k] += s_Aq  [0][k][j];
-          }
-        }
-        if (i == 0) {//face 4
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[1][k][j];
-            r_dqdy[k] += s_dqdy[1][k][j];
-            r_dqdz[k] += s_dqdz[1][k][j];
-            r_Aq  [k] += s_Aq  [1][k][j];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    //layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; ++k) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo;
-
-          const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-          const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-          const dfloat drdz = vgeo[gid + p_RZID * p_Np];
-
-          const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-          const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-          const dfloat dsdz = vgeo[gid + p_SZID * p_Np];
-
-          const dfloat dtdx = vgeo[gid + p_TXID * p_Np];
-          const dfloat dtdy = vgeo[gid + p_TYID * p_Np];
-          const dfloat dtdz = vgeo[gid + p_TZID * p_Np];
-
-          const dfloat dqdx = r_dqdx[k];
-          const dfloat dqdy = r_dqdy[k];
-          const dfloat dqdz = r_dqdz[k];
-
-          s_dqdx[0][j][i] = (drdx * dqdx + drdy * dqdy + drdz * dqdz);
-          s_dqdy[0][j][i] = (dsdx * dqdx + dsdy * dqdy + dsdz * dqdz);
-          r_dqdt = (dtdx * dqdx + dtdy * dqdy + dtdz * dqdz);
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          dfloat dr = 0, ds = 0;
-
-#pragma unroll p_Nq
-          for(int n = 0; n < p_Nq; ++n) {
-            dr += s_D[n][i] * s_dqdx[0][j][n];
-            r_Aq[n] += s_D[k][n] * r_dqdt; // DT(m,k)*ut(i,j,k,e)
-            ds += s_D[n][j] * s_dqdy[0][n][i];
-          }
-
-          r_Aq[k] += dr + ds;
-        }
-      }
-
-      @barrier("local");
-    }
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          Aq[id] = r_Aq[k];
-        }
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialAxIpdgHex3D(const dlong Nelements,
-                                        @restrict const dlong*  elementList,
-                                        @restrict const dlong*  vmapM,
-                                        @restrict const dlong*  vmapP,
-                                        const dfloat lambda,
-                                        const dfloat tau,
-                                        @restrict const dfloat*  vgeo,
-                                        @restrict const dfloat*  sgeo,
-                                        @restrict const int*  EToB,
-                                        @restrict const dfloat*  const D,
-                                        @restrict const dfloat*  LIFTT,
-                                        @restrict const dfloat*  MM,
-                                        @restrict const dfloat4*  gradq,
-                                        @restrict dfloat*  Aq)
-{
-#if 0
-  // assume the following are precomputed:
-  // p, px, py at SEM nodes
-  // +/- traces of p, px, py at SEM surface nodes
-
-  0 <= i,j,k,m <= N AND 0 <= e < Nelements
-
-    (phix, px) _e
-  + (phiy, py)_e
-  + (phix, nx* (p + -p -) / 2)_de
-  + (phiy, ny* (p + -p -) / 2)_de
-  - (phi -, nx* (px + +px -) / 2)_de
-  - (phi -, ny* (py + +py -) / 2)_de
-  - (phi -, tau* (p + -p -) / 2) _de
-
-  // here w is one component of the product TP quadrature weights
-    (phir, rx* (px + Fsc* nx* dp) + ry* (py + Fsc* ny* dp) + rz* (pz + Fsc* nz* dp)) )_e
-  +   (phir, sx* (px + Fsc * nx * dp) + sy * (py + Fsc * ny * dp) + sz * (pz + Fsc * nz * dp)) )_e
-  +   (phir, tx* (px + Fsc* nx* dp) + ty* (py + Fsc* ny* dp) + tz* (pz + Fsc* nz* dp)) ) _e
-    (phi -,
-    Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) / 2) _e
-
-  px = > px + Fsc * nx * dp (i.e.add pseudo - gradient at end points
-                             py = > py + Fsc * ny * dp
-                                  pz = > pz + Fsc * nz * dp
-                                       Fsc = delta * (Js / J) * (1 / w)
-                                             dp = (p + -p -) / 2;
-
-                             // simplify
-                             (phir, rx * px + ry * py + rz * pz) ) _e
-       +   (phir, sx* px + sy * py + sz * pz) )_e
-       +   (phir, tx* px + ty * py + tz * pz) )_e
-       +   (phi -,
-            Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) /
-            2)_e
-
-#endif
-
-  for(dlong e = 0; e < Nelements; ++e; @outer(0))
-  {
-    @shared dfloat s_dqdx[2][p_Nq][p_Nq];
-    @shared dfloat s_dqdy[2][p_Nq][p_Nq];
-    @shared dfloat s_dqdz[2][p_Nq][p_Nq];
-    @shared dfloat s_Aq[2][p_Nq][p_Nq];
-
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dfloat r_dqdx[p_Nq], r_dqdy[p_Nq], r_dqdz[p_Nq], r_Aq[p_Nq];
-    @exclusive dfloat r_dqdt;
-
-    @exclusive dlong emap;
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        emap = elementList[e];
-
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          // assume that this stores (p, px, py, pz) as dfloat4
-          const dfloat4 gradqn = gradq[emap * p_Np + k * p_Nq * p_Nq + j * p_Nq + i];
-          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + emap * p_Np * p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID * p_Np];
-
-          r_dqdx[k] = JW * gradqn.x;
-          r_dqdy[k] = JW * gradqn.y;
-          r_dqdz[k] = JW * gradqn.z;
-          r_Aq[k] = JW * lambda * gradqn.w;
-        }
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-    }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk0 = emap * p_Nfp * p_Nfaces + 0 * p_Nfp + j * p_Nq + i;
-        const dlong sk5 = emap * p_Nfp * p_Nfaces + 5 * p_Nfp + j * p_Nq + i;
-
-        //        surfaceTerms(emap,sk0,0,0,i,j);
-        surfaceTerms(emap,sk0, 0, 0, i, j, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-
-        //        surfaceTerms(emap,sk5,5,1,i,j);
-        surfaceTerms(emap,sk5, 5, 1, i, j, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-      }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //face 0
-        r_dqdx[0] += s_dqdx[0][j][i];
-        r_dqdy[0] += s_dqdy[0][j][i];
-        r_dqdz[0] += s_dqdz[0][j][i];
-        r_Aq  [0] += s_Aq  [0][j][i];
-
-        //face 5
-        r_dqdx[p_Nq - 1] += s_dqdx[1][j][i];
-        r_dqdy[p_Nq - 1] += s_dqdy[1][j][i];
-        r_dqdz[p_Nq - 1] += s_dqdz[1][j][i];
-        r_Aq  [p_Nq - 1] += s_Aq  [1][j][i];
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk1 = emap * p_Nfp * p_Nfaces + 1 * p_Nfp + k * p_Nq + i;
-        const dlong sk3 = emap * p_Nfp * p_Nfaces + 3 * p_Nfp + k * p_Nq + i;
-
-        //        surfaceTerms(emap,sk1,1,0,i,k);
-        //        surfaceTerms(emap,sk3,3,1,i,k);
-
-        //        surfaceTerms(emap,sk1,1,0,i,k);
-        surfaceTerms(emap,sk1, 1, 0, i, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-
-        //        surfaceTerms(emap,sk3,3,1,i,k);
-        surfaceTerms(emap,sk3, 3, 1, i, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (j == 0) {//face 1
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[0][k][i];
-            r_dqdy[k] += s_dqdy[0][k][i];
-            r_dqdz[k] += s_dqdz[0][k][i];
-            r_Aq  [k] += s_Aq  [0][k][i];
-          }
-        }
-        if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[1][k][i];
-            r_dqdy[k] += s_dqdy[1][k][i];
-            r_dqdz[k] += s_dqdz[1][k][i];
-            r_Aq  [k] += s_Aq  [1][k][i];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-        const dlong sk2 = emap * p_Nfp * p_Nfaces + 2 * p_Nfp + k * p_Nq + j;
-        const dlong sk4 = emap * p_Nfp * p_Nfaces + 4 * p_Nfp + k * p_Nq + j;
-
-        //        surfaceTerms(emap,sk2,2,0,j,k);
-        surfaceTerms(emap,sk2, 2, 0, j, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-
-        //surfaceTerms(emap,sk4,4,1,j,k);
-        surfaceTerms(emap,sk4, 4, 1, j, k, tau,
-                     sgeo, vmapM, vmapP, EToB, gradq,
-                     s_dqdx, s_dqdy, s_dqdz, s_Aq);
-      }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[0][k][j];
-            r_dqdy[k] += s_dqdy[0][k][j];
-            r_dqdz[k] += s_dqdz[0][k][j];
-            r_Aq  [k] += s_Aq  [0][k][j];
-          }
-        }
-        if (i == 0) {//face 4
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[1][k][j];
-            r_dqdy[k] += s_dqdy[1][k][j];
-            r_dqdz[k] += s_dqdz[1][k][j];
-            r_Aq  [k] += s_Aq  [1][k][j];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    //layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; ++k) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + emap * p_Np * p_Nvgeo;
-
-          const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-          const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-          const dfloat drdz = vgeo[gid + p_RZID * p_Np];
-
-          const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-          const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-          const dfloat dsdz = vgeo[gid + p_SZID * p_Np];
-
-          const dfloat dtdx = vgeo[gid + p_TXID * p_Np];
-          const dfloat dtdy = vgeo[gid + p_TYID * p_Np];
-          const dfloat dtdz = vgeo[gid + p_TZID * p_Np];
-
-          const dfloat dqdx = r_dqdx[k];
-          const dfloat dqdy = r_dqdy[k];
-          const dfloat dqdz = r_dqdz[k];
-
-          s_dqdx[0][j][i] = (drdx * dqdx + drdy * dqdy + drdz * dqdz);
-          s_dqdy[0][j][i] = (dsdx * dqdx + dsdy * dqdy + dsdz * dqdz);
-          r_dqdt = (dtdx * dqdx + dtdy * dqdy + dtdz * dqdz);
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          dfloat dr = 0, ds = 0;
-
-#pragma unroll p_Nq
-          for(int n = 0; n < p_Nq; ++n) {
-            dr += s_D[n][i] * s_dqdx[0][j][n];
-            r_Aq[n] += s_D[k][n] * r_dqdt; // DT(m,k)*ut(i,j,k,e)
-            ds += s_D[n][j] * s_dqdy[0][n][i];
-          }
-
-          r_Aq[k] += dr + ds;
-        }
-      }
-
-      @barrier("local");
-    }
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          const dlong id = emap * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          Aq[id] = r_Aq[k];
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl
deleted file mode 100644
index ec1605745..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl
+++ /dev/null
@@ -1,417 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// increments gradients of pressure with pseudo-gradients at faces and
-// increments rhs with pseudo-gradient/penalty terms at faces (need to double check scaling with tau)
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-
-// nx,ny,nz,sJ,invJ - need WsJ
-
-void surfaceTerms(const int element,
-                  const int sk,
-                  const int face,
-                  const int i,
-                  const int j,
-                  const dfloat tau,
-                  const dfloat* sgeo,
-                  const int* vmapM,
-                  const int* vmapP,
-                  const int* EToB,
-                  const dfloat4* gradq,
-                  dfloat s_dqdx[p_Nq][p_Nq],
-                  dfloat s_dqdy[p_Nq][p_Nq],
-                  dfloat s_rhsq[p_Nq][p_Nq])
-{
-  const dlong idM = vmapM[sk];
-  const dlong idP = vmapP[sk];
-
-  const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-  const dfloat hinv = sgeo[sk * p_Nsgeo + p_IHID];
-
-  const dfloat4 gradqM = gradq[idM];
-  dfloat4 gradqP = gradq[idP];
-
-  int bc = EToB[face + p_Nfaces * element];
-  if(bc > 0) {
-    ellipticHomogeneousBC2D(bc, gradqM.w, gradqM.x, gradqM.y,gradqP.w, gradqP.x, gradqP.y);
-    gradqP.x = 2.f * gradqP.x - gradqM.x;
-    gradqP.y = 2.f * gradqP.y - gradqM.y;
-    gradqP.w = 2.f * gradqP.w - gradqM.w;
-  }
-
-  dfloat dq = gradqP.w - gradqM.w;
-
-  s_dqdx[j][i] += 0.5f * WsJ * nx * dq;
-  s_dqdy[j][i] += 0.5f * WsJ * ny * dq;
-
-  s_rhsq[j][i] -= 0.5f * WsJ * (nx * (gradqP.x + gradqM.x) +
-                                ny * (gradqP.y + gradqM.y) +
-                                tau * dq * hinv);
-}
-
-@kernel void ellipticAxIpdgQuad2D(const dlong Nelements,
-                                  @restrict const dlong*  vmapM,
-                                  @restrict const dlong*  vmapP,
-                                  const dfloat lambda,
-                                  const dfloat tau,
-                                  @restrict const dfloat*  vgeo,
-                                  @restrict const dfloat*  sgeo,
-                                  @restrict const int*  EToB,
-                                  @restrict const dfloat*  D,
-                                  @restrict const dfloat*  LIFTT,
-                                  @restrict const dfloat*  MM,
-                                  @restrict const dfloat4*  gradq,
-                                  @restrict dfloat*  Aq)
-{
-#if 0
-  // assume the following are precomputed:
-  // p, px, py at SEM nodes
-  // +/- traces of p, px, py at SEM surface nodes
-
-  0 <= i,j,k,m <= N AND 0 <= e < Nelements
-
-    (phix, px) _e
-  + (phiy, py)_e
-  + (phix, nx* (p + -p -) / 2)_de
-  + (phiy, ny* (p + -p -) / 2)_de
-  - (phi -, nx* (px + +px -) / 2)_de
-  - (phi -, ny* (py + +py -) / 2)_de
-  - (phi -, tau* (p + -p -) / 2) _de
-
-  // here w is one component of the product TP quadrature weights
-    (phir, rx* (px + Fsc* nx* dp) + ry* (py + Fsc* ny* dp) + rz* (pz + Fsc* nz* dp)) )_e
-  +   (phir, sx* (px + Fsc * nx * dp) + sy * (py + Fsc * ny * dp) + sz * (pz + Fsc * nz * dp)) )_e
-  +   (phir, tx* (px + Fsc* nx* dp) + ty* (py + Fsc* ny* dp) + tz* (pz + Fsc* nz* dp)) ) _e
-    (phi -,
-    Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) / 2) _e
-
-  px = > px + Fsc * nx * dp (i.e.add pseudo - gradient at end points
-                             py = > py + Fsc * ny * dp
-                                  pz = > pz + Fsc * nz * dp
-                                       Fsc = delta * (Js / J) * (1 / w)
-                                             dp = (p + -p -) / 2;
-
-                             // simplify
-                             (phir, rx * px + ry * py + rz * pz) ) _e
-       +   (phir, sx* px + sy * py + sz * pz) )_e
-       +   (phir, tx* px + ty * py + tz * pz) )_e
-       +   (phi -,
-            Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) /
-            2)_e
-
-#endif
-
-  for(dlong e = 0; e < Nelements; ++e; @outer(0))
-  {
-    @shared dfloat s_dqdx[p_Nq][p_Nq];
-    @shared dfloat s_dqdy[p_Nq][p_Nq];
-    @shared dfloat s_rhsq[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dlong element;
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        element = e;
-
-        // assume that this stores (p, px, py, pz) as dfloat4
-        const dfloat4 gradqn = gradq[e * p_Np + j * p_Nq + i];
-
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-        dfloat JW = vgeo[gid + p_JWID * p_Np];
-
-        s_dqdx[j][i] = JW * gradqn.x;
-        s_dqdy[j][i] = JW * gradqn.y;
-        s_rhsq[j][i] = JW * lambda * gradqn.w;
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-
-#if 1
-    @barrier("local");
-
-    // loop over faces to add pseudo-gradient
-
-    // face 0 & 2
-    for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-      dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i;
-      dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + i;
-
-      //      surfaceTerms(sk0,0,i,0);
-      surfaceTerms(e, sk0, 0, i, 0, tau, sgeo, vmapM, vmapP, EToB, gradq, s_dqdx, s_dqdy, s_rhsq);
-
-      //      surfaceTerms(sk2,2,i,p_Nq-1);
-      surfaceTerms(e,
-                   sk2,
-                   2,
-                   i,
-                   p_Nq - 1,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   EToB,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_rhsq);
-    }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-      dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + j;
-      dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + j;
-
-      //      surfaceTerms(sk1,1,p_Nq-1,j);
-      surfaceTerms(e,
-                   sk1,
-                   1,
-                   p_Nq - 1,
-                   j,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   EToB,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_rhsq);
-
-      //surfaceTerms(sk3,3,0,j);
-      surfaceTerms(e, sk3, 3, 0, j, tau, sgeo, vmapM, vmapP, EToB, gradq, s_dqdx, s_dqdy, s_rhsq);
-    }
-#endif
-    @barrier("local");
-
-    // prescale by geofacs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // does need the nasty geofacs
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-
-        // chain rule (need to scale by wei
-        const dfloat dqdx = s_dqdx[j][i];
-        const dfloat dqdy = s_dqdy[j][i];
-
-        s_dqdx[j][i] = (drdx * dqdx + drdy * dqdy);
-        s_dqdy[j][i] = (dsdx * dqdx + dsdy * dqdy);
-      }
-
-    @barrier("local");
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat dr = 0, ds = 0;
-
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          dr += s_D[n][i] * s_dqdx[j][n];
-          ds += s_D[n][j] * s_dqdy[n][i];
-        }
-
-        const dlong id = e * p_Np + j * p_Nq + i;
-        Aq[id] = s_rhsq[j][i]  + dr + ds;
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialAxIpdgQuad2D(const dlong Nelements,
-                                         @restrict const dlong*  elementList,
-                                         @restrict const dlong*  vmapM,
-                                         @restrict const dlong*  vmapP,
-                                         const dfloat lambda,
-                                         const dfloat tau,
-                                         @restrict const dfloat*  vgeo,
-                                         @restrict const dfloat*  sgeo,
-                                         @restrict const int*  EToB,
-                                         @restrict const dfloat*  D,
-                                         @restrict const dfloat*  LIFTT,
-                                         @restrict const dfloat*  MM,
-                                         @restrict const dfloat4*  gradq,
-                                         @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Nq][p_Nq];
-    @shared dfloat s_dqdy[p_Nq][p_Nq];
-    @shared dfloat s_rhsq[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dlong element;
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        element = elementList[e];
-
-        // assume that this stores (p, px, py, pz) as dfloat4
-        const dfloat4 gradqn = gradq[element * p_Np + j * p_Nq + i];
-
-        const dlong gid = i + j * p_Nq + element * p_Np * p_Nvgeo;
-        dfloat JW = vgeo[gid + p_JWID * p_Np];
-
-        s_dqdx[j][i] = JW * gradqn.x;
-        s_dqdy[j][i] = JW * gradqn.y;
-        s_rhsq[j][i] = JW * lambda * gradqn.w;
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-
-    @barrier("local");
-
-    // loop over faces to add pseudo-gradient
-
-    // face 0 & 2
-    for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-      dlong sk0 = element * p_Nfp * p_Nfaces + 0 * p_Nfp + i;
-      dlong sk2 = element * p_Nfp * p_Nfaces + 2 * p_Nfp + i;
-
-      //      surfaceTerms(sk0,0,i,0);
-      surfaceTerms(element,
-                   sk0,
-                   0,
-                   i,
-                   0,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   EToB,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_rhsq);
-
-      //      surfaceTerms(sk2,2,i,p_Nq-1);
-      surfaceTerms(element,
-                   sk2,
-                   2,
-                   i,
-                   p_Nq - 1,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   EToB,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_rhsq);
-    }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-      dlong sk1 = element * p_Nfp * p_Nfaces + 1 * p_Nfp + j;
-      dlong sk3 = element * p_Nfp * p_Nfaces + 3 * p_Nfp + j;
-
-      //      surfaceTerms(sk1,1,p_Nq-1,j);
-      surfaceTerms(element,
-                   sk1,
-                   1,
-                   p_Nq - 1,
-                   j,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   EToB,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_rhsq);
-
-      //      surfaceTerms(sk3,3,0,j);
-      surfaceTerms(element,
-                   sk3,
-                   3,
-                   0,
-                   j,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   EToB,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_rhsq);
-    }
-
-    @barrier("local");
-
-    // prescale by geofacs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // does need the nasty geofacs
-        const dlong gid = i + j * p_Nq + element * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-
-        // chain rule (need to scale by wei
-        const dfloat dqdx = s_dqdx[j][i];
-        const dfloat dqdy = s_dqdy[j][i];
-
-        s_dqdx[j][i] = (drdx * dqdx + drdy * dqdy);
-        s_dqdy[j][i] = (dsdx * dqdx + dsdy * dqdy);
-      }
-
-    @barrier("local");
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat dr = 0, ds = 0;
-
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          dr += s_D[n][i] * s_dqdx[j][n];
-          ds += s_D[n][j] * s_dqdy[n][i];
-        }
-
-        const dlong id = element * p_Np + j * p_Nq + i;
-        Aq[id] = s_rhsq[j][i]  + dr + ds;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl b/src/libP/solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl
deleted file mode 100644
index 4653d2f49..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// increments gradients of pressure with pseudo-gradients at faces and
-// increments rhs with pseudo-gradient/penalty terms at faces (need to double check scaling with tau)
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-
-// nx,ny,nz,sJ,invJ - need WsJ
-
-// !!!!!!!!!!!!!!!!!!!!!!!!!!!!This Kenrel is not completed !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-void surfaceTerms(const int element,
-                  const int sk,
-                  const int face,
-                  const int i,
-                  const int j,
-                  const dfloat tau,
-                  @global const dfloat* sgeo,
-                  @global const int* vmapM,
-                  @global const int* vmapP,
-                  @global const dfloat4* gradq,
-                  dfloat s_dqdx[p_Nq][p_Nq],
-                  dfloat s_dqdy[p_Nq][p_Nq],
-                  dfloat s_dqdz[p_Nq][p_Nq],
-                  dfloat s_rhsq[p_Nq][p_Nq])
-{
-  const dlong idM = vmapM[sk];
-  const dlong idP = vmapP[sk];
-
-  const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat nz = sgeo[sk * p_Nsgeo + p_NZID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-  const dfloat hinv = sgeo[sk * p_Nsgeo + p_IHID];
-
-  const dfloat4 gradqM = gradq[idM];
-  dfloat4 gradqP = gradq[idP];
-
-  dfloat dq = gradqP.w - gradqM.w;
-
-  s_dqdx[j][i] += 0.5f * WsJ * nx * dq;
-  s_dqdy[j][i] += 0.5f * WsJ * ny * dq;
-  s_dqdz[j][i] += 0.5f * WsJ * nz * dq;
-
-  s_rhsq[j][i] -= 0.5f * WsJ * (nx * (gradqP.x + gradqM.x) +
-                                ny * (gradqP.y + gradqM.y) +
-                                nz * (gradqP.z + gradqM.z) +
-                                tau * dq * hinv);
-}
-
-@kernel void ellipticAxIpdgQuad3D(const dlong Nelements,
-                                  @restrict const dlong*  vmapM,
-                                  @restrict const dlong*  vmapP,
-                                  const dfloat lambda,
-                                  const dfloat tau,
-                                  @restrict const dfloat*  vgeo,
-                                  @restrict const dfloat*  sgeo,
-                                  @restrict const int*  EToB,
-                                  @restrict const dfloat*  D,
-                                  @restrict const dfloat*  LIFTT,
-                                  @restrict const dfloat*  MM,
-                                  @restrict const dfloat4*  gradq,
-                                  @restrict dfloat*  Aq)
-{
-#if 0
-  // assume the following are precomputed:
-  // p, px, py at SEM nodes
-  // +/- traces of p, px, py at SEM surface nodes
-
-  0 <= i,j,k,m <= N AND 0 <= e < Nelements
-
-    (phix, px) _e
-  + (phiy, py)_e
-  + (phix, nx* (p + -p -) / 2)_de
-  + (phiy, ny* (p + -p -) / 2)_de
-  - (phi -, nx* (px + +px -) / 2)_de
-  - (phi -, ny* (py + +py -) / 2)_de
-  - (phi -, tau* (p + -p -) / 2) _de
-
-  // here w is one component of the product TP quadrature weights
-    (phir, rx* (px + Fsc * nx * dp) + ry* (py + Fsc * ny * dp) + rz* (pz + Fsc * nz * dp)) )_e
-  +   (phir, sx* (px + Fsc * nx * dp) + sy * (py + Fsc * ny * dp) + sz * (pz + Fsc * nz * dp)) )_e
-  +   (phir, tx* (px + Fsc* nx* dp) + ty* (py + Fsc* ny* dp) + tz* (pz + Fsc* nz* dp)) ) _e
-    (phi -,
-    Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) / 2) _e
-
-  px = > px + Fsc * nx * dp (i.e.add pseudo - gradient at end points
-                             py = > py + Fsc * ny * dp
-                                  pz = > pz + Fsc * nz * dp
-                                       Fsc = delta * (Js / J) * (1 / w)
-                                             dp = (p + -p -) / 2;
-
-                             // simplify
-                             (phir, rx * px + ry * py + rz * pz) ) _e
-       +   (phir, sx* px + sy * py + sz * pz) )_e
-       +   (phir, tx* px + ty * py + tz * pz) )_e
-       +   (phi -,
-            Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) /
-            2)_e
-
-#endif
-
-  for(dlong e = 0; e < Nelements; ++e; @outer(0))
-  {
-    @shared dfloat s_dqdx[p_Nq][p_Nq];
-    @shared dfloat s_dqdy[p_Nq][p_Nq];
-    @shared dfloat s_dqdz[p_Nq][p_Nq];
-    @shared dfloat s_rhsq[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dlong element;
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        element = e;
-
-        // assume that this stores (p, px, py, pz) as dfloat4
-        const dfloat4 gradqn = gradq[e * p_Np + j * p_Nq + i];
-
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-        dfloat JW = vgeo[gid + p_JWID * p_Np];
-
-        s_dqdx[j][i] = JW * gradqn.x;
-        s_dqdy[j][i] = JW * gradqn.y;
-        s_dqdz[j][i] = JW * gradqn.z;
-        s_rhsq[j][i] = JW * lambda * gradqn.w;
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-
-    @barrier("local");
-
-    // loop over faces to add pseudo-gradient
-
-    // face 0 & 2
-    for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-      dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i;
-      dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + i;
-
-      //      surfaceTerms(sk0,0,i,0);
-      surfaceTerms(e, sk0, 0, i, 0, tau, sgeo, vmapM, vmapP, gradq, s_dqdx, s_dqdy, s_dqdz, s_rhsq);
-
-      //      surfaceTerms(sk2,2,i,p_Nq-1);
-      surfaceTerms(e,
-                   sk2,
-                   2,
-                   i,
-                   p_Nq - 1,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_dqdz,
-                   s_rhsq);
-    }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-      dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + j;
-      dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + j;
-
-      //      surfaceTerms(sk1,1,p_Nq-1,j);
-      surfaceTerms(e,
-                   sk1,
-                   1,
-                   p_Nq - 1,
-                   j,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_dqdz,
-                   s_rhsq);
-
-      //surfaceTerms(sk3,3,0,j);
-      surfaceTerms(e, sk3, 3, 0, j, tau, sgeo, vmapM, vmapP, gradq, s_dqdx, s_dqdy, s_dqdz, s_rhsq);
-    }
-
-    @barrier("local");
-
-    // prescale by geofacs
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // does need the nasty geofacs
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-        const dfloat drdz = vgeo[gid + p_RZID * p_Np];
-
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-        const dfloat dsdz = vgeo[gid + p_SZID * p_Np];
-
-        const dfloat dtdx = vgeo[gid + p_TXID * p_Np];
-        const dfloat dtdy = vgeo[gid + p_TYID * p_Np];
-        const dfloat dtdz = vgeo[gid + p_TZID * p_Np];
-
-        // chain rule (need to scale by wei
-        const dfloat dqdx = s_dqdx[j][i];
-        const dfloat dqdy = s_dqdy[j][i];
-        const dfloat dqdz = s_dqdz[j][i];
-
-        s_dqdx[j][i] = (drdx * dqdx + drdy * dqdy + drdz * dqdz);
-        s_dqdy[j][i] = (dsdx * dqdx + dsdy * dqdy + dsdz * dqdz);
-#if 0
-        s_dqdz[j][i] = (dtdx * dqdx + dtdy * dqdy + dtdz * dqdz);
-#else
-        s_dqdz[j][i] = 0;
-#endif
-      }
-    }
-
-    @barrier("local");
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat dr = 0, ds = 0, dt = s_dqdz[j][i];
-
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          dr += s_D[n][i] * s_dqdx[j][n];
-          ds += s_D[n][j] * s_dqdy[n][i];
-        }
-
-        const dlong id = e * p_Np + j * p_Nq + i;
-        Aq[id] = s_rhsq[j][i]  + dr + ds + dt;
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialAxIpdgQuad3D(const dlong Nelements,
-                                         @restrict const dlong*  elementList,
-                                         @restrict const dlong*  vmapM,
-                                         @restrict const dlong*  vmapP,
-                                         const dfloat lambda,
-                                         const dfloat tau,
-                                         @restrict const dfloat*  vgeo,
-                                         @restrict const dfloat*  sgeo,
-                                         @restrict const int*  EToB,
-                                         @restrict const dfloat*  D,
-                                         @restrict const dfloat*  LIFTT,
-                                         @restrict const dfloat*  MM,
-                                         @restrict const dfloat4*  gradq,
-                                         @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Nq][p_Nq];
-    @shared dfloat s_dqdy[p_Nq][p_Nq];
-    @shared dfloat s_dqdz[p_Nq][p_Nq];
-    @shared dfloat s_rhsq[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dlong element;
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        element = elementList[e];
-
-        // assume that this stores (p, px, py, pz) as dfloat4
-        const dfloat4 gradqn = gradq[element * p_Np + j * p_Nq + i];
-
-        const dlong gid = i + j * p_Nq + element * p_Np * p_Nvgeo;
-        dfloat JW = vgeo[gid + p_JWID * p_Np];
-
-        s_dqdx[j][i] = JW * gradqn.x;
-        s_dqdy[j][i] = JW * gradqn.y;
-        s_dqdz[j][i] = JW * gradqn.z;
-        s_rhsq[j][i] = JW * lambda * gradqn.w;
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-
-    @barrier("local");
-
-    // loop over faces to add pseudo-gradient
-
-    // face 0 & 2
-    for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-      dlong sk0 = element * p_Nfp * p_Nfaces + 0 * p_Nfp + i;
-      dlong sk2 = element * p_Nfp * p_Nfaces + 2 * p_Nfp + i;
-
-      //      surfaceTerms(sk0,0,i,0);
-      surfaceTerms(element,
-                   sk0,
-                   0,
-                   i,
-                   0,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_dqdz,
-                   s_rhsq);
-
-      //      surfaceTerms(sk2,2,i,p_Nq-1);
-      surfaceTerms(element,
-                   sk2,
-                   2,
-                   i,
-                   p_Nq - 1,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_dqdz,
-                   s_rhsq);
-    }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-      dlong sk1 = element * p_Nfp * p_Nfaces + 1 * p_Nfp + j;
-      dlong sk3 = element * p_Nfp * p_Nfaces + 3 * p_Nfp + j;
-
-      //      surfaceTerms(sk1,1,p_Nq-1,j);
-      surfaceTerms(element,
-                   sk1,
-                   1,
-                   p_Nq - 1,
-                   j,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_dqdz,
-                   s_rhsq);
-
-      //      surfaceTerms(sk3,3,0,j);
-      surfaceTerms(element,
-                   sk3,
-                   3,
-                   0,
-                   j,
-                   tau,
-                   sgeo,
-                   vmapM,
-                   vmapP,
-                   gradq,
-                   s_dqdx,
-                   s_dqdy,
-                   s_dqdz,
-                   s_rhsq);
-    }
-
-    @barrier("local");
-
-    // prescale by geofacs
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // does need the nasty geofacs
-        const dlong gid = i + j * p_Nq + element * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-        const dfloat drdz = vgeo[gid + p_RZID * p_Np];
-
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-        const dfloat dsdz = vgeo[gid + p_SZID * p_Np];
-
-        const dfloat dtdx = vgeo[gid + p_TXID * p_Np];
-        const dfloat dtdy = vgeo[gid + p_TYID * p_Np];
-        const dfloat dtdz = vgeo[gid + p_TZID * p_Np];
-
-        // chain rule (need to scale by wei
-        const dfloat dqdx = s_dqdx[j][i];
-        const dfloat dqdy = s_dqdy[j][i];
-        const dfloat dqdz = s_dqdz[j][i];
-
-        s_dqdx[j][i] = (drdx * dqdx + drdy * dqdy + drdz * dqdz);
-        s_dqdy[j][i] = (dsdx * dqdx + dsdy * dqdy + dsdz * dqdz);
-#if 0
-        s_dqdz[j][i] = (dtdx * dqdx + dtdy * dqdy + dtdz * dqdz);
-#else
-        s_dqdz[j][i] = 0;
-#endif
-      }
-    }
-
-    @barrier("local");
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat dr = 0, ds = 0, dt = s_dqdz[j][i];
-
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          dr += s_D[n][i] * s_dqdx[j][n];
-          ds += s_D[n][j] * s_dqdy[n][i];
-        }
-
-        const dlong id = element * p_Np + j * p_Nq + i;
-        Aq[id] = s_rhsq[j][i]  + dr + ds + dt;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxIpdgTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticAxIpdgTet3D.okl
deleted file mode 100644
index ce8ad3967..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxIpdgTet3D.okl
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// increments gradients of pressure with pseudo-gradients at faces and
-// increments rhs with pseudo-gradient/penalty terms at faces (need to double check scaling with tau)
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-
-// nx,ny,nz,sJ,invJ - need WsJ
-
-@kernel void ellipticAxIpdgTet3D(const dlong Nelements,
-                                 @restrict const dlong*  vmapM,
-                                 @restrict const dlong*  vmapP,
-                                 const dfloat lambda,
-                                 const dfloat tau,
-                                 @restrict const dfloat*  vgeo,
-                                 @restrict const dfloat*  sgeo,
-                                 @restrict const int*  EToB,
-                                 @restrict const dfloat*  Dmatrices,
-                                 @restrict const dfloat*  LIFTT,
-                                 @restrict const dfloat*  MM,
-                                 @restrict const dfloat4*  gradq,
-                                 @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Np];
-    @shared dfloat s_dqdy[p_Np];
-    @shared dfloat s_dqdz[p_Np];
-    @shared dfloat s_lapq[p_Np];
-    @shared dfloat s_nxdq[p_NfacesNfp];
-    @shared dfloat s_nydq[p_NfacesNfp];
-    @shared dfloat s_nzdq[p_NfacesNfp];
-    @shared dfloat s_lapflux[p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Np];
-    @shared dfloat s_Lnydq[p_Np];
-    @shared dfloat s_Lnzdq[p_Np];
-    @exclusive dlong idM;
-    @exclusive dfloat nx, ny, nz, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        // assume that this stores (qx, qy, qz, q) as dfloat4
-        const dfloat4 gradqn = gradq[e * p_Np + n];
-
-        s_dqdx[n] = gradqn.x;
-        s_dqdy[n] = gradqn.y;
-        s_dqdz[n] = gradqn.z;
-        s_lapq[n] = lambda * gradqn.w;
-      }
-
-      if(n < p_NfacesNfp) {
-        const dlong id  = n + e * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-        const dlong idP = vmapP[id];
-        // find face that owns this node
-        const int face = n / p_Nfp;
-
-        dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
-        dfloat4 gradqP = gradq[idP];
-
-        // load surface geofactors for this face
-        dlong sid = p_Nsgeo * (e * p_Nfaces + face);
-        nx = sgeo[sid + p_NXID];
-        ny = sgeo[sid + p_NYID];
-        nz = sgeo[sid + p_NZID];
-        sJ = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        int bc = EToB[face + p_Nfaces * e];
-        if(bc > 0) {
-          ellipticHomogeneousBC3D(bc,
-                                  gradqM.w,
-                                  gradqM.x,
-                                  gradqM.y,
-                                  gradqM.z,
-                                  gradqP.w,
-                                  gradqP.x,
-                                  gradqP.y,
-                                  gradqP.z)
-          gradqP.x = 2.f * gradqP.x - gradqM.x;
-          gradqP.y = 2.f * gradqP.y - gradqM.y;
-          gradqP.z = 2.f * gradqP.z - gradqM.z;
-          gradqP.w = 2.f * gradqP.w - gradqM.w;
-        }
-
-        const dfloat dq = gradqP.w - gradqM.w;
-        const dfloat half = 0.5f;
-
-        s_nxdq[n] = half * sJ * invJ * nx * dq;
-        s_nydq[n] = half * sJ * invJ * ny * dq;
-        s_nzdq[n] = half * sJ * invJ * nz * dq;
-
-        s_lapflux[n] = half * sJ * invJ * (-nx * (gradqP.x - gradqM.x)
-                                           - ny * (gradqP.y - gradqM.y)
-                                           - nz * (gradqP.z - gradqM.z)
-                                           - tau * hinv * dq);
-      }
-    }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dlong gid = e * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat drdz = vgeo[gid + p_RZID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-        const dfloat dsdz = vgeo[gid + p_SZID];
-        const dfloat dtdx = vgeo[gid + p_TXID];
-        const dfloat dtdy = vgeo[gid + p_TYID];
-        const dfloat dtdz = vgeo[gid + p_TZID];
-
-        dfloat Lnxdq = 0;
-        dfloat Lnydq = 0;
-        dfloat Lnzdq = 0;
-
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i) {
-          Lnxdq += LIFTT[n + i * p_Np] * s_nxdq[i];
-          Lnydq += LIFTT[n + i * p_Np] * s_nydq[i];
-          Lnzdq += LIFTT[n + i * p_Np] * s_nzdq[i];
-        }
-
-        dfloat dqdx = s_dqdx[n] + Lnxdq;
-        dfloat dqdy = s_dqdy[n] + Lnydq;
-        dfloat dqdz = s_dqdz[n] + Lnzdq;
-
-        s_dqdx[n] = drdx * dqdx + drdy * dqdy + drdz * dqdz; // abuse of notation
-        s_dqdy[n] = dsdx * dqdx + dsdy * dqdy + dsdz * dqdz;
-        s_dqdz[n] = dtdx * dqdx + dtdy * dqdy + dtdz * dqdz;
-
-        s_Lnxdq[n] = Lnxdq;
-        s_Lnydq[n] = Lnydq;
-        s_Lnzdq[n] = Lnzdq;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        int id = (int) (idM % p_Np);
-        s_lapflux[n] += sJ * invJ * (nx * s_Lnxdq[id] + ny * s_Lnydq[id] + nz * s_Lnzdq[id]);
-      }
-
-      if(n < p_Np) {
-        dfloat lapr = 0, laps = 0, lapt = 0;
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          lapr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_dqdx[i];
-          laps += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_dqdy[i];
-          lapt += Dmatrices[n + i * p_Np + 2 * p_Np * p_Np] * s_dqdz[i];
-        }
-
-        s_lapq[n] -= (lapr + laps + lapt);
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lap = 0;
-
-        // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          lap += LIFTT[n + i * p_Np] * s_lapflux[i];
-
-        s_lapq[n] += lap;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[e * p_Nvgeo + p_JID];
-
-        dfloat Mlapq = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapq += MM[n + i * p_Np] * s_lapq[i];
-
-        Aq[n + e * p_Np] = J * Mlapq;
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialAxIpdgTet3D(const dlong Nelements,
-                                        @restrict const dlong*  elementList,
-                                        @restrict const dlong*  vmapM,
-                                        @restrict const dlong*  vmapP,
-                                        const dfloat lambda,
-                                        const dfloat tau,
-                                        @restrict const dfloat*  vgeo,
-                                        @restrict const dfloat*  sgeo,
-                                        @restrict const int*  EToB,
-                                        @restrict const dfloat*  Dmatrices,
-                                        @restrict const dfloat*  LIFTT,
-                                        @restrict const dfloat*  MM,
-                                        @restrict const dfloat4*  gradq,
-                                        @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Np];
-    @shared dfloat s_dqdy[p_Np];
-    @shared dfloat s_dqdz[p_Np];
-    @shared dfloat s_lapq[p_Np];
-    @shared dfloat s_nxdq[p_NfacesNfp];
-    @shared dfloat s_nydq[p_NfacesNfp];
-    @shared dfloat s_nzdq[p_NfacesNfp];
-    @shared dfloat s_lapflux[p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Np];
-    @shared dfloat s_Lnydq[p_Np];
-    @shared dfloat s_Lnzdq[p_Np];
-    @exclusive dlong element;
-    @exclusive dlong idM;
-    @exclusive dfloat nx, ny, nz, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      element = elementList[e];
-      if(n < p_Np) {
-        // assume that this stores (qx, qy, qz, q) as dfloat4
-        const dfloat4 gradqn = gradq[element * p_Np + n];
-
-        s_dqdx[n] = gradqn.x;
-        s_dqdy[n] = gradqn.y;
-        s_dqdz[n] = gradqn.z;
-        s_lapq[n] = lambda * gradqn.w;
-      }
-
-      if(n < p_NfacesNfp) {
-        const dlong id  = n + element * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-        const dlong idP = vmapP[id];
-        // find face that owns this node
-        const int face = n / p_Nfp;
-
-        dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
-        dfloat4 gradqP = gradq[idP];
-
-        // load surface geofactors for this face
-        dlong sid = p_Nsgeo * (element * p_Nfaces + face);
-        nx = sgeo[sid + p_NXID];
-        ny = sgeo[sid + p_NYID];
-        nz = sgeo[sid + p_NZID];
-        sJ = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        int bc = EToB[face + p_Nfaces * element];
-        if(bc > 0) {
-          ellipticHomogeneousBC3D(bc,
-                                  gradqM.w,
-                                  gradqM.x,
-                                  gradqM.y,
-                                  gradqM.z,
-                                  gradqP.w,
-                                  gradqP.x,
-                                  gradqP.y,
-                                  gradqP.z);
-          gradqP.x = 2.f * gradqP.x - gradqM.x;
-          gradqP.y = 2.f * gradqP.y - gradqM.y;
-          gradqP.z = 2.f * gradqP.z - gradqM.z;
-          gradqP.w = 2.f * gradqP.w - gradqM.w;
-        }
-
-        const dfloat dq = gradqP.w - gradqM.w;
-        const dfloat half = 0.5f;
-
-        s_nxdq[n] = half * sJ * invJ * nx * dq;
-        s_nydq[n] = half * sJ * invJ * ny * dq;
-        s_nzdq[n] = half * sJ * invJ * nz * dq;
-
-        s_lapflux[n] = half * sJ * invJ * (-nx * (gradqP.x - gradqM.x)
-                                           - ny * (gradqP.y - gradqM.y)
-                                           - nz * (gradqP.z - gradqM.z)
-                                           - tau * hinv * dq);
-      }
-    }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dlong gid = element * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat drdz = vgeo[gid + p_RZID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-        const dfloat dsdz = vgeo[gid + p_SZID];
-        const dfloat dtdx = vgeo[gid + p_TXID];
-        const dfloat dtdy = vgeo[gid + p_TYID];
-        const dfloat dtdz = vgeo[gid + p_TZID];
-
-        dfloat Lnxdq = 0;
-        dfloat Lnydq = 0;
-        dfloat Lnzdq = 0;
-
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i) {
-          Lnxdq += LIFTT[n + i * p_Np] * s_nxdq[i];
-          Lnydq += LIFTT[n + i * p_Np] * s_nydq[i];
-          Lnzdq += LIFTT[n + i * p_Np] * s_nzdq[i];
-        }
-
-        dfloat dqdx = s_dqdx[n] + Lnxdq;
-        dfloat dqdy = s_dqdy[n] + Lnydq;
-        dfloat dqdz = s_dqdz[n] + Lnzdq;
-
-        s_dqdx[n] = drdx * dqdx + drdy * dqdy + drdz * dqdz; // abuse of notation
-        s_dqdy[n] = dsdx * dqdx + dsdy * dqdy + dsdz * dqdz;
-        s_dqdz[n] = dtdx * dqdx + dtdy * dqdy + dtdz * dqdz;
-
-        s_Lnxdq[n] = Lnxdq;
-        s_Lnydq[n] = Lnydq;
-        s_Lnzdq[n] = Lnzdq;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        int id = (int) (idM % p_Np);
-        s_lapflux[n] += sJ * invJ * (nx * s_Lnxdq[id] + ny * s_Lnydq[id] + nz * s_Lnzdq[id]);
-      }
-
-      if(n < p_Np) {
-        dfloat lapr = 0, laps = 0, lapt = 0;
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          lapr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_dqdx[i];
-          laps += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_dqdy[i];
-          lapt += Dmatrices[n + i * p_Np + 2 * p_Np * p_Np] * s_dqdz[i];
-        }
-
-        s_lapq[n] -= (lapr + laps + lapt);
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lap = 0;
-
-        // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          lap += LIFTT[n + i * p_Np] * s_lapflux[i];
-
-        s_lapq[n] += lap;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[element * p_Nvgeo + p_JID];
-
-        dfloat Mlapq = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapq += MM[n + i * p_Np] * s_lapq[i];
-
-        Aq[n + element * p_Np] = J * Mlapq;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxIpdgTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticAxIpdgTri2D.okl
deleted file mode 100644
index 9f5e689ab..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxIpdgTri2D.okl
+++ /dev/null
@@ -1,507 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-// nx,ny,nz,sJ,invJ - need WsJ
-
-@kernel void ellipticAxIpdgTri2D(const dlong Nelements,
-                                 @restrict const dlong*  vmapM,
-                                 @restrict const dlong*  vmapP,
-                                 const dfloat lambda,
-                                 const dfloat tau,
-                                 @restrict const dfloat*  vgeo,
-                                 @restrict const dfloat*  sgeo,
-                                 @restrict const int*  EToB,
-                                 @restrict const dfloat*  Dmatrices,
-                                 @restrict const dfloat*  LIFTT,
-                                 @restrict const dfloat*  MM,
-                                 @restrict const dfloat4*  gradq,
-                                 @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Np];
-    @shared dfloat s_dqdy[p_Np];
-    @shared dfloat s_lapq[p_Np];
-    @shared dfloat s_nxdq[p_NfacesNfp];
-    @shared dfloat s_nydq[p_NfacesNfp];
-    @shared dfloat s_lapflux[p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Np];
-    @shared dfloat s_Lnydq[p_Np];
-    @exclusive dlong idM;
-    @exclusive dfloat nx, ny, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        // assume that this stores (qx, qy, qz, q) as dfloat4
-        const dfloat4 gradqn = gradq[e * p_Np + n];
-
-        s_dqdx[n] = gradqn.x;
-        s_dqdy[n] = gradqn.y;
-        s_lapq[n] = lambda * gradqn.w;
-      }
-
-      if(n < p_NfacesNfp) {
-        const dlong id  = n + e * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-        const dlong idP = vmapP[id];
-        // find face that owns this node
-        const int face = n / p_Nfp;
-
-        dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
-        dfloat4 gradqP = gradq[idP];
-
-        // load surface geofactors for this face
-        dlong sid = p_Nsgeo * (e * p_Nfaces + face);
-        nx   = sgeo[sid + p_NXID];
-        ny   = sgeo[sid + p_NYID];
-        sJ   = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        int bc = EToB[face + p_Nfaces * e];
-        if(bc > 0) {
-          ellipticHomogeneousBC2D(bc, gradqM.w, gradqM.x, gradqM.y, gradqP.w, gradqP.x, gradqP.y);
-          gradqP.x = 2.f * gradqP.x - gradqM.x;
-          gradqP.y = 2.f * gradqP.y - gradqM.y;
-          gradqP.w = 2.f * gradqP.w - gradqM.w;
-        }
-
-        const dfloat dq = gradqP.w - gradqM.w;
-        const dfloat hlf = 0.5f;
-
-        s_nxdq[n] = hlf * sJ * invJ * nx * dq;
-        s_nydq[n] = hlf * sJ * invJ * ny * dq;
-
-        s_lapflux[n] = hlf * sJ * invJ * (-nx * (gradqP.x - gradqM.x)
-                                          - ny * (gradqP.y - gradqM.y)
-                                          - tau * hinv * dq);
-      }
-    }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dlong gid = e * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-
-        dfloat Lnxdq = 0;
-        dfloat Lnydq = 0;
-
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i) {
-          Lnxdq += LIFTT[n + i * p_Np] * s_nxdq[i];
-          Lnydq += LIFTT[n + i * p_Np] * s_nydq[i];
-        }
-
-        dfloat dqdx = s_dqdx[n] + Lnxdq;
-        dfloat dqdy = s_dqdy[n] + Lnydq;
-        s_dqdx[n] = drdx * dqdx + drdy * dqdy; // abuse of notation
-        s_dqdy[n] = dsdx * dqdx + dsdy * dqdy;
-
-        s_Lnxdq[n] = Lnxdq;
-        s_Lnydq[n] = Lnydq;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        int id = idM % p_Np;
-        s_lapflux[n] += sJ * invJ * (nx * s_Lnxdq[id] + ny * s_Lnydq[id]);
-      }
-
-      if(n < p_Np) {
-        dfloat lapr = 0, laps = 0;
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          lapr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_dqdx[i];
-          laps += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_dqdy[i];
-        }
-
-        s_lapq[n] -= (lapr + laps);
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lap = 0;
-
-        // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          lap += LIFTT[n + i * p_Np] * s_lapflux[i];
-
-        s_lapq[n] += lap;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[e * p_Nvgeo + p_JID];
-
-        dfloat Mlapq = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapq += MM[n + i * p_Np] * s_lapq[i];
-
-        Aq[n + e * p_Np] = J * Mlapq;
-      }
-    }
-  }
-}
-
-// // Optimized sizes for @kernel 4-5
-#if p_N == 1
-#define p_NbV 5
-#define p_Nmt 1
-#endif
-
-#if p_N == 2
-#define p_NbV 7
-#define p_Nmt 2
-#endif
-
-#if p_N == 3
-#define p_NbV 5
-#define p_Nmt 2
-#endif
-
-#if p_N == 4
-#define p_NbV 2
-#define p_Nmt 2
-#endif
-
-#if p_N == 5
-#define p_NbV 3
-#define p_Nmt 2
-#endif
-
-#if p_N == 6
-#define p_NbV 4
-#define p_Nmt 2
-#endif
-
-#if p_N == 7
-#define p_NbV 4
-#define p_Nmt 2
-#endif
-
-#if p_N == 8
-#define p_NbV 2
-#define p_Nmt 3
-#endif
-
-#if p_N == 9
-#define p_NbV 2
-#define p_Nmt 3
-#endif
-
-#if p_N == 10
-#define p_NbV 3
-#define p_Nmt 3
-#endif
-
-// Added multiple element per threadblock
-@kernel void ellipticPartialAxIpdgTri2D(const dlong Nelements,
-                                        @restrict const dlong*  elementList,
-                                        @restrict const dlong*  vmapM,
-                                        @restrict const dlong*  vmapP,
-                                        const dfloat lambda,
-                                        const dfloat tau,
-                                        @restrict const dfloat*  vgeo,
-                                        @restrict const dfloat*  sgeo,
-                                        @restrict const int*  EToB,
-                                        @restrict const dfloat*  Dmatrices,
-                                        @restrict const dfloat*  LIFTT,
-                                        @restrict const dfloat*  MM,
-                                        @restrict const dfloat4*  gradq,
-                                        @restrict dfloat*  Aq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += (p_NbV * p_Nmt); @outer(0)) {
-    @shared dfloat s_dqdx[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_dqdy[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_lapq[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_nxdq[p_Nmt][p_NbV][p_NfacesNfp];
-    @shared dfloat s_nydq[p_Nmt][p_NbV][p_NfacesNfp];
-    @shared dfloat s_lapflux[p_Nmt][p_NbV][p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_Lnydq[p_Nmt][p_NbV][p_Np];
-
-    // @shared dlong idM[p_Nmt];
-    // @shared dfloat nx[p_Nmt], ny[p_Nmt], sJ[p_Nmt];
-    // @shared dfloat invJ[p_Nmt], hinv[p_Nmt];
-
-    @exclusive dlong idM[p_Nmt];
-    @exclusive dfloat nx[p_Nmt], ny[p_Nmt], sJ[p_Nmt];
-    @exclusive dfloat invJ[p_Nmt], hinv[p_Nmt];
-
-    @exclusive dlong element[p_Nmt];
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em;
-          if(e < Nelements)
-            element[em] = elementList[e];
-        }
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em;
-          //const int element = elementList[e];
-
-          if(e < Nelements) {
-            if(n < p_Np) {
-              // assume that this stores (qx, qy, qz, q) as dfloat4
-
-              const dfloat4 gradqn = gradq[element[em] * p_Np + n];
-
-              s_dqdx[em][es][n] = gradqn.x;
-              s_dqdy[em][es][n] = gradqn.y;
-              s_lapq[em][es][n] = lambda * gradqn.w;
-            }
-
-            if(n < p_NfacesNfp) {
-              const dlong id  = n + element[em] * p_Nfaces * p_Nfp;
-              idM[em] = vmapM[id];
-              const dlong idP = vmapP[id];
-              // find face that owns this node
-              const int face = n / p_Nfp;
-
-              dfloat4 gradqM = gradq[idM[em]];// could fetch from @shared after barrier
-              dfloat4 gradqP = gradq[idP];
-
-              // load surface geofactors for this face
-              dlong sid = p_Nsgeo * (element[em] * p_Nfaces + face);
-              nx[em]   = sgeo[sid + p_NXID];
-              ny[em]   = sgeo[sid + p_NYID];
-              sJ[em]   = sgeo[sid + p_SJID];
-              invJ[em] = sgeo[sid + p_IJID];
-              hinv[em] = sgeo[sid + p_IHID];
-
-              int bc = EToB[face + p_Nfaces * element[em]];
-              if(bc > 0) {
-                ellipticHomogeneousBC2D(bc,
-                                        gradqM.w,
-                                        gradqM.x,
-                                        gradqM.y,
-                                        gradqP.w,
-                                        gradqP.x,
-                                        gradqP.y);
-                gradqP.x = 2.f * gradqP.x - gradqM.x;
-                gradqP.y = 2.f * gradqP.y - gradqM.y;
-                gradqP.w = 2.f * gradqP.w - gradqM.w;
-              }
-
-              const dfloat dq = gradqP.w - gradqM.w;
-              const dfloat hlf = 0.5f;
-
-              s_nxdq[em][es][n] = hlf * sJ[em] * invJ[em] * nx[em] * dq;
-              s_nydq[em][es][n] = hlf * sJ[em] * invJ[em] * ny[em] * dq;
-
-              s_lapflux[em][es][n] = hlf * sJ[em] * invJ[em] * (-nx[em] * (gradqP.x - gradqM.x)
-                                                                - ny[em] * (gradqP.y - gradqM.y)
-                                                                - tau * hinv[em] * dq);
-            }
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat Lnxdq[p_Nmt], Lnydq[p_Nmt];
-
-        // // Try holding drdx in register array
-        // const dfloat drdx[p_Nmt], drdy[p_Nmt];
-        // const dfloat dsdx[p_Nmt], dsdy[p_Nmt];
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          Lnxdq[em] = 0.f;
-          Lnydq[em] = 0.f;
-          //
-          //  const int e = eo+es*p_Nmt + em;
-          //  const int gid = element*p_Nvgeo;
-          // drdx[em] = vgeo[gid + p_RXID];
-          // drdy[em] = vgeo[gid + p_RYID];
-          // dsdx[em] = vgeo[gid + p_SXID];
-          // dsdy[em] = vgeo[gid + p_SYID];
-        }
-
-        if(n < p_Np) {
-#pragma unroll p_NfacesNfp
-          for(int i = 0; i < p_NfacesNfp; ++i) {
-            const dfloat L = LIFTT[n + i * p_Np];
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em) {
-              Lnxdq[em] += L * s_nxdq[em][es][i];
-              Lnydq[em] += L * s_nydq[em][es][i];
-            }
-          }
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em) {
-            const dlong e = eo + es * p_Nmt + em;
-            if(e < Nelements) {
-              // const int element = elementList[e];
-              const dlong gid = element[em] * p_Nvgeo;
-
-              // These data can be stored on @shared
-              const dfloat drdx = vgeo[gid + p_RXID];
-              const dfloat drdy = vgeo[gid + p_RYID];
-              const dfloat dsdx = vgeo[gid + p_SXID];
-              const dfloat dsdy = vgeo[gid + p_SYID];
-
-              dfloat dqdx = s_dqdx[em][es][n] + Lnxdq[em];
-              dfloat dqdy = s_dqdy[em][es][n] + Lnydq[em];
-              s_dqdx[em][es][n] = drdx * dqdx + drdy * dqdy; // abuse of notation
-              s_dqdy[em][es][n] = dsdx * dqdx + dsdy * dqdy;
-
-              s_Lnxdq[em][es][n] = Lnxdq[em];
-              s_Lnydq[em][es][n] = Lnydq[em];
-            }
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat lapr[p_Nmt], laps[p_Nmt];
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          if(n < p_NfacesNfp) {
-            int id = idM[em] % p_Np;
-            s_lapflux[em][es][n] += sJ[em] * invJ[em] * (nx[em] * s_Lnxdq[em][es][id]
-                                                         + ny[em] * s_Lnydq[em][es][id]);
-          }
-
-          lapr[em] = 0.f;
-          laps[em] = 0.f;
-        }
-
-        if(n < p_Np) {
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            const dfloat drT = Dmatrices[n + i * p_Np + 0 * p_Np * p_Np];
-            const dfloat dsT = Dmatrices[n + i * p_Np + 1 * p_Np * p_Np];
-
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em) {
-              lapr[em] += drT * s_dqdx[em][es][i];
-              laps[em] += dsT * s_dqdy[em][es][i];
-            }
-          }
-
-          if(n < p_Np) {
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em)
-              s_lapq[em][es][n] -= (lapr[em] + laps[em]);
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat lap[p_Nmt];
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em)
-          lap[em] = 0.f;
-
-        if(n < p_Np) {
-          // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-          for(int i = 0; i < p_NfacesNfp; ++i) {
-            const dfloat L = LIFTT[n + i * p_Np];
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em)
-              lap[em] += L * s_lapflux[em][es][i];
-          }
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em)
-            s_lapq[em][es][n] += lap[em];
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat Mlapq[p_Nmt];
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em)
-          Mlapq[em] = 0.f;
-
-        if(n < p_Np) {
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            const dfloat mm = MM[n + i * p_Np];
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em)
-              Mlapq[em] += mm * s_lapq[em][es][i];
-          }
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em) {
-            const dlong e = eo + es * p_Nmt + em;
-
-            if(e < Nelements) {
-              // const int element = elementList[e];
-              const dfloat J = vgeo[element[em] * p_Nvgeo + p_JID];
-
-              Aq[n + element[em] * p_Np] = J * Mlapq[em];
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxIpdgTri3D.okl b/src/libP/solvers/elliptic/okl/ellipticAxIpdgTri3D.okl
deleted file mode 100644
index b63f04961..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxIpdgTri3D.okl
+++ /dev/null
@@ -1,512 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-// nx,ny,nz,sJ,invJ - need WsJ
-
-@kernel void ellipticAxIpdgTri3D(const dlong Nelements,
-                                 @restrict const dlong*  vmapM,
-                                 @restrict const dlong*  vmapP,
-                                 const dfloat lambda,
-                                 const dfloat tau,
-                                 @restrict const dfloat*  vgeo,
-                                 @restrict const dfloat*  sgeo,
-                                 @restrict const int*  EToB,
-                                 @restrict const dfloat*  Dmatrices,
-                                 @restrict const dfloat*  LIFTT,
-                                 @restrict const dfloat*  MM,
-                                 @restrict const dfloat4*  gradq,
-                                 @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Np];
-    @shared dfloat s_dqdy[p_Np];
-    @shared dfloat s_dqdz[p_Np];
-    @shared dfloat s_lapq[p_Np];
-    @shared dfloat s_nxdq[p_NfacesNfp];
-    @shared dfloat s_nydq[p_NfacesNfp];
-    @shared dfloat s_nzdq[p_NfacesNfp];
-    @shared dfloat s_lapflux[p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Np];
-    @shared dfloat s_Lnydq[p_Np];
-    @shared dfloat s_Lnzdq[p_Np];
-    @exclusive dlong idM;
-    @exclusive dfloat nx, ny, nz, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        // assume that this stores (qx, qy, qz, q) as dfloat4
-        const dfloat4 gradqn = gradq[e * p_Np + n];
-
-        s_dqdx[n] = gradqn.x;
-        s_dqdy[n] = gradqn.y;
-        s_dqdz[n] = gradqn.z;
-        s_lapq[n] = lambda * gradqn.w;
-      }
-
-      if(n < p_NfacesNfp) {
-        const dlong id  = n + e * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-        const dlong idP = vmapP[id];
-        // find face that owns this node
-        const int face = n / p_Nfp;
-
-        dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
-        dfloat4 gradqP = gradq[idP];
-
-        // load surface geofactors for this face
-        dlong sid = p_Nsgeo * (e * p_Nfaces + face);
-        nx   = sgeo[sid + p_NXID];
-        ny   = sgeo[sid + p_NYID];
-        nz   = sgeo[sid + p_NZID];
-        sJ   = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        const dfloat dq = gradqP.w - gradqM.w;
-        const dfloat hlf = 0.5f;
-
-        s_nxdq[n] = hlf * sJ * invJ * nx * dq;
-        s_nydq[n] = hlf * sJ * invJ * ny * dq;
-        s_nzdq[n] = hlf * sJ * invJ * nz * dq;
-
-        s_lapflux[n] = hlf * sJ * invJ * (-nx * (gradqP.x - gradqM.x)
-                                          - ny * (gradqP.y - gradqM.y)
-                                          - nz * (gradqP.z - gradqM.z)
-                                          - tau * hinv * dq);
-      }
-    }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dlong gid = e * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat drdz = vgeo[gid + p_RZID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-        const dfloat dsdz = vgeo[gid + p_SZID];
-
-        dfloat Lnxdq = 0;
-        dfloat Lnydq = 0;
-        dfloat Lnzdq = 0;
-
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i) {
-          Lnxdq += LIFTT[n + i * p_Np] * s_nxdq[i];
-          Lnydq += LIFTT[n + i * p_Np] * s_nydq[i];
-          Lnzdq += LIFTT[n + i * p_Np] * s_nzdq[i];
-        }
-
-        dfloat dqdx = s_dqdx[n] + Lnxdq;
-        dfloat dqdy = s_dqdy[n] + Lnydq;
-        dfloat dqdz = s_dqdz[n] + Lnzdq;
-        s_dqdx[n] = drdx * dqdx + drdy * dqdy + drdz * dqdz; // abuse of notation
-        s_dqdy[n] = dsdx * dqdx + dsdy * dqdy + dsdz * dqdz;
-
-        s_Lnxdq[n] = Lnxdq;
-        s_Lnydq[n] = Lnydq;
-        s_Lnzdq[n] = Lnzdq;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        int id = idM % p_Np;
-        s_lapflux[n] += sJ * invJ * (nx * s_Lnxdq[id] + ny * s_Lnydq[id] + nz * s_Lnzdq[id]);
-      }
-
-      if(n < p_Np) {
-        dfloat lapr = 0, laps = 0;
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          lapr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_dqdx[i];
-          laps += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_dqdy[i];
-        }
-
-        s_lapq[n] -= (lapr + laps);
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lap = 0;
-
-        // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          lap += LIFTT[n + i * p_Np] * s_lapflux[i];
-
-        s_lapq[n] += lap;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[e * p_Nvgeo + p_JID];
-
-        dfloat Mlapq = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapq += MM[n + i * p_Np] * s_lapq[i];
-
-        Aq[n + e * p_Np] = J * Mlapq;
-      }
-    }
-  }
-}
-
-// // Optimized sizes for @kernel 4-5
-#if p_N == 1
-#define p_NbV 5
-#define p_Nmt 1
-#endif
-
-#if p_N == 2
-#define p_NbV 7
-#define p_Nmt 2
-#endif
-
-#if p_N == 3
-#define p_NbV 5
-#define p_Nmt 2
-#endif
-
-#if p_N == 4
-#define p_NbV 2
-#define p_Nmt 2
-#endif
-
-#if p_N == 5
-#define p_NbV 3
-#define p_Nmt 2
-#endif
-
-#if p_N == 6
-#define p_NbV 4
-#define p_Nmt 2
-#endif
-
-#if p_N == 7
-#define p_NbV 4
-#define p_Nmt 2
-#endif
-
-#if p_N == 8
-#define p_NbV 2
-#define p_Nmt 3
-#endif
-
-#if p_N == 9
-#define p_NbV 2
-#define p_Nmt 3
-#endif
-
-#if p_N == 10
-#define p_NbV 3
-#define p_Nmt 3
-#endif
-
-// Added multiple element per threadblock
-@kernel void ellipticPartialAxIpdgTri3D(const dlong Nelements,
-                                        @restrict const dlong*  elementList,
-                                        @restrict const dlong*  vmapM,
-                                        @restrict const dlong*  vmapP,
-                                        const dfloat lambda,
-                                        const dfloat tau,
-                                        @restrict const dfloat*  vgeo,
-                                        @restrict const dfloat*  sgeo,
-                                        @restrict const int*  EToB,
-                                        @restrict const dfloat*  Dmatrices,
-                                        @restrict const dfloat*  LIFTT,
-                                        @restrict const dfloat*  MM,
-                                        @restrict const dfloat4*  gradq,
-                                        @restrict dfloat*  Aq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += (p_NbV * p_Nmt); @outer(0)) {
-    @shared dfloat s_dqdx[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_dqdy[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_dqdz[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_lapq[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_nxdq[p_Nmt][p_NbV][p_NfacesNfp];
-    @shared dfloat s_nydq[p_Nmt][p_NbV][p_NfacesNfp];
-    @shared dfloat s_nzdq[p_Nmt][p_NbV][p_NfacesNfp];
-    @shared dfloat s_lapflux[p_Nmt][p_NbV][p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_Lnydq[p_Nmt][p_NbV][p_Np];
-    @shared dfloat s_Lnzdq[p_Nmt][p_NbV][p_Np];
-
-    // @shared dlong idM[p_Nmt];
-    // @shared dfloat nx[p_Nmt], ny[p_Nmt], sJ[p_Nmt];
-    // @shared dfloat invJ[p_Nmt], hinv[p_Nmt];
-
-    @exclusive dlong idM[p_Nmt];
-    @exclusive dfloat nx[p_Nmt], ny[p_Nmt], nz[p_Nmt], sJ[p_Nmt];
-    @exclusive dfloat invJ[p_Nmt], hinv[p_Nmt];
-
-    @exclusive dlong element[p_Nmt];
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em;
-          if(e < Nelements)
-            element[em] = elementList[e];
-        }
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em;
-          //const int element = elementList[e];
-
-          if(e < Nelements) {
-            if(n < p_Np) {
-              // assume that this stores (qx, qy, qz, q) as dfloat4
-
-              const dfloat4 gradqn = gradq[element[em] * p_Np + n];
-
-              s_dqdx[em][es][n] = gradqn.x;
-              s_dqdy[em][es][n] = gradqn.y;
-              s_dqdz[em][es][n] = gradqn.z;
-              s_lapq[em][es][n] = lambda * gradqn.w;
-            }
-
-            if(n < p_NfacesNfp) {
-              const dlong id  = n + element[em] * p_Nfaces * p_Nfp;
-              idM[em] = vmapM[id];
-              const dlong idP = vmapP[id];
-              // find face that owns this node
-              const int face = n / p_Nfp;
-
-              dfloat4 gradqM = gradq[idM[em]];// could fetch from @shared after barrier
-              dfloat4 gradqP = gradq[idP];
-
-              // load surface geofactors for this face
-              dlong sid = p_Nsgeo * (element[em] * p_Nfaces + face);
-              nx[em]   = sgeo[sid + p_NXID];
-              ny[em]   = sgeo[sid + p_NYID];
-              nz[em]   = sgeo[sid + p_NZID];
-              sJ[em]   = sgeo[sid + p_SJID];
-              invJ[em] = sgeo[sid + p_IJID];
-              hinv[em] = sgeo[sid + p_IHID];
-
-              const dfloat dq = gradqP.w - gradqM.w;
-              const dfloat hlf = 0.5f;
-
-              s_nxdq[em][es][n] = hlf * sJ[em] * invJ[em] * nx[em] * dq;
-              s_nydq[em][es][n] = hlf * sJ[em] * invJ[em] * ny[em] * dq;
-              s_nzdq[em][es][n] = hlf * sJ[em] * invJ[em] * nz[em] * dq;
-
-              s_lapflux[em][es][n] = hlf * sJ[em] * invJ[em] * (-nx[em] * (gradqP.x - gradqM.x)
-                                                                - ny[em] * (gradqP.y - gradqM.y)
-                                                                - nz[em] * (gradqP.z - gradqM.z)
-                                                                - tau * hinv[em] * dq);
-            }
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat Lnxdq[p_Nmt], Lnydq[p_Nmt], Lnzdq[p_Nmt];
-
-        // // Try holding drdx in register array
-        // const dfloat drdx[p_Nmt], drdy[p_Nmt];
-        // const dfloat dsdx[p_Nmt], dsdy[p_Nmt];
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          Lnxdq[em] = 0.f;
-          Lnydq[em] = 0.f;
-          Lnzdq[em] = 0.f;
-          //
-          //  const int e = eo+es*p_Nmt + em;
-          //  const int gid = element*p_Nvgeo;
-          // drdx[em] = vgeo[gid + p_RXID];
-          // drdy[em] = vgeo[gid + p_RYID];
-          // dsdx[em] = vgeo[gid + p_SXID];
-          // dsdy[em] = vgeo[gid + p_SYID];
-        }
-
-        if(n < p_Np) {
-#pragma unroll p_NfacesNfp
-          for(int i = 0; i < p_NfacesNfp; ++i) {
-            const dfloat L = LIFTT[n + i * p_Np];
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em) {
-              Lnxdq[em] += L * s_nxdq[em][es][i];
-              Lnydq[em] += L * s_nydq[em][es][i];
-              Lnzdq[em] += L * s_nzdq[em][es][i];
-            }
-          }
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em) {
-            const dlong e = eo + es * p_Nmt + em;
-            if(e < Nelements) {
-              // const int element = elementList[e];
-              const dlong gid = element[em] * p_Nvgeo;
-
-              // These data can be stored on @shared
-              const dfloat drdx = vgeo[gid + p_RXID];
-              const dfloat drdy = vgeo[gid + p_RYID];
-              const dfloat drdz = vgeo[gid + p_RZID];
-              const dfloat dsdx = vgeo[gid + p_SXID];
-              const dfloat dsdy = vgeo[gid + p_SYID];
-              const dfloat dsdz = vgeo[gid + p_SZID];
-
-              dfloat dqdx = s_dqdx[em][es][n] + Lnxdq[em];
-              dfloat dqdy = s_dqdy[em][es][n] + Lnydq[em];
-              dfloat dqdz = s_dqdz[em][es][n] + Lnzdq[em];
-              s_dqdx[em][es][n] = drdx * dqdx + drdy * dqdy + drdz * dqdz; // abuse of notation
-              s_dqdy[em][es][n] = dsdx * dqdx + dsdy * dqdy + dsdz * dqdz;
-
-              s_Lnxdq[em][es][n] = Lnxdq[em];
-              s_Lnydq[em][es][n] = Lnydq[em];
-              s_Lnzdq[em][es][n] = Lnzdq[em];
-            }
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat lapr[p_Nmt], laps[p_Nmt];
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          if(n < p_NfacesNfp) {
-            int id = idM[em] % p_Np;
-            s_lapflux[em][es][n] += sJ[em] * invJ[em] * (nx[em] * s_Lnxdq[em][es][id]
-                                                         + ny[em] * s_Lnydq[em][es][id]
-                                                         + nz[em] * s_Lnzdq[em][es][id]);
-          }
-
-          lapr[em] = 0.f;
-          laps[em] = 0.f;
-        }
-
-        if(n < p_Np) {
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            const dfloat drT = Dmatrices[n + i * p_Np + 0 * p_Np * p_Np];
-            const dfloat dsT = Dmatrices[n + i * p_Np + 1 * p_Np * p_Np];
-
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em) {
-              lapr[em] += drT * s_dqdx[em][es][i];
-              laps[em] += dsT * s_dqdy[em][es][i];
-            }
-          }
-
-          if(n < p_Np) {
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em)
-              s_lapq[em][es][n] -= (lapr[em] + laps[em]);
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat lap[p_Nmt];
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em)
-          lap[em] = 0.f;
-
-        if(n < p_Np) {
-          // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-          for(int i = 0; i < p_NfacesNfp; ++i) {
-            const dfloat L = LIFTT[n + i * p_Np];
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em)
-              lap[em] += L * s_lapflux[em][es][i];
-          }
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em)
-            s_lapq[em][es][n] += lap[em];
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {//
-      for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-        dfloat Mlapq[p_Nmt];
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em)
-          Mlapq[em] = 0.f;
-
-        if(n < p_Np) {
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            const dfloat mm = MM[n + i * p_Np];
-#pragma unroll p_Nmt
-            for(int em = 0; em < p_Nmt; ++em)
-              Mlapq[em] += mm * s_lapq[em][es][i];
-          }
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em) {
-            const dlong e = eo + es * p_Nmt + em;
-
-            if(e < Nelements) {
-              // const int element = elementList[e];
-              const dfloat J = vgeo[element[em] * p_Nvgeo + p_JID];
-
-              Aq[n + element[em] * p_Np] = J * Mlapq[em];
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticAxQuad2D.okl
deleted file mode 100644
index ef29f433b..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxQuad2D.okl
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// hex @kernel for screened coulomb potential mat-vec
-#define squareThreads                           \
-  for(int j = 0; j < p_Nq; ++j; @inner(1))           \
-  for(int i = 0; i < p_Nq; ++i; @inner(0))
-
-// square thread version
-@kernel void ellipticAxQuad2D(const dlong Nelements,
-                              @restrict const dfloat*  ggeo,
-                              @restrict const dfloat*  D,
-                              @restrict const dfloat*  S,
-                              @restrict const dfloat*  MM,
-                              const dfloat lambda,
-                              @restrict const dfloat*  q,
-                              @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_q[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dfloat r_qr, r_qs, r_Aq;
-    @exclusive dfloat r_G00, r_G01, r_G11, r_GwJ;
-
-    // prefetch q(:,:,:,e) to @shared
-    squareThreads {
-      const dlong base = i + j * p_Nq + e * p_Np;
-
-      s_q[j][i] = q[base];
-
-      // fetch D to @shared
-      s_D[j][i] = D[j * p_Nq + i];
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      const dlong base = e * p_Nggeo * p_Np + j * p_Nq + i;
-
-      // assumes w*J built into G entries
-      r_GwJ = ggeo[base + p_GWJID * p_Np];
-
-      r_G00 = ggeo[base + p_G00ID * p_Np];
-      r_G01 = ggeo[base + p_G01ID * p_Np];
-
-      r_G11 = ggeo[base + p_G11ID * p_Np];
-
-      dfloat qr = 0.f, qs = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n) {
-        qr += s_D[i][n] * s_q[j][n];
-        qs += s_D[j][n] * s_q[n][i];
-      }
-
-      r_qr = qr;
-      r_qs = qs;
-
-      r_Aq = r_GwJ * lambda * s_q[j][i];
-    }
-
-    // r term ----->
-    @barrier("local");
-
-    squareThreads {
-      s_q[j][i] = r_G00 * r_qr + r_G01 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      dfloat tmp = 0.f;
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][i] * s_q[j][n];
-
-      r_Aq += tmp;
-    }
-
-    // s term ---->
-    @barrier("local");
-
-    squareThreads {
-      s_q[j][i] = r_G01 * r_qr + r_G11 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      dfloat tmp = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][j] * s_q[n][i];
-
-      r_Aq += tmp;
-
-      const dlong base = e * p_Np + j * p_Nq + i;
-      Aq[base] = r_Aq;
-    }
-  }
-}
-
-// square thread version
-@kernel void ellipticPartialAxQuad2D(const dlong Nelements,
-                                     @restrict const dlong*  elementList,
-                                     @restrict const dfloat*  ggeo,
-                                     @restrict const dfloat*  D,
-                                     @restrict const dfloat*  S,
-                                     @restrict const dfloat*  MM,
-                                     const dfloat lambda,
-                                     @restrict const dfloat*  q,
-                                     @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_q[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dlong element;
-    @exclusive dfloat r_qr, r_qs, r_Aq;
-    @exclusive dfloat r_G00, r_G01, r_G11, r_GwJ;
-
-    // prefetch q(:,:,:,e) to @shared
-    squareThreads {
-      element = elementList[e];
-      const dlong base = i + j * p_Nq + element * p_Np;
-
-      s_q[j][i] = q[base];
-
-      // fetch D to @shared
-      s_D[j][i] = D[j * p_Nq + i];
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      const dlong base = element * p_Nggeo * p_Np + j * p_Nq + i;
-
-      // assumes w*J built into G entries
-      r_GwJ = ggeo[base + p_GWJID * p_Np];
-
-      r_G00 = ggeo[base + p_G00ID * p_Np];
-      r_G01 = ggeo[base + p_G01ID * p_Np];
-
-      r_G11 = ggeo[base + p_G11ID * p_Np];
-
-      dfloat qr = 0.f, qs = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n) {
-        qr += s_D[i][n] * s_q[j][n];
-        qs += s_D[j][n] * s_q[n][i];
-      }
-
-      r_qr = qr;
-      r_qs = qs;
-
-      r_Aq = r_GwJ * lambda * s_q[j][i];
-    }
-
-    // r term ----->
-    @barrier("local");
-
-    squareThreads {
-      s_q[j][i] = r_G00 * r_qr + r_G01 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      dfloat tmp = 0.f;
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][i] * s_q[j][n];
-
-      r_Aq += tmp;
-    }
-
-    // s term ---->
-    @barrier("local");
-
-    squareThreads {
-      s_q[j][i] = r_G01 * r_qr + r_G11 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      dfloat tmp = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][j] * s_q[n][i];
-
-      r_Aq += tmp;
-
-      const dlong base = element * p_Np + j * p_Nq + i;
-      Aq[base] = r_Aq;
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxQuad3D.okl b/src/libP/solvers/elliptic/okl/ellipticAxQuad3D.okl
deleted file mode 100644
index 79d279b5e..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxQuad3D.okl
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// hex @kernel for screened coulomb potential mat-vec
-#define squareThreads                           \
-  for(int j = 0; j < p_Nq; ++j; @inner(1))  \
-  for(int i = 0; i < p_Nq; ++i; @inner(0))
-
-// square thread version
-@kernel void ellipticAxQuad3D(const dlong Nelements,
-                              @restrict const dfloat*  ggeo,
-                              @restrict const dfloat*  D,
-                              @restrict const dfloat*  S,
-                              @restrict const dfloat*  MM,
-                              const dfloat lambda,
-                              @restrict const dfloat*  q,
-                              @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_q[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dfloat r_qr, r_qs, r_q, r_Aq;
-    @exclusive pfloat r_G00, r_G01, r_G02;
-    @exclusive pfloat r_G11, r_G12;
-    @exclusive pfloat r_G22;
-    @exclusive pfloat r_GwJ;
-
-    // prefetch q(:,:,:,e) to @shared
-    squareThreads {
-      const dlong base = i + j * p_Nq + e * p_Np;
-
-      s_q[j][i] = q[base];
-
-      // fetch D to @shared
-      s_D[j][i] = D[j * p_Nq + i];
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      const dlong base = e * p_Nggeo * p_Np + j * p_Nq + i;
-
-      // assumes w*J built into G entries
-      r_GwJ = ggeo[base + p_GWJID * p_Np];
-
-      r_G00 = ggeo[base + p_G00ID * p_Np];
-      r_G01 = ggeo[base + p_G01ID * p_Np];
-      r_G02 = ggeo[base + p_G02ID * p_Np];
-      r_G11 = ggeo[base + p_G11ID * p_Np];
-      r_G12 = ggeo[base + p_G12ID * p_Np];
-      r_G22 = ggeo[base + p_G22ID * p_Np];
-
-      dfloat qr = 0.f, qs = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n) {
-        qr += s_D[i][n] * s_q[j][n];
-        qs += s_D[j][n] * s_q[n][i];
-      }
-
-      r_qr = qr;
-      r_qs = qs;
-      r_q = s_q[j][i];
-
-      r_Aq = r_GwJ * lambda * r_q;
-    }
-
-    // r term ----->
-    @barrier("local");
-
-    squareThreads {
-      // s_q[j][i] = r_G00*r_qr + r_G01*r_qs + r_G02*r_q;
-      s_q[j][i] = r_G00 * r_qr + r_G01 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      dfloat tmp = 0.f;
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][i] * s_q[j][n];
-      r_Aq += tmp;
-    }
-
-    // // t term ---->
-
-    // squareThreads{
-    //   r_Aq += r_G02*r_qr + r_G12*r_qs + r_G22*r_q;
-    // }
-
-    // s term ---->
-    @barrier("local");
-
-    squareThreads {
-      // s_q[j][i] = r_G01*r_qr + r_G11*r_qs + 0.f*r_G12*r_q;
-      s_q[j][i] = r_G01 * r_qr + r_G11 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      // dfloat tmp = r_G22*r_q;
-      dfloat tmp = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][j] * s_q[n][i];
-
-      r_Aq += tmp;
-
-      const dlong base = e * p_Np + j * p_Nq + i;
-      Aq[base] = r_Aq;
-    }
-  }
-}
-
-// square thread version
-@kernel void ellipticPartialAxQuad3D(const dlong Nelements,
-                                     @restrict const dlong*  elementList,
-                                     @restrict const dfloat*  ggeo,
-                                     @restrict const dfloat*  D,
-                                     @restrict const dfloat*  S,
-                                     @restrict const dfloat*  MM,
-                                     const dfloat lambda,
-                                     @restrict const dfloat*  q,
-                                     @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_q[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dlong element;
-    @exclusive dfloat r_qr, r_qs, r_q, r_Aq;
-    @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ;
-
-    // prefetch q(:,:,:,e) to @shared
-    squareThreads {
-      element = elementList[e];
-      const dlong base = i + j * p_Nq + element * p_Np;
-
-      s_q[j][i] = q[base];
-
-      // fetch D to @shared
-      s_D[j][i] = D[j * p_Nq + i];
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      const dlong base = element * p_Nggeo * p_Np + j * p_Nq + i;
-      // assumes w*J built into G entries
-      r_GwJ = ggeo[base + p_GWJID * p_Np];
-      r_G00 = ggeo[base + p_G00ID * p_Np];
-      r_G01 = ggeo[base + p_G01ID * p_Np];
-      r_G02 = ggeo[base + p_G02ID * p_Np];
-      r_G11 = ggeo[base + p_G11ID * p_Np];
-      r_G12 = ggeo[base + p_G12ID * p_Np];
-      r_G22 = ggeo[base + p_G22ID * p_Np];
-
-      dfloat qr = 0.f, qs = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n) {
-        qr += s_D[i][n] * s_q[j][n];
-        qs += s_D[j][n] * s_q[n][i];
-      }
-
-      r_qr = qr;
-      r_qs = qs;
-      r_q = s_q[j][i];
-
-      r_Aq = r_GwJ * lambda * r_q;
-    }
-
-    // r term ----->
-    @barrier("local");
-
-    squareThreads {
-      // s_q[j][i] =  r_G00*r_qr + r_G01*r_qs + r_G02*r_q;
-      s_q[j][i] =  r_G00 * r_qr + r_G01 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      dfloat tmp = 0.f;
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][i] * s_q[j][n];
-
-      r_Aq += tmp;
-      // r_Aq += 0.f;
-    }
-
-    // // ts term ---->
-    // squareThreads{
-    //   r_Aq += r_G02*r_qr + r_G12*r_qs + r_G22*r_q;
-    // }
-
-    // s term ---->
-    @barrier("local");
-
-    squareThreads {
-      // s_q[j][i] = r_G01*r_qr + r_G11*r_qs + r_G12*r_q;
-      s_q[j][i] = r_G01 * r_qr + r_G11 * r_qs;
-    }
-
-    @barrier("local");
-
-    squareThreads {
-      // dfloat tmp = r_G22*r_q;
-      dfloat tmp = 0.f;
-
-#pragma unroll p_Nq
-      for(int n = 0; n < p_Nq; ++n)
-        tmp += s_D[n][j] * s_q[n][i];
-
-      r_Aq += tmp;
-
-      const dlong base = element * p_Np + j * p_Nq + i;
-      Aq[base] = r_Aq;
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticAxTet3D.okl
deleted file mode 100644
index e698cf896..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxTet3D.okl
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticAxTet3D(const dlong Nelements,
-                             @restrict const dfloat*  ggeo,
-                             @restrict const dfloat*  Dmatrices,
-                             @restrict const dfloat*  Smatrices,
-                             @restrict const dfloat*  MM,
-                             const dfloat lambda,
-                             @restrict const dfloat*  q,
-                             @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; e++; @outer(0)) {
-    @shared dfloat s_q[p_Np];
-
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      //prefetch q
-      const dlong id = n + e * p_Np;
-      s_q[n] = q[id];
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const dlong gid = e * p_Nggeo;
-
-      const dfloat Grr = ggeo[gid + p_G00ID];
-      const dfloat Grs = ggeo[gid + p_G01ID];
-      const dfloat Grt = ggeo[gid + p_G02ID];
-      const dfloat Gss = ggeo[gid + p_G11ID];
-      const dfloat Gst = ggeo[gid + p_G12ID];
-      const dfloat Gtt = ggeo[gid + p_G22ID];
-      const dfloat J   = ggeo[gid + p_GWJID];
-
-      dfloat qrr = 0.;
-      dfloat qrs = 0.;
-      dfloat qrt = 0.;
-      dfloat qss = 0.;
-      dfloat qst = 0.;
-      dfloat qtt = 0.;
-      dfloat qM = 0.;
-
-#pragma unroll p_Np
-      for (int k = 0; k < p_Np; k++) {
-        qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * s_q[k];
-        qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * s_q[k];
-        qrt += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * s_q[k];
-        qss += Smatrices[n + k * p_Np + 3 * p_Np * p_Np] * s_q[k];
-        qst += Smatrices[n + k * p_Np + 4 * p_Np * p_Np] * s_q[k];
-        qtt += Smatrices[n + k * p_Np + 5 * p_Np * p_Np] * s_q[k];
-        qM  += MM[n + k * p_Np] * s_q[k];
-      }
-
-      const dlong id = n + e * p_Np;
-
-      Aq[id] = Grr * qrr + Grs * qrs + Grt * qrt
-               + Gss * qss + Gst * qst + Gtt * qtt
-               + J * lambda * qM;
-    }
-  }
-}
-
-@kernel void ellipticPartialAxTet3D_v0(const dlong Nelements,
-                                       @restrict const dlong*  elementList,
-                                       @restrict const dfloat*  ggeo,
-                                       @restrict const dfloat*  Dmatrices,
-                                       @restrict const dfloat*  Smatrices,
-                                       @restrict const dfloat*  MM,
-                                       const dfloat lambda,
-                                       @restrict const dfloat*  q,
-                                       @restrict dfloat*  Aq)
-{
-  for(dlong e = 0; e < Nelements; e++; @outer(0)) {
-    @shared dfloat s_q[p_Np];
-
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      //prefetch q
-      const dlong element = elementList[e];
-      const dlong id = n + element * p_Np;
-      s_q[n] = q[id];
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const dlong element = elementList[e];
-      const dlong gid = element * p_Nggeo;
-
-      const dfloat Grr = ggeo[gid + p_G00ID];
-      const dfloat Grs = ggeo[gid + p_G01ID];
-      const dfloat Grt = ggeo[gid + p_G02ID];
-      const dfloat Gss = ggeo[gid + p_G11ID];
-      const dfloat Gst = ggeo[gid + p_G12ID];
-      const dfloat Gtt = ggeo[gid + p_G22ID];
-      const dfloat J   = ggeo[gid + p_GWJID];
-
-      dfloat qrr = 0.;
-      dfloat qrs = 0.;
-      dfloat qrt = 0.;
-      dfloat qss = 0.;
-      dfloat qst = 0.;
-      dfloat qtt = 0.;
-      dfloat qM = 0.;
-
-#pragma unroll p_Np
-      for (int k = 0; k < p_Np; k++) {
-        qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * s_q[k];
-        qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * s_q[k];
-        qrt += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * s_q[k];
-        qss += Smatrices[n + k * p_Np + 3 * p_Np * p_Np] * s_q[k];
-        qst += Smatrices[n + k * p_Np + 4 * p_Np * p_Np] * s_q[k];
-        qtt += Smatrices[n + k * p_Np + 5 * p_Np * p_Np] * s_q[k];
-        qM  += MM[n + k * p_Np] * s_q[k];
-      }
-
-      const dlong id = n + element * p_Np;
-
-      Aq[id] = Grr * qrr + Grs * qrs + Grt * qrt
-               + Gss * qss + Gst * qst + Gtt * qtt
-               + J * lambda * qM;
-    }
-  }
-}
-
-//Ref3 from benchmarks
-// number of outputs per thread
-// important to tune this at low order
-
-// p_Ne: number of outputs per thread
-// p_Nb: number of Np blocks per threadblock
-
-#if p_N == 1
-#define p_Ne 2
-#define p_Nb 8
-#elif p_N == 2
-#define p_Ne 3
-#define p_Nb 3
-#elif p_N == 3
-#define p_Ne 3
-#define p_Nb 3
-#elif p_N == 4
-#define p_Ne 3
-#define p_Nb 5
-#elif p_N == 5
-#define p_Ne 3
-#define p_Nb 5
-#elif p_N == 6
-#define p_Ne 4
-#define p_Nb 6
-#else  /* if p_N == 1 */
-// from N=7
-#define p_Ne 4
-#define p_Nb 2
-#endif
-
-// #define p_Ne 4
-// #define p_Nb 2
-@kernel void ellipticPartialAxTet3D(const dlong Nelements,
-                                    @restrict const dlong*  elementList,
-                                    @restrict const dfloat*  ggeo,
-                                    @restrict const dfloat*  Dmatrices,
-                                    @restrict const dfloat*  Smatrices,
-                                    @restrict const dfloat*  MM,
-                                    const dfloat lambda,
-                                    @restrict const dfloat*  q,
-                                    @restrict dfloat*  Aq)
-{
-  // p_Ne nodes per thread
-  // p_Nb elements worth of threads per block
-
-  for(dlong eo = 0; eo < Nelements; eo += p_Ne * p_Nb; @outer(0)) {
-    @shared dfloat s_q[p_Ne][p_Nb][p_Np];
-    @shared dfloat s_ggeo[p_Ne][p_Nb][p_Nggeo];
-
-    @exclusive dlong element[p_Ne];
-
-    for(int b = 0; b < p_Nb; ++b; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-#pragma unroll p_Ne
-        for(int et = 0; et < p_Ne; ++et) {
-          const dlong e = eo + b + p_Nb * et;
-
-          if(e < Nelements) {
-            element[et] = elementList[e];
-
-            const dlong id = n + element[et] * p_Np;
-            s_q[et][b][n] = q[id];
-
-            int m = n;
-            while(m < p_Nggeo) {
-              s_ggeo[et][b][m] = ggeo[element[et] * p_Nggeo + m];
-              m += p_Np;
-            }
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int b = 0; b < p_Nb; ++b; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        dfloat qrr[p_Ne], qrs[p_Ne], qrt[p_Ne], qss[p_Ne], qst[p_Ne], qtt[p_Ne], qM[p_Ne];
-
-#pragma unroll p_Ne
-        for(int et = 0; et < p_Ne; ++et) {
-          qrr[et] = 0;
-          qrs[et] = 0;
-          qrt[et] = 0;
-          qss[et] = 0;
-          qst[et] = 0;
-          qtt[et] = 0;
-          qM[et] = 0;
-        }
-
-        // overall this does p_Ne*14 flops for  (7+p_Ne)*|dfloat| L1+@shared accesse
-        // arithmetic intensity is  (p_Ne*14/((7+p_Ne)*8)) flops per byte
-#pragma unroll p_Np
-        for (int k = 0; k < p_Np; k++) {
-          const dfloat Srr_nk = Smatrices[n + k * p_Np + 0 * p_Np * p_Np];
-          const dfloat Srs_nk = Smatrices[n + k * p_Np + 1 * p_Np * p_Np];
-          const dfloat Srt_nk = Smatrices[n + k * p_Np + 2 * p_Np * p_Np];
-          const dfloat Sss_nk = Smatrices[n + k * p_Np + 3 * p_Np * p_Np];
-          const dfloat Sst_nk = Smatrices[n + k * p_Np + 4 * p_Np * p_Np];
-          const dfloat Stt_nk = Smatrices[n + k * p_Np + 5 * p_Np * p_Np];
-          const dfloat MM_nk =    MM[n + k * p_Np];
-
-#pragma unroll p_Ne
-          for(int et = 0; et < p_Ne; ++et) {
-            const dfloat qk = s_q[et][b][k];
-            qrr[et] += Srr_nk * qk;
-            qrs[et] += Srs_nk * qk; // assume (Srs stores Srs+Ssr)
-            qrt[et] += Srt_nk * qk; // assume (Srt stores Srt+Str)
-            qss[et] += Sss_nk * qk;
-            qst[et] += Sst_nk * qk; // assume (Sst stores Sst+Sts)
-            qtt[et] += Stt_nk * qk;
-            qM[et]  += MM_nk * qk;
-          }
-        }
-
-#pragma unroll p_Ne
-        for(int et = 0; et < p_Ne; ++et) {
-          const dlong e = eo + b + p_Nb * et;
-          if(e < Nelements) {
-            const dfloat Grr = s_ggeo[et][b][p_G00ID];
-            const dfloat Grs = s_ggeo[et][b][p_G01ID];
-            const dfloat Grt = s_ggeo[et][b][p_G02ID];
-            const dfloat Gss = s_ggeo[et][b][p_G11ID];
-            const dfloat Gst = s_ggeo[et][b][p_G12ID];
-            const dfloat Gtt = s_ggeo[et][b][p_G22ID];
-            const dfloat J   = s_ggeo[et][b][p_GWJID];
-
-            const dlong id = n + element[et] * p_Np;
-
-            Aq[id] =
-              Grr * qrr[et] +
-              Grs * qrs[et] +
-              Grt * qrt[et] +
-              Gss * qss[et] +
-              Gst * qst[et] +
-              Gtt * qtt[et] +
-              J * lambda * qM[et];
-          }
-        }
-      }
-    }
-  }
-}
-#undef p_Ne
-#undef p_Nb
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticAxTri2D.okl
deleted file mode 100644
index 4c125d4cd..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxTri2D.okl
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticAxTri2D(const dlong Nelements,
-                             @restrict const dfloat*  ggeo,
-                             @restrict const dfloat*  Dmatrices,
-                             @restrict const dfloat*  Smatrices,
-                             @restrict const dfloat*  MM,
-                             const dfloat lambda,
-                             @restrict const dfloat*  q,
-                             @restrict dfloat*  Aq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if (e < Nelements) {
-          //prefetch q
-          const dlong id = n + e * p_Np;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if (e < Nelements) {
-          const dlong es = e - eo;
-          const dlong gid = e * p_Nggeo;
-
-          const dfloat Grr = ggeo[gid + p_G00ID];
-          const dfloat Grs = ggeo[gid + p_G01ID];
-          const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
-
-          dfloat qrr = 0.;
-          dfloat qrs = 0.;
-          dfloat qss = 0.;
-          dfloat qM = 0.;
-
-#pragma unroll p_Np
-          for (int k = 0; k < p_Np; k++) {
-            dfloat qn = s_q[es][k];
-            qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * qn;
-            qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * qn;
-            qss += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * qn;
-            qM  += MM[n + k * p_Np] * s_q[es][k];
-          }
-
-          const dlong id = n + e * p_Np;
-
-          Aq[id] = Grr * qrr + Grs * qrs + Gss * qss + J * lambda * qM;
-        }
-      }
-    }
-  }
-}
-//Analysis:
-// We perform (per thread block)
-// Nelements per block x Np x (Np x 5+10) flops
-// We load: (Nelements per block) x Np to @shared
-// We read (Nelements per block) x Np xNp x 5 times from @shared
-// We request: (Nelements per block) x Np x(1+4+Npx5 variables
-// We store (Nelements per block) x Np x (1) variables
-
-@kernel void ellipticPartialAxTri2D(const dlong Nelements,
-                                    @restrict const dlong*  elementList,
-                                    @restrict const dfloat*  ggeo,
-                                    @restrict const dfloat*  Dmatrices,
-                                    @restrict const dfloat*  Smatrices,
-                                    @restrict const dfloat*  MM,
-                                    const dfloat lambda,
-                                    @restrict const dfloat*  q,
-                                    @restrict dfloat*  Aq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if (e < Nelements) {
-          //prefetch q
-          const dlong element = elementList[e];
-          const dlong id = n + element * p_Np;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if (e < Nelements) {
-          const dlong es = e - eo;
-          const dlong element = elementList[e];
-          const dlong gid = element * p_Nggeo;
-
-          const dfloat Grr = ggeo[gid + p_G00ID];
-          const dfloat Grs = ggeo[gid + p_G01ID];
-          const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
-
-          dfloat qrr = 0.;
-          dfloat qrs = 0.;
-          dfloat qss = 0.;
-          dfloat qM = 0.;
-
-#pragma unroll p_Np
-          for (int k = 0; k < p_Np; k++) {
-            dfloat qn = s_q[es][k];
-            qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * qn;
-            qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * qn;
-            qss += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * qn;
-            qM  += MM[n + k * p_Np] * s_q[es][k];
-          }
-
-          const dlong id = n + element * p_Np;
-
-          Aq[id] = Grr * qrr + Grs * qrs + Gss * qss + J * lambda * qM;
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticAxTri3D.okl b/src/libP/solvers/elliptic/okl/ellipticAxTri3D.okl
deleted file mode 100644
index df3ace18c..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticAxTri3D.okl
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticAxTri3D(const dlong Nelements,
-                             @restrict const dfloat*  ggeo,
-                             @restrict const dfloat*  Dmatrices,
-                             @restrict const dfloat*  Smatrices,
-                             @restrict const dfloat*  MM,
-                             const dfloat lambda,
-                             @restrict const dfloat*  q,
-                             @restrict dfloat*  Aq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if (e < Nelements) {
-          //prefetch q
-          const dlong id = n + e * p_Np;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if (e < Nelements) {
-          const dlong es = e - eo;
-          const dlong gid = e * p_Nggeo;
-
-          const dfloat Grr = ggeo[gid + p_G00ID];
-          const dfloat Grs = ggeo[gid + p_G01ID];
-          const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
-
-          dfloat qrr = 0.;
-          dfloat qrs = 0.;
-          dfloat qss = 0.;
-          dfloat qM = 0.;
-
-#pragma unroll p_Np
-          for (int k = 0; k < p_Np; k++) {
-            dfloat qn = s_q[es][k];
-            qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * qn;
-            qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * qn;
-            qss += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * qn;
-            qM  += MM[n + k * p_Np] * s_q[es][k];
-          }
-
-          const dlong id = n + e * p_Np;
-
-          Aq[id] = Grr * qrr + Grs * qrs + Gss * qss + J * lambda * qM;
-        }
-      }
-    }
-  }
-}
-//Analysis:
-// We perform (per thread block)
-// Nelements per block x Np x (Np x 5+10) flops
-// We load: (Nelements per block) x Np to @shared
-// We read (Nelements per block) x Np xNp x 5 times from @shared
-// We request: (Nelements per block) x Np x(1+4+Npx5 variables
-// We store (Nelements per block) x Np x (1) variables
-
-@kernel void ellipticPartialAxTri3D(const dlong Nelements,
-                                    @restrict const dlong*  elementList,
-                                    @restrict const dfloat*  ggeo,
-                                    @restrict const dfloat*  Dmatrices,
-                                    @restrict const dfloat*  Smatrices,
-                                    @restrict const dfloat*  MM,
-                                    const dfloat lambda,
-                                    @restrict const dfloat*  q,
-                                    @restrict dfloat*  Aq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if (e < Nelements) {
-          //prefetch q
-          const dlong element = elementList[e];
-          const dlong id = n + element * p_Np;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if (e < Nelements) {
-          const dlong es = e - eo;
-          const dlong element = elementList[e];
-          const dlong gid = element * p_Nggeo;
-
-          const dfloat Grr = ggeo[gid + p_G00ID];
-          const dfloat Grs = ggeo[gid + p_G01ID];
-          const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
-
-          dfloat qrr = 0.;
-          dfloat qrs = 0.;
-          dfloat qss = 0.;
-          dfloat qM = 0.;
-
-#pragma unroll p_Np
-          for (int k = 0; k < p_Np; k++) {
-            dfloat qn = s_q[es][k];
-            qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * qn;
-            qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * qn;
-            qss += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * qn;
-            qM  += MM[n + k * p_Np] * s_q[es][k];
-          }
-
-          const dlong id = n + element * p_Np;
-
-          Aq[id] = Grr * qrr + Grs * qrs + Gss * qss + J * lambda * qM;
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticBlockAddBCHex3D.okl b/src/libP/solvers/elliptic/okl/ellipticBlockAddBCHex3D.okl
deleted file mode 100644
index 44c3502bb..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticBlockAddBCHex3D.okl
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticBlockAddBCHex3D(const dlong Nelements,
-                                     const dlong offset,
-                                     const dfloat t,
-                                     @restrict const dfloat*  x,
-                                     @restrict const dfloat*  y,
-                                     @restrict const dfloat*  z,
-                                     @restrict const int*  mapB,
-                                     @restrict dfloat*  q)
-{
-  for(dlong e = 0; e < Nelements; e++; @outer(0))
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const dlong id = n + e * p_Np;
-      for(int fld = 0; fld < p_eNfields; fld++) {
-        const int bc = mapB[id + fld * offset];
-
-        dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-
-        if(bc == 1) {
-          ellipticBoundaryConditions3D(bc,
-                                       fld,
-                                       t,
-                                       x[id],
-                                       y[id],
-                                       z[id],
-                                       nx,
-                                       ny,
-                                       nz,
-                                       0.f,
-                                       0.f,
-                                       0.f,
-                                       0.f,
-                                       uP,
-                                       dudxP,
-                                       dudyP,
-                                       dudzP);
-          q[id + fld * offset] = uP;
-        }
-      }
-    }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticBlockRhsBCHex3D.okl b/src/libP/solvers/elliptic/okl/ellipticBlockRhsBCHex3D.okl
deleted file mode 100644
index 668f06dcf..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticBlockRhsBCHex3D.okl
+++ /dev/null
@@ -1,1078 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#if 1
-void surfaceTerms(int sk,
-                  int fld,
-                  int offset,
-                  int m,
-                  int i,
-                  int j,
-                  const dfloat t,
-                  const dfloat* sgeo,
-                  const dfloat* x,
-                  const dfloat* y,
-                  const dfloat* z,
-                  const int* vmapM,
-                  const int* mapB,
-                  dfloat s_q[2][p_Nq][p_Nq],
-                  dfloat s_ndq[2][p_Nq][p_Nq])
-{
-  const dlong idM = vmapM[sk];
-
-  const dfloat nx  = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny  = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat nz  = sgeo[sk * p_Nsgeo + p_NZID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-  dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-
-  const int bc = mapB[idM + fld * offset];
-  if(bc > 0)
-    ellipticBoundaryConditions3D(bc,
-                                 fld,
-                                 t,
-                                 x[idM],
-                                 y[idM],
-                                 z[idM],
-                                 nx,
-                                 ny,
-                                 nz,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 uP,
-                                 dudxP,
-                                 dudyP,
-                                 dudzP);
-
-  s_q  [m][j][i]  = uP;
-  s_ndq[m][j][i]  = -WsJ * (nx * dudxP + ny * dudyP + nz * dudzP);
-}
-
-// AK: just one time called, very bad but not important for now
-@kernel void ellipticBlockRhsBCHex3D(const dlong Nelements,
-                                     const int fld,
-                                     const dlong offset,
-                                     @restrict const dfloat*  ggeo,
-                                     @restrict const dfloat*  sgeo,
-                                     @restrict const dfloat*  D,
-                                     @restrict const dfloat*  S,
-                                     @restrict const dfloat*  MM,
-                                     @restrict const dlong* vmapM,
-                                     @restrict const dfloat*  sMT,
-                                     @restrict const dfloat* lambda,
-                                     const dfloat t,
-                                     @restrict const dfloat*  x,
-                                     @restrict const dfloat*  y,
-                                     @restrict const dfloat*  z,
-                                     @restrict const int*  mapB,
-                                     @restrict dfloat*  rhs)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[2][p_Nq][p_Nq];
-    @shared dfloat s_ndq[2][p_Nq][p_Nq];
-
-    @exclusive dfloat r_qt, r_Gqt, r_Auk;
-    @exclusive dfloat r_q[p_Nq]; // register array to hold u(i,j,0:N) private to thread
-    @exclusive dfloat r_rhs[p_Nq];// array for results Au(i,j,0:N)
-
-    @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ;
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        s_D[j][i] = D[p_Nq * j + i];
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          r_q[k] = 0.;
-          r_rhs[k] = 0.;
-        }
-
-        const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i + j * p_Nq;
-        const dlong sk5 = e * p_Nfp * p_Nfaces + 5 * p_Nfp + i + j * p_Nq;
-
-        surfaceTerms(sk0, fld, offset, 0, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk5, fld, offset, 1, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-    }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //face 0
-        r_q[0] = s_q[0][j][i];
-        r_rhs[0] += s_ndq[0][j][i];
-
-        //face 5
-        r_q[p_Nq - 1] = s_q[1][j][i];
-        r_rhs[p_Nq - 1] += s_ndq[1][j][i];
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + i + k * p_Nq;
-        const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + i + k * p_Nq;
-
-        surfaceTerms(sk1, fld, offset, 0, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk3, fld, offset, 1, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (j == 0) {//face 1
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k]    = s_q[0][k][i];
-            r_rhs[k] += s_ndq[0][k][i];
-          }
-        }
-        if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k]    = s_q[1][k][i];
-            r_rhs[k] += s_ndq[1][k][i];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-        const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + j + k * p_Nq;
-        const dlong sk4 = e * p_Nfp * p_Nfaces + 4 * p_Nfp + j + k * p_Nq;
-
-        surfaceTerms(sk2, fld, offset, 0, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk4, fld, offset, 1, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[0][k][j];
-            r_rhs[k] += s_ndq[0][k][j];
-          }
-        }
-        if (i == 0) {//face 4
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[1][k][j];
-            r_rhs[k] += s_ndq[1][k][j];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // Layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; k++) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // prefetch geometric factors
-          const dlong gbase = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-
-          r_G00 = ggeo[gbase + p_G00ID * p_Np];
-          r_G01 = ggeo[gbase + p_G01ID * p_Np];
-          r_G02 = ggeo[gbase + p_G02ID * p_Np];
-
-          r_G11 = ggeo[gbase + p_G11ID * p_Np];
-          r_G12 = ggeo[gbase + p_G12ID * p_Np];
-          r_G22 = ggeo[gbase + p_G22ID * p_Np];
-
-          r_GwJ = ggeo[gbase + p_GWJID * p_Np];
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // share u(:,:,k)
-          s_q[0][j][i] = r_q[k];
-
-          r_qt = 0;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++)
-            r_qt += s_D[k][m] * r_q[m];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          dfloat qr = 0.f;
-          dfloat qs = 0.f;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            qr += s_D[i][m] * s_q[0][j][m];
-            qs += s_D[j][m] * s_q[0][m][i];
-          }
-
-          //reuse the s_ndq array
-          s_ndq[0][j][i] = (r_G01 * qr + r_G11 * qs + r_G12 * r_qt);
-          s_ndq[1][j][i] = (r_G00 * qr + r_G01 * qs + r_G02 * r_qt);
-
-          // put this here for a performance bump
-          r_Gqt = (r_G02 * qr + r_G12 * qs + r_G22 * r_qt);
-          r_Auk = r_GwJ * lambda[fld] * r_q[k];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            r_Auk    += s_D[m][j] * s_ndq[0][m][i];
-            r_rhs[m] += s_D[k][m] * r_Gqt;   // DT(m,k)*ut(i,j,k,e)
-            r_Auk    += s_D[m][i] * s_ndq[1][j][m];
-          }
-
-          r_rhs[k] += r_Auk;
-        }
-      }
-    }
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; k++) {
-          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          rhs[id + fld * offset] -= r_rhs[k];
-        }
-      }
-    }
-  }
-}
-
-// AK: just one time called, very bad but not important for now
-@kernel void ellipticBlockRhsVarBCHex3D(const dlong Nelements,
-                                        const int fld,
-                                        const dlong offset,
-                                        @restrict const dfloat*  ggeo,
-                                        @restrict const dfloat*  sgeo,
-                                        @restrict const dfloat*  D,
-                                        @restrict const dfloat*  S,
-                                        @restrict const dfloat*  MM,
-                                        @restrict const dlong* vmapM,
-                                        @restrict const dfloat*  sMT,
-                                        @restrict const dfloat* lambda,
-                                        const dfloat t,
-                                        @restrict const dfloat*  x,
-                                        @restrict const dfloat*  y,
-                                        @restrict const dfloat*  z,
-                                        @restrict const int*  mapB,
-                                        @restrict dfloat*  rhs)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[2][p_Nq][p_Nq];
-    @shared dfloat s_ndq[2][p_Nq][p_Nq];
-
-    @exclusive dfloat r_qt, r_Gqt, r_Auk;
-    @exclusive dfloat r_q[p_Nq]; // register array to hold u(i,j,0:N) private to thread
-    @exclusive dfloat r_rhs[p_Nq];// array for results Au(i,j,0:N)
-
-    @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ;
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        s_D[j][i] = D[p_Nq * j + i];
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          r_q[k] = 0.;
-          r_rhs[k] = 0.;
-        }
-
-        const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i + j * p_Nq;
-        const dlong sk5 = e * p_Nfp * p_Nfaces + 5 * p_Nfp + i + j * p_Nq;
-
-        surfaceTerms(sk0, fld, offset, 0, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk5, fld, offset, 1, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-    }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //face 0
-        r_q[0] = s_q[0][j][i];
-        r_rhs[0] += s_ndq[0][j][i];
-
-        //face 5
-        r_q[p_Nq - 1] = s_q[1][j][i];
-        r_rhs[p_Nq - 1] += s_ndq[1][j][i];
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + i + k * p_Nq;
-        const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + i + k * p_Nq;
-
-        surfaceTerms(sk1, fld, offset, 0, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk3, fld, offset, 1, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (j == 0) {//face 1
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k]    = s_q[0][k][i];
-            r_rhs[k] += s_ndq[0][k][i];
-          }
-        }
-        if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k]    = s_q[1][k][i];
-            r_rhs[k] += s_ndq[1][k][i];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-        const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + j + k * p_Nq;
-        const dlong sk4 = e * p_Nfp * p_Nfaces + 4 * p_Nfp + j + k * p_Nq;
-
-        surfaceTerms(sk2, fld, offset, 0, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk4, fld, offset, 1, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[0][k][j];
-            r_rhs[k] += s_ndq[0][k][j];
-          }
-        }
-        if (i == 0) {//face 4
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[1][k][j];
-            r_rhs[k] += s_ndq[1][k][j];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // Layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; k++) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // prefetch geometric factors
-          const dlong gbase = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-
-          r_G00 = ggeo[gbase + p_G00ID * p_Np];
-          r_G01 = ggeo[gbase + p_G01ID * p_Np];
-          r_G02 = ggeo[gbase + p_G02ID * p_Np];
-
-          r_G11 = ggeo[gbase + p_G11ID * p_Np];
-          r_G12 = ggeo[gbase + p_G12ID * p_Np];
-          r_G22 = ggeo[gbase + p_G22ID * p_Np];
-
-          r_GwJ = ggeo[gbase + p_GWJID * p_Np];
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // share u(:,:,k)
-          s_q[0][j][i] = r_q[k];
-
-          r_qt = 0;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++)
-            r_qt += s_D[k][m] * r_q[m];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          dfloat qr = 0.f;
-          dfloat qs = 0.f;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            qr += s_D[i][m] * s_q[0][j][m];
-            qs += s_D[j][m] * s_q[0][m][i];
-          }
-
-          const dlong base   = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          const dfloat lam_0 = lambda[base + 0 * offset + 2 * fld * offset];
-          const dfloat lam_1 = lambda[base + 1 * offset + 2 * fld * offset];
-
-          //reuse the s_ndq array
-          s_ndq[0][j][i] = lam_0 * (r_G01 * qr + r_G11 * qs + r_G12 * r_qt);
-          s_ndq[1][j][i] = lam_0 * (r_G00 * qr + r_G01 * qs + r_G02 * r_qt);
-
-          // put this here for a performance bump
-          r_Gqt = lam_0 * (r_G02 * qr + r_G12 * qs + r_G22 * r_qt);
-          r_Auk = r_GwJ * lam_1 * r_q[k];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            r_Auk    += s_D[m][j] * s_ndq[0][m][i];
-            r_rhs[m] += s_D[k][m] * r_Gqt;   // DT(m,k)*ut(i,j,k,e)
-            r_Auk    += s_D[m][i] * s_ndq[1][j][m];
-          }
-
-          r_rhs[k] += r_Auk;
-        }
-      }
-    }
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; k++) {
-          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          rhs[id + fld * offset] -= r_rhs[k];
-        }
-      }
-    }
-  }
-}
-
-#endif
-
-#if 0
-
-void surfaceTerms(int sk,
-                  int m,
-                  int i,
-                  int j,
-                  const dfloat t,
-                  const dfloat* sgeo,
-                  const dfloat* x,
-                  const dfloat* y,
-                  const dfloat* z,
-                  const int* vmapM,
-                  const int* mapB,
-                  dfloat s_q[2][p_Nq][p_Nq],
-                  dfloat s_ndq[2][p_Nq][p_Nq])
-{
-  const dlong idM = vmapM[sk];
-
-  const dfloat nx  = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny  = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat nz  = sgeo[sk * p_Nsgeo + p_NZID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-  dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-
-  const int bc = mapB[idM];
-  const int fld = 0;
-  if(bc > 0)
-    ellipticBoundaryConditions3D(bc,
-                                 fld,
-                                 t,
-                                 x[idM],
-                                 y[idM],
-                                 z[idM],
-                                 nx,
-                                 ny,
-                                 nz,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 uP,
-                                 dudxP,
-                                 dudyP,
-                                 dudzP);
-
-  s_q  [m][j][i]  = uP;
-  s_ndq[m][j][i]  = -WsJ * (nx * dudxP + ny * dudyP + nz * dudzP);
-}
-
-// AK: just one time called, very bad but not important for now
-@kernel void ellipticBlockRhsBCHex3D(const dlong Nelements,
-                                     const int fld,
-                                     const dlong offset,
-                                     @restrict const dfloat*  ggeo,
-                                     @restrict const dfloat*  sgeo,
-                                     @restrict const dfloat*  D,
-                                     @restrict const dfloat*  S,
-                                     @restrict const dfloat*  MM,
-                                     @restrict const dlong* vmapM,
-                                     @restrict const dfloat*  sMT,
-                                     @restrict const dfloat* lambda,
-                                     const dfloat t,
-                                     @restrict const dfloat*  x,
-                                     @restrict const dfloat*  y,
-                                     @restrict const dfloat*  z,
-                                     @restrict const int*  mapB,
-                                     @restrict dfloat*  rhs)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[2][p_Nq][p_Nq];
-    @shared dfloat s_ndq[2][p_Nq][p_Nq];
-
-    @exclusive dfloat r_qt, r_Gqt, r_Auk;
-    @exclusive dfloat r_q[p_Nq]; // register array to hold u(i,j,0:N) private to thread
-    @exclusive dfloat r_rhs[p_Nq];// array for results Au(i,j,0:N)
-
-    @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ;
-
-    // for all face nodes of all elements
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //load D into local memory
-        // s_D[i][j] = d \phi_i at node j
-        s_D[j][i] = D[p_Nq * j + i]; // D is column major
-
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          r_q[k] = 0.;
-          r_rhs[k] = 0.;
-        }
-
-        const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i + j * p_Nq;
-        const dlong sk5 = e * p_Nfp * p_Nfaces + 5 * p_Nfp + i + j * p_Nq;
-
-        surfaceTerms(sk0, 0, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk5, 1, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-    }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //face 0
-        r_q[0] = s_q[0][j][i];
-        r_rhs[0] += s_ndq[0][j][i];
-
-        //face 5
-        r_q[p_Nq - 1] = s_q[1][j][i];
-        r_rhs[p_Nq - 1] += s_ndq[1][j][i];
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + i + k * p_Nq;
-        const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + i + k * p_Nq;
-
-        surfaceTerms(sk1, 0, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk3, 1, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (j == 0) {//face 1
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[0][k][i]; //TW += => =
-            r_rhs[k] += s_ndq[0][k][i];
-          }
-        }
-        if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[1][k][i]; //TW += => =
-            r_rhs[k] += s_ndq[1][k][i];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-        const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + j + k * p_Nq;
-        const dlong sk4 = e * p_Nfp * p_Nfaces + 4 * p_Nfp + j + k * p_Nq;
-
-        surfaceTerms(sk2, 0, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        surfaceTerms(sk4, 1, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[0][k][j]; //TW += => =
-            r_rhs[k] += s_ndq[0][k][j];
-          }
-        }
-        if (i == 0) {//face 4
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[1][k][j]; //TW += => =
-            r_rhs[k] += s_ndq[1][k][j];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // Layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; k++) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // prefetch geometric factors
-          const dlong gbase = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-
-          r_G00 = ggeo[gbase + p_G00ID * p_Np];
-          r_G01 = ggeo[gbase + p_G01ID * p_Np];
-          r_G02 = ggeo[gbase + p_G02ID * p_Np];
-
-          r_G11 = ggeo[gbase + p_G11ID * p_Np];
-          r_G12 = ggeo[gbase + p_G12ID * p_Np];
-          r_G22 = ggeo[gbase + p_G22ID * p_Np];
-
-          r_GwJ = ggeo[gbase + p_GWJID * p_Np];
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // share u(:,:,k)
-          s_q[0][j][i] = r_q[k];
-
-          r_qt = 0;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++)
-            r_qt += s_D[k][m] * r_q[m];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          dfloat qr = 0.f;
-          dfloat qs = 0.f;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            qr += s_D[i][m] * s_q[0][j][m];
-            qs += s_D[j][m] * s_q[0][m][i];
-          }
-
-          //reuse the s_ndq array
-          s_ndq[0][j][i] = (r_G01 * qr + r_G11 * qs + r_G12 * r_qt);
-          s_ndq[1][j][i] = (r_G00 * qr + r_G01 * qs + r_G02 * r_qt);
-
-          // put this here for a performance bump
-          r_Gqt = (r_G02 * qr + r_G12 * qs + r_G22 * r_qt);
-          r_Auk = r_GwJ * r_q[k];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            r_Auk    += s_D[m][j] * s_ndq[0][m][i];
-            r_rhs[m] += s_D[k][m] * r_Gqt;   // DT(m,k)*ut(i,j,k,e)
-            r_Auk    += s_D[m][i] * s_ndq[1][j][m];
-          }
-
-          r_rhs[k] += r_Auk;
-        }
-      }
-    }
-
-    // write out
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; k++) {
-          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          rhs[id + fld * offset] -= r_rhs[k];
-        }
-      }
-    }
-  }
-}
-
-#endif
-
-#if 0
-// AK: This implemantation is restricted to small number of Nfields
-void surfaceBlockTerms(int sk,
-                       int fld,
-                       int offset,
-                       int m,
-                       int i,
-                       int j,
-                       const dfloat t,
-                       const dfloat* sgeo,
-                       const dfloat* x,
-                       const dfloat* y,
-                       const dfloat* z,
-                       const int* vmapM,
-                       const int* mapB,
-                       dfloat s_q[p_eNfields][2][p_Nq][p_Nq],
-                       dfloat s_ndq[p_eNfields][2][p_Nq][p_Nq])
-{
-  const dlong idM = vmapM[sk];
-
-  const dfloat nx  = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny  = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat nz  = sgeo[sk * p_Nsgeo + p_NZID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-  dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-  const int bc = mapB[idM + fld * offset];
-
-  if(bc > 0)
-    ellipticBoundaryConditions3D(bc,
-                                 fld,
-                                 t,
-                                 x[idM],
-                                 y[idM],
-                                 z[idM],
-                                 nx,
-                                 ny,
-                                 nz,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 uP,
-                                 dudxP,
-                                 dudyP,
-                                 dudzP);
-  s_q  [fld][m][j][i]  = uP;
-  s_ndq[fld][m][j][i]  = -WsJ * (nx * dudxP + ny * dudyP + nz * dudzP);
-}
-// AK: just one time called, very bad but not important for now
-@kernel void ellipticBlockRhsBCHex3D(const dlong Nelements,
-                                     const dlong offset,
-                                     @restrict const dfloat*  ggeo,
-                                     @restrict const dfloat*  sgeo,
-                                     @restrict const dfloat*  D,
-                                     @restrict const dfloat*  S,
-                                     @restrict const dfloat*  MM,
-                                     @restrict const dlong* vmapM,
-                                     @restrict const dfloat*  sMT,
-                                     @restrict const dfloat* lambda,
-                                     const dfloat t,
-                                     @restrict const dfloat*  x,
-                                     @restrict const dfloat*  y,
-                                     @restrict const dfloat*  z,
-                                     @restrict const int*  mapB,
-                                     @restrict dfloat*  rhs)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[p_eNfields][2][p_Nq][p_Nq];
-    @shared dfloat s_ndq[p_eNfields][2][p_Nq][p_Nq];
-
-    @exclusive dfloat r_qt[p_eNfields], r_Gqt[p_eNfields], r_Auk[p_eNfields];
-    @exclusive dfloat r_q[p_eNfields][p_Nq]; // register array to hold u(i,j,0:N) private to thread
-    @exclusive dfloat r_rhs[p_eNfields][p_Nq];// array for results Au(i,j,0:N)
-
-    @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ;
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq*p_eNfields
-        for(int k = 0; k < p_Nq * p_eNfields; ++k) {
-          r_q[0][k]   = 0.f;
-          r_rhs[0][k] = 0.f;
-        }
-      }
-    }
-
-    @barrier("local");
-
-// fill values first
-    for(int fld = 0; fld < p_eNfields; fld++) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          //load D into local memory
-          // s_D[i][j] = d \phi_i at node j
-          s_D[j][i] = D[p_Nq * j + i]; // D is column major
-
-#pragma unroll p_Nq
-          for(int k = 0; k < p_Nq; ++k) {
-            r_q[fld][k] = 0.;
-            r_rhs[fld][k] = 0.;
-          }
-
-          const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i + j * p_Nq;
-          const dlong sk5 = e * p_Nfp * p_Nfaces + 5 * p_Nfp + i + j * p_Nq;
-          surfaceBlockTerms(sk0, fld, offset, 0, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-          surfaceBlockTerms(sk5, fld, offset, 1, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        }
-      }
-
-      @barrier("local");
-
-      // face 0 & 5
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          //face 0
-          r_q[fld][0] = s_q[fld][0][j][i];
-          r_rhs[fld][0] += s_ndq[fld][0][j][i];
-
-          //face 5
-          r_q[fld][p_Nq - 1] = s_q[fld][1][j][i];
-          r_rhs[fld][p_Nq - 1] += s_ndq[fld][1][j][i];
-        }
-      @barrier("local");
-
-      // face 1 & 3
-      for(int k = 0; k < p_Nq; ++k; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + i + k * p_Nq;
-          const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + i + k * p_Nq;
-          surfaceBlockTerms(sk1, fld, offset, 0, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-          surfaceBlockTerms(sk3, fld, offset, 1, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        }
-
-      @barrier("local");
-
-      // face 1 & 3
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          if (j == 0) {//face 1
-#pragma unroll p_Nq
-            for (int k = 0; k < p_Nq; k++) {
-              r_q[fld][k]    = s_q[fld][0][k][i];  //TW += => =
-              r_rhs[fld][k] += s_ndq[fld][0][k][i];
-            }
-          }
-          if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-            for (int k = 0; k < p_Nq; k++) {
-              r_q[fld][k]    = s_q[fld][1][k][i];//TW += => =
-              r_rhs[fld][k] += s_ndq[fld][1][k][i];
-            }
-          }
-        }
-      }
-
-      @barrier("local");
-
-      // face 2 & 4
-      for(int k = 0; k < p_Nq; ++k; @inner(1))
-        for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-          const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + j + k * p_Nq;
-          const dlong sk4 = e * p_Nfp * p_Nfaces + 4 * p_Nfp + j + k * p_Nq;
-          surfaceBlockTerms(sk2, fld, offset, 0, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-          surfaceBlockTerms(sk4, fld, offset, 1, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-        }
-
-      @barrier("local");
-
-      // face 2 & 4
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-            for (int k = 0; k < p_Nq; k++) {
-              r_q[fld][k]    = s_q[fld][0][k][j];//TW += => =
-              r_rhs[fld][k] += s_ndq[fld][0][k][j];
-            }
-          }
-          if (i == 0) {//face 4
-#pragma unroll p_Nq
-            for (int k = 0; k < p_Nq; k++) {
-              r_q[fld][k]    = s_q[fld][1][k][j];//TW += => =
-              r_rhs[fld][k] += s_ndq[fld][1][k][j];
-            }
-          }
-        }
-      }
-
-// surface values are loaded
-    }
-
-    @barrier("local");
-
-    // Layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; k++) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // prefetch geometric factors
-          const dlong gbase = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-
-          r_G00 = ggeo[gbase + p_G00ID * p_Np];
-          r_G01 = ggeo[gbase + p_G01ID * p_Np];
-          r_G02 = ggeo[gbase + p_G02ID * p_Np];
-
-          r_G11 = ggeo[gbase + p_G11ID * p_Np];
-          r_G12 = ggeo[gbase + p_G12ID * p_Np];
-          r_G22 = ggeo[gbase + p_G22ID * p_Np];
-
-          r_GwJ = ggeo[gbase + p_GWJID * p_Np];
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          for(int fld = 0; fld < p_eNfields; fld++) {
-            s_q[fld][0][j][i] = r_q[fld][k];
-            r_qt[fld] = 0.f;
-          }
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++)
-            for(int fld = 0; fld < p_eNfields; fld++)
-              r_qt[fld] += s_D[k][m] * r_q[fld][m];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          for(int fld = 0; fld < p_eNfields; fld++) {
-            dfloat qr = 0.f;
-            dfloat qs = 0.f;
-
-#pragma unroll p_Nq
-            for(int m = 0; m < p_Nq; m++) {
-              qr += s_D[i][m] * s_q[fld][0][j][m];
-              qs += s_D[j][m] * s_q[fld][0][m][i];
-            }
-
-            //reuse the s_ndq array
-            s_ndq[fld][0][j][i] = (r_G01 * qr + r_G11 * qs + r_G12 * r_qt[fld]);
-            s_ndq[fld][1][j][i] = (r_G00 * qr + r_G01 * qs + r_G02 * r_qt[fld]);
-
-            // put this here for a performance bump
-            r_Gqt[fld] = (r_G02 * qr + r_G12 * qs + r_G22 * r_qt[fld]);
-            r_Auk[fld] = r_GwJ * lambda[fld] * r_q[fld][k];
-          }
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          for(int fld = 0; fld < p_eNfields; fld++) {
-#pragma unroll p_Nq
-            for(int m = 0; m < p_Nq; m++) {
-              r_Auk[fld]    += s_D[m][j] * s_ndq[fld][0][m][i];
-              r_rhs[fld][m] += s_D[k][m] * r_Gqt[fld]; // DT(m,k)*ut(i,j,k,e)
-              r_Auk[fld]    += s_D[m][i] * s_ndq[fld][1][j][m];
-            }
-            r_rhs[fld][k] += r_Auk[fld];
-          }
-        }
-      }
-    }
-
-    // write out
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        for(int fld = 0; fld < p_eNfields; fld++) {
-#pragma unroll p_Nq
-          for(int k = 0; k < p_Nq; k++) {
-            const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-            rhs[id + fld * offset] -= r_rhs[fld][k];
-          }
-        }
-      }
-    }
-  }
-}
-
-#endif
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/okl/ellipticGradientBBTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticGradientBBTri2D.okl
deleted file mode 100644
index 7fa644a22..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticGradientBBTri2D.okl
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// compute local gradients
-
-@kernel void ellipticGradientBBTri2D(const int Nelements,
-                                     @restrict const dfloat*  vgeo,
-                                     @restrict const int*  D1ids,
-                                     @restrict const int*  D2ids,
-                                     @restrict const int*  D3ids,
-                                     @restrict const dfloat*  Dvals,
-                                     @restrict const dfloat*  q,
-                                     @restrict dfloat4*  gradq)
-{
-  // block partition of elements
-  for(int eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if(e < Nelements) {
-          const int id = e * p_Np + n;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if(e < Nelements) {
-          // prefetch geometric factors (constant on triangle)
-          const dfloat drdx = vgeo[e * p_Nvgeo + p_RXID];
-          const dfloat drdy = vgeo[e * p_Nvgeo + p_RYID];
-          const dfloat dsdx = vgeo[e * p_Nvgeo + p_SXID];
-          const dfloat dsdy = vgeo[e * p_Nvgeo + p_SYID];
-
-          const int es = e - eo;
-
-          const int D1i1 = D1ids[n];
-          const int D2i1 = D2ids[n];
-          const int D3i1 = D3ids[n];
-          const dfloat Dval1 = Dvals[n];
-
-          const int D1i2 = D1ids[n + p_Np];
-          const int D2i2 = D2ids[n + p_Np];
-          const int D3i2 = D3ids[n + p_Np];
-          const dfloat Dval2 = Dvals[n + p_Np];
-
-          const int D1i3 = D1ids[n + 2 * p_Np];
-          const int D2i3 = D2ids[n + 2 * p_Np];
-          const int D3i3 = D3ids[n + 2 * p_Np];
-          const dfloat Dval3 = Dvals[n + 2 * p_Np];
-
-          const dfloat dqdr = .5f * (Dval1 * (s_q[es][D2i1] - s_q[es][D1i1]) +
-                                     Dval2 * (s_q[es][D2i2] - s_q[es][D1i2]) +
-                                     Dval3 * (s_q[es][D2i3] - s_q[es][D1i3]));
-          const dfloat dqds = .5f * (Dval1 * (s_q[es][D3i1] - s_q[es][D1i1]) +
-                                     Dval2 * (s_q[es][D3i2] - s_q[es][D1i2]) +
-                                     Dval3 * (s_q[es][D3i3] - s_q[es][D1i3]));
-
-          dfloat4 gradqn;
-          gradqn.x = drdx * dqdr + dsdx * dqds;
-          gradqn.y = drdy * dqdr + dsdy * dqds;
-          gradqn.w = s_q[es][n];
-
-          const int id = e * p_Np + n;
-          gradq[id] = gradqn;
-        }
-  }
-}
-
-@kernel void ellipticPartialGradientBBTri2D(const int Nelements,
-                                            const int offset,
-                                            @restrict const dfloat*  vgeo,
-                                            @restrict const int*  D1ids,
-                                            @restrict const int*  D2ids,
-                                            @restrict const int*  D3ids,
-                                            @restrict const dfloat*  Dvals,
-                                            @restrict const dfloat*  q,
-                                            @restrict dfloat4*  gradq)
-{
-  // block partition of elements
-  for(int eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if(e < Nelements) {
-          const int id = (e + offset) * p_Np + n;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if(e < Nelements) {
-          // prefetch geometric factors (constant on triangle)
-          const dfloat drdx = vgeo[(e + offset) * p_Nvgeo + p_RXID];
-          const dfloat drdy = vgeo[(e + offset) * p_Nvgeo + p_RYID];
-          const dfloat dsdx = vgeo[(e + offset) * p_Nvgeo + p_SXID];
-          const dfloat dsdy = vgeo[(e + offset) * p_Nvgeo + p_SYID];
-
-          const int es = e - eo;
-
-          const int D1i1 = D1ids[n];
-          const int D2i1 = D2ids[n];
-          const int D3i1 = D3ids[n];
-          const dfloat Dval1 = Dvals[n];
-
-          const int D1i2 = D1ids[n + p_Np];
-          const int D2i2 = D2ids[n + p_Np];
-          const int D3i2 = D3ids[n + p_Np];
-          const dfloat Dval2 = Dvals[n + p_Np];
-
-          const int D1i3 = D1ids[n + 2 * p_Np];
-          const int D2i3 = D2ids[n + 2 * p_Np];
-          const int D3i3 = D3ids[n + 2 * p_Np];
-          const dfloat Dval3 = Dvals[n + 2 * p_Np];
-
-          const dfloat dqdr = .5f * (Dval1 * (s_q[es][D2i1] - s_q[es][D1i1]) +
-                                     Dval2 * (s_q[es][D2i2] - s_q[es][D1i2]) +
-                                     Dval3 * (s_q[es][D2i3] - s_q[es][D1i3]));
-          const dfloat dqds = .5f * (Dval1 * (s_q[es][D3i1] - s_q[es][D1i1]) +
-                                     Dval2 * (s_q[es][D3i2] - s_q[es][D1i2]) +
-                                     Dval3 * (s_q[es][D3i3] - s_q[es][D1i3]));
-
-          dfloat4 gradqn;
-          gradqn.x = drdx * dqdr + dsdx * dqds;
-          gradqn.y = drdy * dqdr + dsdy * dqds;
-          gradqn.w = s_q[es][n];
-
-          const int id = (e + offset) * p_Np + n;
-          gradq[id] = gradqn;
-        }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticGradientQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticGradientQuad2D.okl
deleted file mode 100644
index c55d5012a..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticGradientQuad2D.okl
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// compute local gradients
-
-@kernel void ellipticGradientQuad2D(const dlong Nelements,
-                                    @restrict const dfloat*  vgeo,
-                                    @restrict const dfloat*  const D,
-                                    @restrict const dfloat*  q,
-                                    @restrict dfloat4*  gradq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[p_Nq][p_Nq];
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // prefetch D
-        s_D[j][i] = D[i + p_Nq * j];
-
-        // prefetch q
-        const dlong id = e * p_Np + j * p_Nq + i;
-        s_q[j][i] = q[id];
-      }
-
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-
-        // compute 1D derivatives
-        dfloat qr = 0, qs = 0;
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          qr += s_D[i][n] * s_q[j][n];
-          qs += s_D[j][n] * s_q[n][i];
-        }
-
-        dfloat4 gradqn;
-        gradqn.x = drdx * qr + dsdx * qs;
-        gradqn.y = drdy * qr + dsdy * qs;
-        gradqn.w = s_q[j][i];
-
-        const dlong id = e * p_Np + j * p_Nq + i;
-        gradq[id] = gradqn;
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialGradientQuad2D(const dlong Nelements,
-                                           const dlong offset,
-                                           @restrict const dfloat*  vgeo,
-                                           @restrict const dfloat*  D,
-                                           @restrict const dfloat*  q,
-                                           @restrict dfloat4*  gradq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[p_Nq][p_Nq];
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // prefetch D
-        s_D[j][i] = D[i + p_Nq * j];
-
-        // prefetch q
-        const dlong id = (e + offset) * p_Np + j * p_Nq + i;
-        s_q[j][i] = q[id];
-      }
-
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong gid = i + j * p_Nq + (e + offset) * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-
-        // compute 1D derivatives
-        dfloat qr = 0, qs = 0;
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          qr += s_D[i][n] * s_q[j][n];
-          qs += s_D[j][n] * s_q[n][i];
-        }
-
-        dfloat4 gradqn;
-        gradqn.x = drdx * qr + dsdx * qs;
-        gradqn.y = drdy * qr + dsdy * qs;
-        gradqn.w = s_q[j][i];
-
-        const dlong id = (e + offset) * p_Np + j * p_Nq + i;
-        gradq[id] = gradqn;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticGradientQuad3D.okl b/src/libP/solvers/elliptic/okl/ellipticGradientQuad3D.okl
deleted file mode 100644
index 981135128..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticGradientQuad3D.okl
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// compute local gradients
-
-@kernel void ellipticGradientQuad3D(const dlong Nelements,
-                                    @restrict const dfloat*  vgeo,
-                                    @restrict const dfloat*  const D,
-                                    @restrict const dfloat*  q,
-                                    @restrict dfloat4*  gradq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[p_Nq][p_Nq];
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // prefetch D
-        s_D[j][i] = D[i + p_Nq * j];
-
-        // prefetch q
-        const dlong id = e * p_Np + j * p_Nq + i;
-        s_q[j][i] = q[id];
-      }
-
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-        const dfloat drdz = vgeo[gid + p_RZID * p_Np];
-
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-        const dfloat dsdz = vgeo[gid + p_SZID * p_Np];
-
-        const dfloat dtdx = vgeo[gid + p_TXID * p_Np];
-        const dfloat dtdy = vgeo[gid + p_TYID * p_Np];
-        const dfloat dtdz = vgeo[gid + p_TZID * p_Np];
-
-        // compute 1D derivatives
-        dfloat qr = 0, qs = 0;
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          qr += s_D[i][n] * s_q[j][n];
-          qs += s_D[j][n] * s_q[n][i];
-        }
-
-        dfloat4 gradqn;
-#if 0
-        gradqn.x = drdx * qr + dsdx * qs + dtdx * s_q[j][i];
-        gradqn.y = drdy * qr + dsdy * qs + dtdy * s_q[j][i];
-        gradqn.z = drdz * qr + dsdz * qs + dtdz * s_q[j][i];
-#else
-        gradqn.x = drdx * qr + dsdx * qs;
-        gradqn.y = drdy * qr + dsdy * qs;
-        gradqn.z = drdz * qr + dsdz * qs;
-#endif
-        gradqn.w = s_q[j][i];
-
-        const dlong id = e * p_Np + j * p_Nq + i;
-        gradq[id] = gradqn;
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialGradientQuad3D(const dlong Nelements,
-                                           const dlong offset,
-                                           @restrict const dfloat*  vgeo,
-                                           @restrict const dfloat*  D,
-                                           @restrict const dfloat*  q,
-                                           @restrict dfloat4*  gradq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[p_Nq][p_Nq];
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // prefetch D
-        s_D[j][i] = D[i + p_Nq * j];
-
-        // prefetch q
-        const dlong id = (e + offset) * p_Np + j * p_Nq + i;
-        s_q[j][i] = q[id];
-      }
-
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-        const dfloat drdz = vgeo[gid + p_RZID * p_Np];
-
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-        const dfloat dsdz = vgeo[gid + p_SZID * p_Np];
-
-        const dfloat dtdx = vgeo[gid + p_TXID * p_Np];
-        const dfloat dtdy = vgeo[gid + p_TYID * p_Np];
-        const dfloat dtdz = vgeo[gid + p_TZID * p_Np];
-
-        // compute 1D derivatives
-        dfloat qr = 0, qs = 0;
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          qr += s_D[i][n] * s_q[j][n];
-          qs += s_D[j][n] * s_q[n][i];
-        }
-
-        dfloat4 gradqn;
-#if 0
-        gradqn.x = drdx * qr + dsdx * qs + dtdx * s_q[j][i];
-        gradqn.y = drdy * qr + dsdy * qs + dtdy * s_q[j][i];
-        gradqn.z = drdz * qr + dsdz * qs + dtdz * s_q[j][i];
-#else
-        gradqn.x = drdx * qr + dsdx * qs;
-        gradqn.y = drdy * qr + dsdy * qs;
-        gradqn.z = drdz * qr + dsdz * qs;
-#endif
-        gradqn.w = s_q[j][i];
-
-        const dlong id = (e + offset) * p_Np + j * p_Nq + i;
-        gradq[id] = gradqn;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticGradientTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticGradientTet3D.okl
deleted file mode 100644
index cf60af44b..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticGradientTet3D.okl
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticGradientTet3D(const dlong Nelements,
-                                   @restrict const dfloat*  vgeo,
-                                   @restrict const dfloat*  const Dmatrices,
-                                   @restrict const dfloat*  q,
-                                   @restrict dfloat4*  gradq)
-{
-  // block partition of elements
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if(e < Nelements) {
-          // prefetch q
-          const dlong id = e * p_Np + n;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if(e < Nelements) {
-          const int es = (int) (e - eo);
-          const dlong gid = e * p_Nvgeo;
-
-          const dfloat drdx = vgeo[gid + p_RXID];
-          const dfloat drdy = vgeo[gid + p_RYID];
-          const dfloat drdz = vgeo[gid + p_RZID];
-          const dfloat dsdx = vgeo[gid + p_SXID];
-          const dfloat dsdy = vgeo[gid + p_SYID];
-          const dfloat dsdz = vgeo[gid + p_SZID];
-          const dfloat dtdx = vgeo[gid + p_TXID];
-          const dfloat dtdy = vgeo[gid + p_TYID];
-          const dfloat dtdz = vgeo[gid + p_TZID];
-
-          // compute 1D derivatives
-          dfloat qr = 0, qs = 0, qt = 0;
-
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            qr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_q[es][i];
-            qs += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_q[es][i];
-            qt += Dmatrices[n + i * p_Np + 2 * p_Np * p_Np] * s_q[es][i];
-          }
-
-          dfloat4 gradqn;
-          gradqn.x = drdx * qr + dsdx * qs + dtdx * qt;
-          gradqn.y = drdy * qr + dsdy * qs + dtdy * qt;
-          gradqn.z = drdz * qr + dsdz * qs + dtdz * qt;
-          gradqn.w = s_q[es][n];
-
-          const dlong id = e * p_Np + n;
-          gradq[id] = gradqn;
-        }
-      }
-    }
-  }
-}
-
-@kernel void ellipticPartialGradientTet3D(const dlong Nelements,
-                                          const dlong offset,
-                                          @restrict const dfloat*  vgeo,
-                                          @restrict const dfloat*  const Dmatrices,
-                                          @restrict const dfloat*  q,
-                                          @restrict dfloat4*  gradq)
-{
-  // block partition of elements
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0))
-        if(e < Nelements) {
-          // prefetch q
-          const dlong id = (e + offset) * p_Np + n;
-          s_q[e - eo][n] = q[id];
-        }
-
-    @barrier("local");
-
-    for(dlong e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if(e < Nelements) {
-          const int es = (int) (e - eo);
-          const dlong gid = (e + offset) * p_Nvgeo;
-
-          const dfloat drdx = vgeo[gid + p_RXID];
-          const dfloat drdy = vgeo[gid + p_RYID];
-          const dfloat drdz = vgeo[gid + p_RZID];
-          const dfloat dsdx = vgeo[gid + p_SXID];
-          const dfloat dsdy = vgeo[gid + p_SYID];
-          const dfloat dsdz = vgeo[gid + p_SZID];
-          const dfloat dtdx = vgeo[gid + p_TXID];
-          const dfloat dtdy = vgeo[gid + p_TYID];
-          const dfloat dtdz = vgeo[gid + p_TZID];
-
-          // compute 1D derivatives
-          dfloat qr = 0, qs = 0, qt = 0;
-
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            qr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_q[es][i];
-            qs += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_q[es][i];
-            qt += Dmatrices[n + i * p_Np + 2 * p_Np * p_Np] * s_q[es][i];
-          }
-
-          dfloat4 gradqn;
-          gradqn.x = drdx * qr + dsdx * qs + dtdx * qt;
-          gradqn.y = drdy * qr + dsdy * qs + dtdy * qt;
-          gradqn.z = drdz * qr + dsdz * qs + dtdz * qt;
-          gradqn.w = s_q[es][n];
-
-          const dlong id = (e + offset) * p_Np + n;
-          gradq[id] = gradqn;
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticGradientTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticGradientTri2D.okl
deleted file mode 100644
index 8f93d5033..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticGradientTri2D.okl
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// compute local gradients
-
-@kernel void ellipticGradientTri2D_v0(const dlong Nelements,
-                                      @restrict const dfloat*  vgeo,
-                                      @restrict const dfloat*  const Dmatrices,
-                                      @restrict const dfloat*  q,
-                                      @restrict dfloat4*  gradq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_q[p_Np];
-
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      // prefetch q
-      const dlong id = e * p_Np + n;
-      s_q[n] = q[id];
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      const dlong gid = e * p_Nvgeo;
-
-      const dfloat drdx = vgeo[gid + p_RXID];
-      const dfloat drdy = vgeo[gid + p_RYID];
-      const dfloat dsdx = vgeo[gid + p_SXID];
-      const dfloat dsdy = vgeo[gid + p_SYID];
-
-      // compute 1D derivatives
-      dfloat qr = 0, qs = 0;
-      for(int i = 0; i < p_Np; ++i) {
-        qr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_q[i];
-        qs += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_q[i];
-      }
-
-      dfloat4 gradqn;
-      gradqn.x = drdx * qr + dsdx * qs;
-      gradqn.y = drdy * qr + dsdy * qs;
-      gradqn.w = s_q[n];
-
-      const dlong id = e * p_Np + n;
-      gradq[id] = gradqn;
-    }
-  }
-}
-
-#define drdx s_vgeo[es][p_RXID]
-#define drdy s_vgeo[es][p_RYID]
-#define dsdx s_vgeo[es][p_SXID]
-#define dsdy s_vgeo[es][p_SYID]
-
-@kernel void ellipticGradientTri2D(const int Nelements,
-                                   @restrict const dfloat*  vgeo,
-                                   @restrict const dfloat*  const Dmatrices,
-                                   @restrict const dfloat*  q,
-                                   @restrict dfloat4*  gradq)
-{
-  // block partition of elements
-  for(int eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-    @shared dfloat s_vgeo[p_NblockV][p_Nvgeo];
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if(e < Nelements) {
-          // prefetch q
-          const int id = e * p_Np + n;
-          s_q[e - eo][n] = q[id];
-        }
-
-        // coalesce volume geofac reads to @shared
-        int t = n + p_Np * (e - eo);
-        while(t < p_Nvgeo * p_NblockV) {
-          if(eo * p_Nvgeo + t < Nelements * p_Nvgeo)
-            s_vgeo[0][t] = vgeo[eo * p_Nvgeo + t];
-          t += p_NblockV * p_Np;
-        }
-      }
-
-    @barrier("local");
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if(e < Nelements) {
-          const int es = e - eo;
-
-          // compute 1D derivatives
-          dfloat qr = 0, qs = 0;
-
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            dfloat drt = Dmatrices[n + i * p_Np + 0 * p_Np * p_Np];
-            dfloat dst = Dmatrices[n + i * p_Np + 1 * p_Np * p_Np];
-            dfloat qn  = s_q[es][i];
-
-            qr += drt * qn;
-            qs += dst * qn;
-          }
-
-          dfloat4 gradqn;
-          gradqn.x = drdx * qr + dsdx * qs;
-          gradqn.y = drdy * qr + dsdy * qs;
-          gradqn.w = s_q[es][n];
-
-          const int id = e * p_Np + n;
-          gradq[id] = gradqn;
-        }
-      }
-    }
-  }
-}
-
-// // Optimized sizes for @kernel 4-5
-#if p_N == 1
-#define p_NbV 10
-#define p_Nmt 1
-#endif
-
-#if p_N == 2
-#define p_NbV 4
-#define p_Nmt 2
-#endif
-
-#if p_N == 3
-#define p_NbV 6
-#define p_Nmt 1
-#endif
-
-#if p_N == 4
-#define p_NbV 2
-#define p_Nmt 2
-#endif
-
-#if p_N == 5
-#define p_NbV 2
-#define p_Nmt 5
-#endif
-
-#if p_N == 6
-#define p_NbV 3
-#define p_Nmt 7
-#endif
-
-#if p_N == 7
-#define p_NbV 2
-#define p_Nmt 7
-#endif
-
-#if p_N == 8
-#define p_NbV 5
-#define p_Nmt 7
-#endif
-
-#if p_N == 9
-#define p_NbV 5
-#define p_Nmt 7
-#endif
-
-#if p_N == 10
-#define p_NbV 4
-#define p_Nmt 6
-#endif
-
-// map multiple nodes to thread
-@kernel void ellipticPartialGradientTri2D(const dlong Nelements,
-                                          const dlong offset,
-                                          @restrict const dfloat*  vgeo,
-                                          @restrict const dfloat*  Dmatrices,
-                                          @restrict const dfloat*  q,
-                                          @restrict dfloat4*  gradq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += (p_NbV * p_Nmt); @outer(0)) {
-    @shared dfloat s_q[p_Nmt][p_NbV][p_Np];
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em;
-          if(e < Nelements) {
-            const dlong id = n + (e + offset) * p_Np;
-            s_q[em][es][n] = q[id];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        dfloat qr[p_Nmt], qs[p_Nmt];
-
-        // hold geometric factors on register
-        dfloat drdx2[p_Nmt], dsdx2[p_Nmt];
-        dfloat drdy2[p_Nmt], dsdy2[p_Nmt];
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em + offset;
-          qr[em] = 0.f;
-          qs[em] = 0.f;
-          //
-          drdx2[em] = vgeo[p_Nvgeo * e + p_RXID];
-          drdy2[em] = vgeo[p_Nvgeo * e + p_RYID];
-          dsdx2[em] = vgeo[p_Nvgeo * e + p_SXID];
-          dsdy2[em] = vgeo[p_Nvgeo * e + p_SYID];
-        }
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          dfloat drt = Dmatrices[n + i * p_Np + 0 * p_Np * p_Np];
-          dfloat dst = Dmatrices[n + i * p_Np + 1 * p_Np * p_Np];
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em) {
-            dfloat qn = s_q[em][es][i];
-
-            qr[em] += drt * qn;
-            qs[em] += dst * qn;
-          }
-        }
-
-        dfloat4 gradqn;
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e  = eo + es * p_Nmt + em;
-          if(e < Nelements) {
-            const dlong id = (e + offset) * p_Np + n;
-            gradqn.x = drdx2[em] * qr[em] + dsdx2[em] * qs[em];
-            gradqn.y = drdy2[em] * qr[em] + dsdy2[em] * qs[em];
-            gradqn.w = s_q[em][es][n];
-
-            gradq[id] = gradqn;
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticGradientTri3D.okl b/src/libP/solvers/elliptic/okl/ellipticGradientTri3D.okl
deleted file mode 100644
index 691190e0b..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticGradientTri3D.okl
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#define drdx s_vgeo[es][p_RXID]
-#define drdy s_vgeo[es][p_RYID]
-#define drdz s_vgeo[es][p_RZID]
-#define dsdx s_vgeo[es][p_SXID]
-#define dsdy s_vgeo[es][p_SYID]
-#define dsdz s_vgeo[es][p_SZID]
-
-@kernel void ellipticGradientTri3D(const int Nelements,
-                                   @restrict const dfloat*  vgeo,
-                                   @restrict const dfloat*  const Dmatrices,
-                                   @restrict const dfloat*  q,
-                                   @restrict dfloat4*  gradq)
-{
-  // block partition of elements
-  for(int eo = 0; eo < Nelements; eo += p_NblockV; @outer(0)) {
-    @shared dfloat s_q[p_NblockV][p_Np];
-    @shared dfloat s_vgeo[p_NblockV][p_Nvgeo];
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1))
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if(e < Nelements) {
-          // prefetch q
-          const int id = e * p_Np + n;
-          s_q[e - eo][n] = q[id];
-        }
-
-        // coalesce volume geofac reads to @shared
-        int t = n + p_Np * (e - eo);
-        while(t < p_Nvgeo * p_NblockV) {
-          if(eo * p_Nvgeo + t < Nelements * p_Nvgeo)
-            s_vgeo[0][t] = vgeo[eo * p_Nvgeo + t];
-          t += p_NblockV * p_Np;
-        }
-      }
-
-    @barrier("local");
-
-    for(int e = eo; e < eo + p_NblockV; ++e; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        if(e < Nelements) {
-          const int es = e - eo;
-
-          // compute 1D derivatives
-          dfloat qr = 0, qs = 0;
-
-#pragma unroll p_Np
-          for(int i = 0; i < p_Np; ++i) {
-            dfloat drt = Dmatrices[n + i * p_Np + 0 * p_Np * p_Np];
-            dfloat dst = Dmatrices[n + i * p_Np + 1 * p_Np * p_Np];
-            dfloat qn  = s_q[es][i];
-
-            qr += drt * qn;
-            qs += dst * qn;
-          }
-
-          dfloat4 gradqn;
-          gradqn.x = drdx * qr + dsdx * qs;
-          gradqn.y = drdy * qr + dsdy * qs;
-          gradqn.z = drdz * qr + dsdz * qs;
-          gradqn.w = s_q[es][n];
-
-          const int id = e * p_Np + n;
-          gradq[id] = gradqn;
-        }
-      }
-    }
-  }
-}
-
-// // Optimized sizes for @kernel 4-5
-#if p_N == 1
-#define p_NbV 10
-#define p_Nmt 1
-#endif
-
-#if p_N == 2
-#define p_NbV 4
-#define p_Nmt 2
-#endif
-
-#if p_N == 3
-#define p_NbV 6
-#define p_Nmt 1
-#endif
-
-#if p_N == 4
-#define p_NbV 2
-#define p_Nmt 2
-#endif
-
-#if p_N == 5
-#define p_NbV 2
-#define p_Nmt 5
-#endif
-
-#if p_N == 6
-#define p_NbV 3
-#define p_Nmt 7
-#endif
-
-#if p_N == 7
-#define p_NbV 2
-#define p_Nmt 7
-#endif
-
-#if p_N == 8
-#define p_NbV 5
-#define p_Nmt 7
-#endif
-
-#if p_N == 9
-#define p_NbV 5
-#define p_Nmt 7
-#endif
-
-#if p_N == 10
-#define p_NbV 4
-#define p_Nmt 6
-#endif
-
-// map multiple nodes to thread
-@kernel void ellipticPartialGradientTri3D(const dlong Nelements,
-                                          const dlong offset,
-                                          @restrict const dfloat*  vgeo,
-                                          @restrict const dfloat*  Dmatrices,
-                                          @restrict const dfloat*  q,
-                                          @restrict dfloat4*  gradq)
-{
-  for(dlong eo = 0; eo < Nelements; eo += (p_NbV * p_Nmt); @outer(0)) {
-    @shared dfloat s_q[p_Nmt][p_NbV][p_Np];
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em;
-          if(e < Nelements) {
-            const dlong id = n + (e + offset) * p_Np;
-            s_q[em][es][n] = q[id];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NbV; ++es; @inner(1)) {
-      for(int n = 0; n < p_Np; ++n; @inner(0)) {
-        dfloat qr[p_Nmt], qs[p_Nmt];
-
-        // hold geometric factors on register
-        dfloat drdx2[p_Nmt], dsdx2[p_Nmt];
-        dfloat drdy2[p_Nmt], dsdy2[p_Nmt];
-        dfloat drdz2[p_Nmt], dsdz2[p_Nmt];
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e = eo + es * p_Nmt + em + offset;
-          qr[em] = 0.f;
-          qs[em] = 0.f;
-          //
-          drdx2[em] = vgeo[p_Nvgeo * e + p_RXID];
-          drdy2[em] = vgeo[p_Nvgeo * e + p_RYID];
-          drdz2[em] = vgeo[p_Nvgeo * e + p_RZID];
-          dsdx2[em] = vgeo[p_Nvgeo * e + p_SXID];
-          dsdy2[em] = vgeo[p_Nvgeo * e + p_SYID];
-          dsdz2[em] = vgeo[p_Nvgeo * e + p_SZID];
-        }
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          dfloat drt = Dmatrices[n + i * p_Np + 0 * p_Np * p_Np];
-          dfloat dst = Dmatrices[n + i * p_Np + 1 * p_Np * p_Np];
-
-#pragma unroll p_Nmt
-          for(int em = 0; em < p_Nmt; ++em) {
-            dfloat qn = s_q[em][es][i];
-
-            qr[em] += drt * qn;
-            qs[em] += dst * qn;
-          }
-        }
-
-        dfloat4 gradqn;
-
-#pragma unroll p_Nmt
-        for(int em = 0; em < p_Nmt; ++em) {
-          const dlong e  = eo + es * p_Nmt + em;
-          if(e < Nelements) {
-            const dlong id = (e + offset) * p_Np + n;
-            gradqn.x = drdx2[em] * qr[em] + dsdx2[em] * qs[em];
-            gradqn.y = drdy2[em] * qr[em] + dsdy2[em] * qs[em];
-            gradqn.z = drdz2[em] * qr[em] + dsdz2[em] * qs[em];
-            gradqn.w = s_q[em][es][n];
-
-            gradq[id] = gradqn;
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenQuad2D.okl
deleted file mode 100644
index 74a197156..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenQuad2D.okl
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticPreconCoarsenQuad2D(const dlong Nelements,
-                                         @restrict const dfloat*  R,
-                                         @restrict const dfloat*  qf,
-                                         @restrict dfloat*  qc)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_qff[p_NqFine][p_NqFine];
-    @shared dfloat s_qcf[p_NqCoarse][p_NqFine];
-    @shared dfloat s_qcc[p_NqCoarse][p_NqCoarse];
-
-    @shared dfloat s_R[p_NqCoarse][p_NqFine];
-
-    // prefetch to @shared
-
-    for(int j = 0; j < p_NqFine; ++j; @inner(1))
-      for(int i = 0; i < p_NqFine; ++i; @inner(0)) {
-        const dlong id = i + j * p_NqFine + e * p_NpFine;
-        s_qff[j][i] = qf[id];
-
-        if (j < p_NqCoarse)
-          s_R[j][i] = R[j * p_NqFine + i];
-      }
-
-    @barrier("local");
-
-    // coarsen in j index
-    for(int j = 0; j < p_NqFine; ++j; @inner(1)) {
-      for(int i = 0; i < p_NqFine; ++i; @inner(0)) {
-        if(j < p_NqCoarse) {
-          dfloat res = 0;
-#pragma unroll p_NqFine
-          for(int m = 0; m < p_NqFine; ++m)
-            res += s_R[j][m] * s_qff[m][i];
-          s_qcf[j][i] = res;
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // coarsen in i index
-    for(int j = 0; j < p_NqFine; ++j; @inner(1)) {
-      for(int i = 0; i < p_NqFine; ++i; @inner(0)) {
-        if(j < p_NqCoarse && i < p_NqCoarse) {
-          dfloat rtmp = 0;
-#pragma unroll p_NqFine
-          for(int m = 0; m < p_NqFine; ++m)
-            rtmp += s_R[i][m] * s_qcf[j][m];
-          s_qcc[j][i] = rtmp;
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // collect writes
-    for(int j = 0; j < p_NqFine; ++j; @inner(1))
-      for(int i = 0; i < p_NqFine; ++i; @inner(0)) {
-        const int id = i + j * p_NqFine;
-        if(id < p_NpCoarse)
-          qc[id + p_NpCoarse * e] = s_qcc[0][id];
-      }
-  }
-}
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenTet3D.okl
deleted file mode 100644
index aad20c2ed..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenTet3D.okl
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticPreconCoarsenTet3D(const dlong Nelements,
-                                        @restrict const dfloat*  R,
-                                        @restrict const dfloat*  qN,
-                                        @restrict dfloat*  q1)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockVCoarse; @outer(0)) {
-    @shared dfloat s_qN[p_NblockVCoarse][p_NpFine];
-
-    for(int es = 0; es < p_NblockVCoarse; ++es; @inner(1))
-      for(int n = 0; n < p_NpCoarse; ++n; @inner(0)) {
-        dlong t = n + es * p_NpCoarse;
-
-        while(t < p_NpFine * p_NblockVCoarse) {
-          if(eo * p_NpFine + t < Nelements * p_NpFine)
-            s_qN[0][t] = qN[eo * p_NpFine + t];
-          t += p_NpCoarse * p_NblockVCoarse;
-        }
-      }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NblockVCoarse; ++es; @inner(1)) {
-      for(int n = 0; n < p_NpCoarse; ++n; @inner(0)) {
-        const dlong e = eo + es;
-        if(e < Nelements) {
-          dfloat tmp = 0;
-#pragma unroll p_NpFine
-          for(int i = 0; i < p_NpFine; ++i) {
-            tmp += R[n * p_NpFine + i] * s_qN[es][i]; // bank conflict ?
-          }
-          q1[e * p_NpCoarse + n] = tmp; // *invDegree[e*p_NpCoarse+n];
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenTri2D.okl
deleted file mode 100644
index 7d0d462fc..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticPreconCoarsenTri2D.okl
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticPreconCoarsen_v0(const dlong Nelements,
-                                      @restrict const dfloat*  invDegree,
-                                      @restrict const dfloat*  V1,
-                                      @restrict const dfloat*  qN,
-                                      @restrict dfloat*  q1)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0))
-    for(int n = 0; n < p_Nverts; ++n; @inner(0)) {
-      dfloat tmp = 0;
-      for(int i = 0; i < p_Np; ++i)
-        tmp += V1[n * p_Np + i] * qN[e * p_Np + i];
-      q1[e * p_Nverts + n] = tmp; // *invDegree[e*p_Nverts+n];
-    }
-}
-
-@kernel void ellipticPreconCoarsenTri2D(const dlong Nelements,
-                                        @restrict const dfloat*  R,
-                                        @restrict const dfloat*  qN,
-                                        @restrict dfloat*  q1)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockVCoarse; @outer(0)) {
-    @shared dfloat s_qN[p_NblockVCoarse][p_NpFine];
-
-    for(int es = 0; es < p_NblockVCoarse; ++es; @inner(1))
-      for(int n = 0; n < p_NpCoarse; ++n; @inner(0)) {
-        dlong t = n + es * p_NpCoarse;
-
-        while(t < p_NpFine * p_NblockVCoarse) {
-          if(eo * p_NpFine + t < Nelements * p_NpFine)
-            s_qN[0][t] = qN[eo * p_NpFine + t];
-          t += p_NpCoarse * p_NblockVCoarse;
-        }
-      }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NblockVCoarse; ++es; @inner(1)) {
-      for(int n = 0; n < p_NpCoarse; ++n; @inner(0)) {
-        const dlong e = eo + es;
-        if(e < Nelements) {
-          dfloat tmp = 0;
-#pragma unroll p_NpFine
-          for(int i = 0; i < p_NpFine; ++i) {
-            tmp += R[n * p_NpFine + i] * s_qN[es][i]; // bank conflict ?
-          }
-          q1[e * p_NpCoarse + n] = tmp; // *invDegree[e*p_NpCoarse+n];
-        }
-      }
-    }
-  }
-}
-
-//storing R in @shared is too much for 3D
-#if 0
-@kernel void ellipticPreconCoarsen_v1(const dlong Nelements,
-                                      @restrict const dfloat*  R,
-                                      @restrict const dfloat*  qN,
-                                      @restrict dfloat*  q1)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockVCoarse; @outer(0)) {
-    @shared dfloat s_qN[p_NblockVCoarse][p_NpFine];
-    @shared dfloat s_R[p_NpCoarse][p_NpFine];
-
-    for(int es = 0; es < p_NblockVCoarse; ++es; @inner(1))
-      for(int n = 0; n < p_NpCoarse; ++n; @inner(0)) {
-        dlong t = n + es * p_NpCoarse;
-
-        while(t < p_NpFine * p_NblockVCoarse) {
-          if(eo * p_NpFine + t < Nelements * p_NpFine)
-            s_qN[0][t] = qN[eo * p_NpFine + t];
-          t += p_NpCoarse * p_NblockVCoarse;
-        }
-
-        t = n + es * p_NpCoarse;
-
-        while(t < p_NpFine * p_NpCoarse) {
-          s_R[0][t] = R[t];
-          t += p_NpCoarse * p_NblockVCoarse;
-        }
-      }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NblockVCoarse; ++es; @inner(1)) {
-      for(int n = 0; n < p_NpCoarse; ++n; @inner(0)) {
-        const dlong e = eo + es;
-        if(e < Nelements) {
-          dfloat tmp = 0;
-#pragma unroll p_NpFine
-          for(int i = 0; i < p_NpFine; ++i) {
-            tmp += s_R[n][i] * s_qN[es][i]; // bank conflict ?
-          }
-          q1[e * p_NpCoarse + n] = tmp; // *invDegree[e*p_NpCoarse+n];
-        }
-      }
-    }
-  }
-}
-#endif
-
-#if 0
-@kernel void ellipticPreconCoarsenQuad2D(const int Nelements,
-                                         @restrict const dfloat*  R,
-                                         @restrict const dfloat*  qN,
-                                         @restrict dfloat*  q1)
-{
-  for(int e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_qNN[p_Nq][p_Nq];
-    @shared dfloat s_q1N[p_Nq1][p_Nq];
-    @shared dfloat s_q11[p_Nq1][p_Nq1];
-
-    // prefetch to @shared
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0))
-        s_qNN[j][i] = qN[i + j * p_Nq + e * p_Nq * p_Nq];
-
-    @barrier("local");
-
-    // coarsen in j index
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0))
-        if(j < 2) {
-          dfloat res = 0;
-          for(int m = 0; m < p_Nq; ++m)
-            res += R[j * p_Nq + m] * s_qN[m][i];
-          s_q1N[j][i] = res;
-        }
-
-    @barrier("local");
-
-    // coarsen in i index
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0))
-        if(j < 2 && i < 2) {
-          dfloat rtmp = 0;
-          for(int m = 0; m < p_Nq; ++m)
-            rtmp += R[i * p_Nq + m] * s_qN[j][m];
-          s_q11[j][i] = rtmp;
-        }
-
-    @barrier("local");
-
-    // collect writes
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const int id = i + j * p_Nq;
-        if(id < p_Nq1 * p_Nq1)
-          q1[id + p_Nq1 * p_Nq1 * e] = s_q11[0][id];
-      }
-
-  }
-}
-#endif
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconProlongateQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticPreconProlongateQuad2D.okl
deleted file mode 100644
index c84df0875..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticPreconProlongateQuad2D.okl
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticPreconProlongateQuad2D(const dlong Nelements,
-                                            @restrict const dfloat*  R,
-                                            @restrict const dfloat*  qc,
-                                            @restrict dfloat*  qN)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_qcf[p_NqCoarse][p_NqFine];
-    @shared dfloat s_qcc[p_NqCoarse][p_NqCoarse];
-    @shared dfloat s_R[p_NqCoarse][p_NqFine];
-
-    // prefetch to @shared
-    for(int j = 0; j < p_NqFine; ++j; @inner(1))
-      for(int i = 0; i < p_NqFine; ++i; @inner(0)) {
-        const int id = i + j * p_NqFine;
-        if(id < p_NpCoarse)
-          s_qcc[0][id] = qc[id + e * p_NpCoarse];
-        if(id < p_NqCoarse * p_NqFine)
-          s_R[0][id] = R[id];
-      }
-
-    @barrier("local");
-
-    // prolongate in i index
-    for(int j = 0; j < p_NqFine; ++j; @inner(1)) {
-      for(int i = 0; i < p_NqFine; ++i; @inner(0)) {
-        if(j < p_NqCoarse) {
-          dfloat res = 0;
-#pragma unroll p_NqCoarse
-          for(int m = 0; m < p_NqCoarse; ++m)
-            res += s_R[m][i] * s_qcc[j][m];
-          s_qcf[j][i] = res;
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // coarsen in i index
-    for(int j = 0; j < p_NqFine; ++j; @inner(1)) {
-      for(int i = 0; i < p_NqFine; ++i; @inner(0)) {
-        dfloat res = 0;
-#pragma unroll p_NqCoarse
-        for(int m = 0; m < p_NqCoarse; ++m)
-          res += s_R[m][j] * s_qcf[m][i];
-
-        const dlong id = i + j * p_NqFine + e * p_NpFine;
-        qN[id] += res;
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconProlongateTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticPreconProlongateTet3D.okl
deleted file mode 100644
index 3d1ef92e8..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticPreconProlongateTet3D.okl
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticPreconProlongateTet3D(const dlong Nelements,
-                                           @restrict const dfloat*  R,
-                                           @restrict const dfloat*  qCoarse,
-                                           @restrict dfloat*  qFine)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockVFine; @outer(0)) {
-    @shared dfloat s_qCoarse[p_NblockVFine][p_NpCoarse];
-
-    for(int es = 0; es < p_NblockVFine; ++es; @inner(1))
-      for(int n = 0; n < p_NpFine; ++n; @inner(0)) {
-        dlong t = n + es * p_NpFine;
-
-        if(t < p_NpCoarse * p_NblockVFine)
-          if((eo * p_NpCoarse + t) < Nelements * p_NpCoarse)
-            s_qCoarse[0][t] = qCoarse[eo * p_NpCoarse + t];
-      }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NblockVFine; ++es; @inner(1)) {
-      for(int n = 0; n < p_NpFine; ++n; @inner(0)) {
-        const dlong e = eo + es;
-        if(e < Nelements) {
-          //dfloat tmp = 0.;
-          dfloat tmp = qFine[e * p_NpFine + n];
-
-#pragma unroll p_NpCoarse
-          for(int i = 0; i < p_NpCoarse; ++i)
-            tmp += R[i * p_NpFine + n] * s_qCoarse[es][i];
-
-          qFine[e * p_NpFine + n] = tmp;
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticPreconProlongateTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticPreconProlongateTri2D.okl
deleted file mode 100644
index 0c6923869..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticPreconProlongateTri2D.okl
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticPreconProlongate_v0(const dlong Nelements,
-                                         @restrict const dfloat*  V1,
-                                         @restrict const dfloat*  q1,
-                                         @restrict dfloat*  qN)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0))
-    for(int n = 0; n < p_Np; ++n; @inner(0)) {
-      dfloat tmp = 0; // qN[e*p_Np+n] ;
-
-      for(int i = 0; i < p_Nverts; ++i)
-        tmp += V1[i * p_Np + n] * q1[e * p_Nverts + i];
-      qN[e * p_Np + n] = tmp;
-    }
-}
-
-@kernel void ellipticPreconProlongateTri2D(const dlong Nelements,
-                                           @restrict const dfloat*  R,
-                                           @restrict const dfloat*  qCoarse,
-                                           @restrict dfloat*  qFine)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockVFine; @outer(0)) {
-    @shared dfloat s_qCoarse[p_NblockVFine][p_NpCoarse];
-
-    for(int es = 0; es < p_NblockVFine; ++es; @inner(1))
-      for(int n = 0; n < p_NpFine; ++n; @inner(0)) {
-        dlong t = n + es * p_NpFine;
-
-        if(t < p_NpCoarse * p_NblockVFine)
-          if((eo * p_NpCoarse + t) < Nelements * p_NpCoarse)
-            s_qCoarse[0][t] = qCoarse[eo * p_NpCoarse + t];
-      }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NblockVFine; ++es; @inner(1)) {
-      for(int n = 0; n < p_NpFine; ++n; @inner(0)) {
-        const dlong e = eo + es;
-        if(e < Nelements) {
-          //dfloat tmp = 0.;
-          dfloat tmp = qFine[e * p_NpFine + n];
-
-#pragma unroll p_NpCoarse
-          for(int i = 0; i < p_NpCoarse; ++i)
-            tmp += R[i * p_NpFine + n] * s_qCoarse[es][i];
-
-          qFine[e * p_NpFine + n] = tmp;
-        }
-      }
-    }
-  }
-}
-
-//storing R in @shared is too much for 3D
-#if 0
-@kernel void ellipticPreconProlongate_v1(const dlong Nelements,
-                                         @restrict const dfloat*  R,
-                                         @restrict const dfloat*  qCoarse,
-                                         @restrict dfloat*  qFine)
-{
-  for(dlong eo = 0; eo < Nelements; eo += p_NblockVFine; @outer(0)) {
-    @shared dfloat s_qCoarse[p_NblockVFine][p_NpCoarse];
-    @shared dfloat s_R[p_NpCoarse][p_NpFine];
-
-    for(int es = 0; es < p_NblockVFine; ++es; @inner(1))
-      for(int n = 0; n < p_NpFine; ++n; @inner(0)) {
-        dlong t = n + es * p_NpFine;
-
-        if(t < p_NpCoarse * p_NblockVFine)
-          if((eo * p_NpCoarse + t) < Nelements * p_NpCoarse)
-            s_qCoarse[0][t] = qCoarse[eo * p_NpCoarse + t];
-
-        while(t < p_NpFine * p_NpCoarse) {
-          s_R[0][t] = R[t];
-          t += p_NpFine * p_NblockVFine;
-        }
-      }
-
-    @barrier("local");
-
-    for(int es = 0; es < p_NblockVFine; ++es; @inner(1)) {
-      for(int n = 0; n < p_NpFine; ++n; @inner(0)) {
-        const dlong e = eo + es;
-        if(e < Nelements) {
-          dfloat tmp = 0.;
-          //dfloat tmp = qFine[e*p_NpFine+n];
-
-#pragma unroll p_NpCoarse
-          for(int i = 0; i < p_NpCoarse; ++i)
-            tmp += s_R[i][n] * s_qCoarse[es][i];
-
-          qFine[e * p_NpFine + n] = tmp;
-        }
-      }
-    }
-  }
-}
-#endif
-
-#if 0
-@kernel void ellipticPreconProlongateQuad2D(const int Nelements,
-                                            @restrict const dfloat*  V1,
-                                            @restrict const dfloat*  q1,
-                                            @restrict dfloat*  qN)
-{
-  for(int e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_q1N[p_Nq1][p_Nq];
-    @shared dfloat s_q11[p_Nq1][p_Nq1];
-    @shared dfloat s_V1[p_Nq1][p_Nq];
-
-    // prefetch to @shared
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        int id = i + j * p_Nq;
-        if(id < p_Nq1 * p_Nq1)
-          s_q11[0][id] = q1[id + e * p_Nq1 * p_Nq1];
-        if(id < p_Nq1 * p_Nq)
-          s_V1[0][id] = V1[id];
-      }
-
-    @barrier("local");
-
-    // prolongate in i index
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0))
-        if(j < 2) {
-          dfloat res = 0;
-          for(int m = 0; m < p_Nq1; ++m)
-            res += s_V1[i][m] * s_q11[j][m];
-          s_q1N[j][i] = res;
-        }
-
-    @barrier("local");
-
-    // coarsen in i index
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat res = 0;
-        for(int m = 0; m < p_Nq1; ++m)
-          res += s_V1[j][m] * s_q1N[m][i];
-
-        qN[i + j * p_Nq + e * p_Nq * p_Nq] = res;
-      }
-  }
-}
-#endif
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCHex3D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCHex3D.okl
deleted file mode 100644
index 7da9ca2a2..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCHex3D.okl
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-void surfaceTerms(int sk,
-                  int m,
-                  int i,
-                  int j,
-                  const dfloat t,
-                  const dfloat* sgeo,
-                  const dfloat* x,
-                  const dfloat* y,
-                  const dfloat* z,
-                  const int* vmapM,
-                  const int* mapB,
-                  dfloat s_q[2][p_Nq][p_Nq],
-                  dfloat s_ndq[2][p_Nq][p_Nq])
-{
-  const dlong idM = vmapM[sk];
-
-  const dfloat nx  = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny  = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat nz  = sgeo[sk * p_Nsgeo + p_NZID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-  dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-
-  const int bc = mapB[idM];
-
-  if(bc > 0)
-    ellipticBoundaryConditions3D(bc,
-                                 t,
-                                 x[idM],
-                                 y[idM],
-                                 z[idM],
-                                 nx,
-                                 ny,
-                                 nz,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 uP,
-                                 dudxP,
-                                 dudyP,
-                                 dudzP);
-
-  s_q  [m][j][i]  = uP;
-  s_ndq[m][j][i]  = -WsJ * (nx * dudxP + ny * dudyP + nz * dudzP);
-}
-
-@kernel void ellipticRhsBCHex3D(const dlong Nelements,
-                                @restrict const dfloat*  ggeo,
-                                @restrict const dfloat*  sgeo,
-                                @restrict const dfloat*  D,
-                                @restrict const dfloat*  S,
-                                @restrict const dfloat*  MM,
-                                @restrict const dlong*  vmapM,
-                                @restrict const dfloat*  sMT,
-                                const dfloat lambda,
-                                const dfloat t,
-                                @restrict const dfloat*  x,
-                                @restrict const dfloat*  y,
-                                @restrict const dfloat*  z,
-                                @restrict const int*  mapB,
-                                @restrict dfloat*  rhs)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_D[p_Nq][p_Nq];
-    @shared dfloat s_q[2][p_Nq][p_Nq];
-    @shared dfloat s_ndq[2][p_Nq][p_Nq];
-
-    @exclusive dfloat r_qt, r_Gqt, r_Auk;
-    @exclusive dfloat r_q[p_Nq]; // register array to hold u(i,j,0:N) private to thread
-    @exclusive dfloat r_rhs[p_Nq];// array for results Au(i,j,0:N)
-
-    @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ;
-
-    // for all face nodes of all elements
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //load D into local memory
-        // s_D[i][j] = d \phi_i at node j
-        s_D[j][i] = D[p_Nq * j + i]; // D is column major
-
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          r_q[k] = 0.;
-          r_rhs[k] = 0.;
-        }
-
-        const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i + j * p_Nq;
-        const dlong sk5 = e * p_Nfp * p_Nfaces + 5 * p_Nfp + i + j * p_Nq;
-
-        //      surfaceTerms(sk0,0,i,j);
-        surfaceTerms(sk0, 0, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-
-        //        surfaceTerms(sk5,1,i,j);
-        surfaceTerms(sk5, 1, i, j, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-    }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //face 0
-        r_q[0] = s_q[0][j][i]; //TW += => =
-        r_rhs[0] += s_ndq[0][j][i];
-
-        //face 5
-        r_q[p_Nq - 1] = s_q[1][j][i];//TW += => =
-        r_rhs[p_Nq - 1] += s_ndq[1][j][i];
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + i + k * p_Nq;
-        const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + i + k * p_Nq;
-
-        //        surfaceTerms(sk1,0,i,k);
-        surfaceTerms(sk1, 0, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-
-        //surfaceTerms(sk3,1,i,k);
-        surfaceTerms(sk3, 1, i, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (j == 0) {//face 1
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[0][k][i]; //TW += => =
-            r_rhs[k] += s_ndq[0][k][i];
-          }
-        }
-        if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[1][k][i]; //TW += => =
-            r_rhs[k] += s_ndq[1][k][i];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-        const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + j + k * p_Nq;
-        const dlong sk4 = e * p_Nfp * p_Nfaces + 4 * p_Nfp + j + k * p_Nq;
-
-        //        surfaceTerms(sk2,0,j,k);
-        surfaceTerms(sk2, 0, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-
-        //        surfaceTerms(sk4,1,j,k);
-        surfaceTerms(sk4, 1, j, k, t, sgeo, x, y, z, vmapM, mapB, s_q, s_ndq);
-      }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[0][k][j]; //TW += => =
-            r_rhs[k] += s_ndq[0][k][j];
-          }
-        }
-        if (i == 0) {//face 4
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_q[k] = s_q[1][k][j]; //TW += => =
-            r_rhs[k] += s_ndq[1][k][j];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // Layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; k++) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // prefetch geometric factors
-          const dlong gbase = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-
-          r_G00 = ggeo[gbase + p_G00ID * p_Np];
-          r_G01 = ggeo[gbase + p_G01ID * p_Np];
-          r_G02 = ggeo[gbase + p_G02ID * p_Np];
-
-          r_G11 = ggeo[gbase + p_G11ID * p_Np];
-          r_G12 = ggeo[gbase + p_G12ID * p_Np];
-          r_G22 = ggeo[gbase + p_G22ID * p_Np];
-
-          r_GwJ = ggeo[gbase + p_GWJID * p_Np];
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          // share u(:,:,k)
-          s_q[0][j][i] = r_q[k];
-
-          r_qt = 0;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++)
-            r_qt += s_D[k][m] * r_q[m];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          dfloat qr = 0.f;
-          dfloat qs = 0.f;
-
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            qr += s_D[i][m] * s_q[0][j][m];
-            qs += s_D[j][m] * s_q[0][m][i];
-          }
-
-          //reuse the s_ndq array
-          s_ndq[0][j][i] = (r_G01 * qr + r_G11 * qs + r_G12 * r_qt);
-          s_ndq[1][j][i] = (r_G00 * qr + r_G01 * qs + r_G02 * r_qt);
-
-          // put this here for a performance bump
-          r_Gqt = (r_G02 * qr + r_G12 * qs + r_G22 * r_qt);
-          r_Auk = r_GwJ * lambda * r_q[k];
-        }
-      }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-          for(int m = 0; m < p_Nq; m++) {
-            r_Auk    += s_D[m][j] * s_ndq[0][m][i];
-            r_rhs[m] += s_D[k][m] * r_Gqt;   // DT(m,k)*ut(i,j,k,e)
-            r_Auk    += s_D[m][i] * s_ndq[1][j][m];
-          }
-
-          r_rhs[k] += r_Auk;
-        }
-      }
-    }
-
-    // write out
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; k++) {
-          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          rhs[id] -= r_rhs[k];
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgHex3D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgHex3D.okl
deleted file mode 100644
index 5c4610244..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgHex3D.okl
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-void surfaceTerms(int e,
-                  int sk,
-                  int face,
-                  int m,
-                  int i,
-                  int j,
-                  @global const dfloat* sgeo,
-                  @global const dfloat* x,
-                  @global const dfloat* y,
-                  @global const dfloat* z,
-                  @global const int* vmapM,
-                  @global const int* EToB,
-                  const dfloat tau,
-                  dfloat s_dqdx[2][p_Nq][p_Nq],
-                  dfloat s_dqdy[2][p_Nq][p_Nq],
-                  dfloat s_dqdz[2][p_Nq][p_Nq],
-                  dfloat s_rhs[2][p_Nq][p_Nq])
-{
-  const dfloat nx  = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny  = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat nz  = sgeo[sk * p_Nsgeo + p_NZID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-  const dfloat hinv = sgeo[sk * p_Nsgeo + p_IHID];
-
-  dfloat dqdxP = 0.f, dqdyP = 0.f, dqdzP = 0.f, qP = 0.f;
-
-  const int bc = EToB[face + p_Nfaces * e];
-  if(bc > 0) {
-    const int id = vmapM[sk];
-    ellipticBoundaryConditions3D(bc,
-                                 t,
-                                 x[id],
-                                 y[id],
-                                 z[id],
-                                 nx,
-                                 ny,
-                                 nz,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 qP,
-                                 dqdxP,
-                                 dqdyP,
-                                 dqdzP);
-  }
-
-  const dfloat dq = qP;
-
-  s_dqdx[m][j][i] = WsJ * nx * dq;
-  s_dqdy[m][j][i] = WsJ * ny * dq;
-  s_dqdz[m][j][i] = WsJ * nz * dq;
-
-  s_rhs[m][j][i] = -WsJ * (nx * dqdxP + ny * dqdyP + nz * dqdzP + tau * hinv * dq);
-}
-
-// (grad phi, grad q) + ([phi], n.{grad q}) + ({grad phi}, n[u]) + (tau[phi],[u])
-
-@kernel void ellipticRhsBCIpdgHex3D(const dlong Nelements,
-                                    @restrict const dlong*  vmapM,
-                                    const dfloat tau,
-                                    const dfloat t,
-                                    @restrict const dfloat*  x,
-                                    @restrict const dfloat*  y,
-                                    @restrict const dfloat*  z,
-                                    @restrict const dfloat*  vgeo,
-                                    @restrict const dfloat*  sgeo,
-                                    @restrict const int*  EToB,
-                                    @restrict const dfloat*  D,
-                                    @restrict const dfloat*  LIFTT,
-                                    @restrict const dfloat*  MM,
-                                    @restrict dfloat*  rhs)
-{
-#if 0
-  // assume the following are precomputed:
-  // p, px, py at SEM nodes
-  // +/- traces of p, px, py at SEM surface nodes
-
-  0 <= i,j,k,m <= N AND 0 <= e < Nelements
-
-    (phix, px) _e
-  + (phiy, py)_e
-  + (phix, nx* (p + -p -) / 2)_de
-  + (phiy, ny* (p + -p -) / 2)_de
-  - (phi -, nx* (px + +px -) / 2)_de
-  - (phi -, ny* (py + +py -) / 2)_de
-  - (phi -, tau* (p + -p -) / 2) _de
-
-  // here w is one component of the product TP quadrature weights
-    (phir, rx* (px + Fsc* nx* dp) + ry* (py + Fsc* ny* dp) + rz* (pz + Fsc* nz* dp)) )_e
-  +   (phir, sx* (px + Fsc * nx * dp) + sy * (py + Fsc * ny * dp) + sz * (pz + Fsc * nz * dp)) )_e
-  +   (phir, tx* (px + Fsc* nx* dp) + ty* (py + Fsc* ny* dp) + tz* (pz + Fsc* nz* dp)) ) _e
-    (phi -,
-    Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) / 2) _e
-
-  px = > px + Fsc * nx * dp (i.e.add pseudo - gradient at end points
-                             py = > py + Fsc * ny * dp
-                                  pz = > pz + Fsc * nz * dp
-                                       Fsc = delta * (Js / J) * (1 / w)
-                                             dp = (p + -p -) / 2;
-
-                             // simplify
-                             (phir, rx * px + ry * py + rz * pz) ) _e
-       +   (phir, sx* px + sy * py + sz * pz) )_e
-       +   (phir, tx* px + ty * py + tz * pz) )_e
-       +   (phi -,
-            Fsc* (nx * (px + +px -) + ny * (py + +py -) + nz * (pz + +pz -) - tau * (p + -p -)) /
-            2)_e
-
-#endif
-
-  for(dlong e = 0; e < Nelements; ++e; @outer(0))
-  {
-    @shared dfloat s_dqdx[2][p_Nq][p_Nq];
-    @shared dfloat s_dqdy[2][p_Nq][p_Nq];
-    @shared dfloat s_dqdz[2][p_Nq][p_Nq];
-    @shared dfloat s_rhs[2][p_Nq][p_Nq];
-
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dfloat r_dqdx[p_Nq], r_dqdy[p_Nq], r_dqdz[p_Nq], r_rhs[p_Nq];
-    @exclusive dfloat r_dqdt;
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          r_dqdx[k] = 0.f;
-          r_dqdy[k] = 0.f;
-          r_dqdz[k] = 0.f;
-          r_rhs [k] = 0.f;
-        }
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-    }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + j * p_Nq + i;
-        const dlong sk5 = e * p_Nfp * p_Nfaces + 5 * p_Nfp + j * p_Nq + i;
-
-        //      {surfaceTerms(sk0,0,0,i,j)}
-        surfaceTerms(e, sk0, 0, 0, i, j,
-                     sgeo, x, y, z, vmapM, EToB, tau, s_dqdx, s_dqdy, s_dqdz, s_rhs);
-
-        //        {surfaceTerms(sk5,5,1,i,j)}
-        surfaceTerms(e, sk5, 5, 1, i, j,
-                     sgeo, x, y, z, vmapM, EToB, tau, s_dqdx, s_dqdy, s_dqdz, s_rhs);
-      }
-
-    @barrier("local");
-
-    // face 0 & 5
-    for(int j = 0; j < p_Nq; ++j; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        //face 0
-        r_dqdx[0] += s_dqdx[0][j][i];
-        r_dqdy[0] += s_dqdy[0][j][i];
-        r_dqdz[0] += s_dqdz[0][j][i];
-        r_rhs [0] += s_rhs [0][j][i];
-
-        //face 5
-        r_dqdx[p_Nq - 1] += s_dqdx[1][j][i];
-        r_dqdy[p_Nq - 1] += s_dqdy[1][j][i];
-        r_dqdz[p_Nq - 1] += s_dqdz[1][j][i];
-        r_rhs [p_Nq - 1] += s_rhs [1][j][i];
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + k * p_Nq + i;
-        const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + k * p_Nq + i;
-
-        //        {surfaceTerms(sk1,1,0,i,k)}
-        surfaceTerms(e, sk1, 1, 0, i, k,
-                     sgeo, x, y, z, vmapM, EToB, tau, s_dqdx, s_dqdy, s_dqdz, s_rhs);
-
-        //        {surfaceTerms(sk3,3,1,i,k)}
-        surfaceTerms(e, sk3, 3, 1, i, k,
-                     sgeo, x, y, z, vmapM, EToB, tau, s_dqdx, s_dqdy, s_dqdz, s_rhs);
-      }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (j == 0) {//face 1
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[0][k][i];
-            r_dqdy[k] += s_dqdy[0][k][i];
-            r_dqdz[k] += s_dqdz[0][k][i];
-            r_rhs [k] += s_rhs [0][k][i];
-          }
-        }
-        if (j == p_Nq - 1) {//face 3
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[1][k][i];
-            r_dqdy[k] += s_dqdy[1][k][i];
-            r_dqdz[k] += s_dqdz[1][k][i];
-            r_rhs [k] += s_rhs [1][k][i];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int k = 0; k < p_Nq; ++k; @inner(1))
-      for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-        const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + k * p_Nq + j;
-        const dlong sk4 = e * p_Nfp * p_Nfaces + 4 * p_Nfp + k * p_Nq + j;
-
-        //        {surfaceTerms(sk2,2,0,j,k)}
-        surfaceTerms(e, sk2, 2, 0, j, k,
-                     sgeo, x, y, z, vmapM, EToB, tau, s_dqdx, s_dqdy, s_dqdz, s_rhs);
-
-        //      {surfaceTerms(sk4,4,1,j,k)}
-        surfaceTerms(e, sk4, 4, 1, j, k,
-                     sgeo, x, y, z, vmapM, EToB, tau, s_dqdx, s_dqdy, s_dqdz, s_rhs);
-      }
-
-    @barrier("local");
-
-    // face 2 & 4
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        if (i == p_Nq - 1) {//face 2
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[0][k][j];
-            r_dqdy[k] += s_dqdy[0][k][j];
-            r_dqdz[k] += s_dqdz[0][k][j];
-            r_rhs [k] += s_rhs [0][k][j];
-          }
-        }
-        if (i == 0) {//face 4
-#pragma unroll p_Nq
-          for (int k = 0; k < p_Nq; k++) {
-            r_dqdx[k] += s_dqdx[1][k][j];
-            r_dqdy[k] += s_dqdy[1][k][j];
-            r_dqdz[k] += s_dqdz[1][k][j];
-            r_rhs [k] += s_rhs [1][k][j];
-          }
-        }
-      }
-    }
-
-    @barrier("local");
-
-    //layer by layer
-#pragma unroll p_Nq
-    for(int k = 0; k < p_Nq; ++k) {
-      for(int j = 0; j < p_Nq; ++j; @inner(1))
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo;
-
-          const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-          const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-          const dfloat drdz = vgeo[gid + p_RZID * p_Np];
-
-          const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-          const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-          const dfloat dsdz = vgeo[gid + p_SZID * p_Np];
-
-          const dfloat dtdx = vgeo[gid + p_TXID * p_Np];
-          const dfloat dtdy = vgeo[gid + p_TYID * p_Np];
-          const dfloat dtdz = vgeo[gid + p_TZID * p_Np];
-
-          const dfloat dqdx = r_dqdx[k];
-          const dfloat dqdy = r_dqdy[k];
-          const dfloat dqdz = r_dqdz[k];
-
-          s_dqdx[0][j][i] = (drdx * dqdx + drdy * dqdy + drdz * dqdz);
-          s_dqdy[0][j][i] = (dsdx * dqdx + dsdy * dqdy + dsdz * dqdz);
-          r_dqdt = (dtdx * dqdx + dtdy * dqdy + dtdz * dqdz);
-        }
-
-      @barrier("local");
-
-      for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-        for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-          dfloat dr = 0, ds = 0;
-
-#pragma unroll p_Nq
-          for(int n = 0; n < p_Nq; ++n) {
-            dr += s_D[n][i] * s_dqdx[0][j][n];
-            r_rhs[n] += s_D[k][n] * r_dqdt; // DT(m,k)*ut(i,j,k,e)
-            ds += s_D[n][j] * s_dqdy[0][n][i];
-          }
-
-          r_rhs[k] += dr + ds;
-        }
-      }
-
-      @barrier("local");
-    }
-
-    for(int j = 0; j < p_Nq; ++j; @inner(1)) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-#pragma unroll p_Nq
-        for(int k = 0; k < p_Nq; ++k) {
-          const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-          rhs[id] -= r_rhs[k];
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgQuad2D.okl
deleted file mode 100644
index 7c3fa23cd..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgQuad2D.okl
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-// increments gradients of pressure with pseudo-gradients at faces and
-// increments rhs with pseudo-gradient/penalty terms at faces (need to double check scaling with tau)
-
-// sgeo stores dfloat4s with nx,ny,nz,(sJ/J)*(w1*w2*w3/(ws1*ws2))
-
-// nx,ny,nz,sJ,invJ - need WsJ
-
-void surfaceTerms(int e, int sk, int face, int i, int j,
-                  @global const dfloat* sgeo,
-                  @global const dfloat* x,
-                  @global const dfloat* y,
-                  @global const int* vmapM,
-                  @global const int* EToB,
-                  const dfloat tau,
-                  dfloat s_dqdx[p_Nq][p_Nq],
-                  dfloat s_dqdy[p_Nq][p_Nq],
-                  dfloat s_rhsq[p_Nq][p_Nq])
-{
-  const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID];
-  const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID];
-  const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-  const dfloat hinv = sgeo[sk * p_Nsgeo + p_IHID];
-
-  dfloat dqdxP, dqdyP, qP;
-  dqdxP = 0.f;
-  dqdyP = 0.f;
-  qP = 0.f;
-
-  const int bc = EToB[face + p_Nfaces * e];
-  if(bc > 0)
-    ellipticBoundaryConditions2D(bc,
-                                 t,
-                                 x[vmapM[sk]],
-                                 y[vmapM[sk]],
-                                 nx,
-                                 ny,
-                                 0.f,
-                                 0.f,
-                                 0.f,
-                                 qP,
-                                 dqdxP,
-                                 dqdyP);
-
-  dfloat dq = qP;
-
-  s_dqdx[j][i] += WsJ * nx * dq;
-  s_dqdy[j][i] += WsJ * ny * dq;
-
-  s_rhsq[j][i] -= WsJ * (nx * dqdxP + ny * dqdyP + tau * dq * hinv);
-}
-
-@kernel void ellipticRhsBCIpdgQuad2D(const dlong Nelements,
-                                     @restrict const dlong*  vmapM,
-                                     const dfloat tau,
-                                     const dfloat t,
-                                     @restrict const dfloat*  x,
-                                     @restrict const dfloat*  y,
-                                     @restrict const dfloat*  z,
-                                     @restrict const dfloat*  vgeo,
-                                     @restrict const dfloat*  sgeo,
-                                     @restrict const int*  EToB,
-                                     @restrict const dfloat*  D,
-                                     @restrict const dfloat*  LIFTT,
-                                     @restrict const dfloat*  MM,
-                                     @restrict dfloat*  rhs)
-{
-  for(int e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Nq][p_Nq];
-    @shared dfloat s_dqdy[p_Nq][p_Nq];
-    @shared dfloat s_rhsq[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        s_dqdx[j][i] = 0.f;
-        s_dqdy[j][i] = 0.f;
-        s_rhsq[j][i] = 0.f;
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-
-    @barrier("local");
-
-    // loop over faces to add pseudo-gradient
-
-    // face 0 & 2
-    for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-      const dlong sk0 = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i;
-      const dlong sk2 = e * p_Nfp * p_Nfaces + 2 * p_Nfp + i;
-
-      //      surfaceTerms(sk0,0,i,0);
-      surfaceTerms(e, sk0, 0, i, 0, sgeo, x, y, vmapM, EToB, tau,
-                   s_dqdx, s_dqdy, s_rhsq);
-
-      //      surfaceTerms(sk2,2,i,p_Nq-1);
-      surfaceTerms(e, sk2, 2, i, p_Nq - 1, sgeo, x, y, vmapM, EToB, tau,
-                   s_dqdx, s_dqdy, s_rhsq);
-    }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-      const dlong sk1 = e * p_Nfp * p_Nfaces + 1 * p_Nfp + j;
-      const dlong sk3 = e * p_Nfp * p_Nfaces + 3 * p_Nfp + j;
-
-      //      surfaceTerms(sk1,1,p_Nq-1,j);
-      surfaceTerms(e, sk1, 1, p_Nq - 1, j, sgeo, x, y, vmapM, EToB, tau,
-                   s_dqdx, s_dqdy, s_rhsq);
-
-      //      surfaceTerms(sk3,3,0,j);
-      surfaceTerms(e, sk3, 3, 0, j, sgeo, x, y, vmapM, EToB, tau,
-                   s_dqdx, s_dqdy, s_rhsq);
-    }
-
-    @barrier("local");
-
-    // prescale by geofacs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        // does need the nasty geofacs
-        const dlong gid = i + j * p_Nq + e * p_Np * p_Nvgeo;
-
-        const dfloat drdx = vgeo[gid + p_RXID * p_Np];
-        const dfloat drdy = vgeo[gid + p_RYID * p_Np];
-        const dfloat dsdx = vgeo[gid + p_SXID * p_Np];
-        const dfloat dsdy = vgeo[gid + p_SYID * p_Np];
-
-        // chain rule (need to scale by wei
-        const dfloat dqdx = s_dqdx[j][i];
-        const dfloat dqdy = s_dqdy[j][i];
-
-        s_dqdx[j][i] = (drdx * dqdx + drdy * dqdy);
-        s_dqdy[j][i] = (dsdx * dqdx + dsdy * dqdy);
-      }
-
-    @barrier("local");
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat dr = 0, ds = 0;
-
-        for(int n = 0; n < p_Nq; ++n) {
-          dr += s_D[n][i] * s_dqdx[j][n];
-          ds += s_D[n][j] * s_dqdy[n][i];
-        }
-
-        dlong id = e * p_Np + j * p_Nq + i;
-        rhs[id] -= s_rhsq[j][i]  + dr + ds;
-      }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgTet3D.okl
deleted file mode 100644
index 1d99bd2fd..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgTet3D.okl
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticRhsBCIpdgTet3D(const dlong Nelements,
-                                    @restrict const dlong*  vmapM,
-                                    const dfloat tau,
-                                    const dfloat t,
-                                    @restrict const dfloat*  x,
-                                    @restrict const dfloat*  y,
-                                    @restrict const dfloat*  z,
-                                    @restrict const dfloat*  vgeo,
-                                    @restrict const dfloat*  sgeo,
-                                    @restrict const int*  EToB,
-                                    @restrict const dfloat*  Dmatrices,
-                                    @restrict const dfloat*  LIFTT,
-                                    @restrict const dfloat*  MM,
-                                    @restrict dfloat*  rhs)
-{
-  for(int e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dqdx[p_Np];
-    @shared dfloat s_dqdy[p_Np];
-    @shared dfloat s_dqdz[p_Np];
-    @shared dfloat s_lapq[p_Np];
-    @shared dfloat s_nxdq[p_NfacesNfp];
-    @shared dfloat s_nydq[p_NfacesNfp];
-    @shared dfloat s_nzdq[p_NfacesNfp];
-    @shared dfloat s_lapflux[p_NfacesNfp];
-    @shared dfloat s_Lnxdq[p_Np];
-    @shared dfloat s_Lnydq[p_Np];
-    @shared dfloat s_Lnzdq[p_Np];
-
-    @exclusive int idM;
-    @exclusive dfloat nx, ny, nz, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_NfacesNfp) {
-        const int id  = n + e * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-
-        // find face that owns this node
-        const int face = n / p_Nfp;
-
-        dfloat dudxP = 0.f, dudyP = 0.f, dudzP = 0.f, uP = 0.f;
-
-        // load surface geofactors for this face
-        int sid = p_Nsgeo * (e * p_Nfaces + face);
-        nx = sgeo[sid + p_NXID];
-        ny = sgeo[sid + p_NYID];
-        nz = sgeo[sid + p_NZID];
-        sJ = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        const int bc = EToB[face + p_Nfaces * e];
-        if(bc > 0)
-          ellipticBoundaryConditions3D(bc, t, x[idM], y[idM], z[idM], nx, ny, nz, \
-                                       0.f, 0.f, 0.f, 0.f,               \
-                                       uP, dudxP, dudyP, dudzP);
-
-        const dfloat dq = uP;
-        const dfloat half = 1.f;
-
-        s_nxdq[n] = half * sJ * invJ * nx * dq;
-        s_nydq[n] = half * sJ * invJ * ny * dq;
-        s_nzdq[n] = half * sJ * invJ * nz * dq;
-
-        s_lapflux[n] = half * sJ * invJ * (-nx * dudxP - ny * dudyP - nz * dudzP - tau * hinv * dq);
-      }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const int gid = e * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat drdz = vgeo[gid + p_RZID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-        const dfloat dsdz = vgeo[gid + p_SZID];
-        const dfloat dtdx = vgeo[gid + p_TXID];
-        const dfloat dtdy = vgeo[gid + p_TYID];
-        const dfloat dtdz = vgeo[gid + p_TZID];
-
-        dfloat Lnxdq = 0;
-        dfloat Lnydq = 0;
-        dfloat Lnzdq = 0;
-
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i) {
-          Lnxdq += LIFTT[n + i * p_Np] * s_nxdq[i];
-          Lnydq += LIFTT[n + i * p_Np] * s_nydq[i];
-          Lnzdq += LIFTT[n + i * p_Np] * s_nzdq[i];
-        }
-
-        dfloat dqdx = Lnxdq;
-        dfloat dqdy = Lnydq;
-        dfloat dqdz = Lnzdq;
-
-        s_dqdx[n] = drdx * dqdx + drdy * dqdy + drdz * dqdz; // abuse of notation
-        s_dqdy[n] = dsdx * dqdx + dsdy * dqdy + dsdz * dqdz;
-        s_dqdz[n] = dtdx * dqdx + dtdy * dqdy + dtdz * dqdz;
-
-        s_Lnxdq[n] = Lnxdq;
-        s_Lnydq[n] = Lnydq;
-        s_Lnzdq[n] = Lnzdq;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        int id = idM % p_Np;
-        s_lapflux[n] += sJ * invJ * (nx * s_Lnxdq[id] + ny * s_Lnydq[id] + nz * s_Lnzdq[id]);
-      }
-
-      if(n < p_Np) {
-        dfloat lapr = 0, laps = 0, lapt = 0;
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          lapr += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_dqdx[i];
-          laps += Dmatrices[n + i * p_Np + 1 * p_Np * p_Np] * s_dqdy[i];
-          lapt += Dmatrices[n + i * p_Np + 2 * p_Np * p_Np] * s_dqdz[i];
-        }
-
-        s_lapq[n] = -(lapr + laps + lapt);
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lap = 0;
-
-        // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          lap += LIFTT[n + i * p_Np] * s_lapflux[i];
-
-        s_lapq[n] += lap;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[e * p_Nvgeo + p_JID];
-
-        dfloat Mlapq = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapq += MM[n + i * p_Np] * s_lapq[i];
-
-        const int id = e * p_Np + n;
-        rhs[id] -=  J * Mlapq;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgTri2D.okl
deleted file mode 100644
index 13bfadebc..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCIpdgTri2D.okl
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticRhsBCIpdgTri2D(const dlong Nelements,
-                                    @restrict const dlong*  vmapM,
-                                    const dfloat tau,
-                                    const dfloat t,
-                                    @restrict const dfloat*  x,
-                                    @restrict const dfloat*  y,
-                                    @restrict const dfloat*  z,
-                                    @restrict const dfloat*  vgeo,
-                                    @restrict const dfloat*  sgeo,
-                                    @restrict const int*  EToB,
-                                    @restrict const dfloat*  Dmatrices,
-                                    @restrict const dfloat*  LIFTT,
-                                    @restrict const dfloat*  MM,
-                                    @restrict dfloat*  rhs)
-{
-  for(int e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_dudx[p_Np];
-    @shared dfloat s_dudy[p_Np];
-    @shared dfloat s_lapu[p_Np];
-    @shared dfloat s_nxdu[p_NfacesNfp];
-    @shared dfloat s_nydu[p_NfacesNfp];
-    @shared dfloat s_lapuflux[p_NfacesNfp];
-    @shared dfloat s_Lnxdu[p_Np];
-    @shared dfloat s_Lnydu[p_Np];
-
-    @exclusive int idM;
-    @exclusive dfloat nx, ny, sJ, invJ, hinv;
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_NfacesNfp) {
-        const int id  = n + e * p_Nfaces * p_Nfp;
-        idM = vmapM[id];
-
-        const int face = n / p_Nfp;
-
-        dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-        // load surface geofactors for this face
-        int sid = p_Nsgeo * (e * p_Nfaces + face);
-        nx = sgeo[sid + p_NXID];
-        ny = sgeo[sid + p_NYID];
-        sJ = sgeo[sid + p_SJID];
-        invJ = sgeo[sid + p_IJID];
-        hinv = sgeo[sid + p_IHID];
-
-        const int bc = EToB[face + p_Nfaces * e];
-        if(bc > 0)
-          ellipticBoundaryConditions2D(bc, t, x[idM], y[idM], nx, ny, \
-                                       0.f, 0.f, 0.f,                \
-                                       uP, dudxP, dudyP);
-
-        const dfloat du = uP;
-        const dfloat half = 1.f;
-
-        s_nxdu[n] = half * sJ * invJ * nx * du;
-        s_nydu[n] = half * sJ * invJ * ny * du;
-
-        s_lapuflux[n] = half * sJ * invJ * (-nx * (dudxP) - ny * (dudyP) - tau * hinv * du);
-      }
-
-    @barrier("local");
-
-    // dqdx += LIFT*(sJ/J)*nx*dq
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const int gid = e * p_Nvgeo;
-        const dfloat drdx = vgeo[gid + p_RXID];
-        const dfloat drdy = vgeo[gid + p_RYID];
-        const dfloat dsdx = vgeo[gid + p_SXID];
-        const dfloat dsdy = vgeo[gid + p_SYID];
-
-        dfloat Lnxdu = 0;
-        dfloat Lnydu = 0;
-
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i) {
-          Lnxdu += LIFTT[n + i * p_Np] * s_nxdu[i];
-          Lnydu += LIFTT[n + i * p_Np] * s_nydu[i];
-        }
-
-        dfloat dudx = Lnxdu;
-        dfloat dudy = Lnydu;
-
-        s_dudx[n] = drdx * dudx + drdy * dudy; // abuse of notation
-        s_dudy[n] = dsdx * dudx + dsdy * dudy;
-
-        s_Lnxdu[n] = Lnxdu;
-        s_Lnydu[n] = Lnydu;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_NfacesNfp) {
-        int id = idM % p_Np;
-        s_lapuflux[n] += sJ * invJ * (nx * s_Lnxdu[id] + ny * s_Lnydu[id]);
-      }
-
-      if(n < p_Np) {
-        dfloat laur = 0, laus = 0;
-
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i) {
-          laur += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_dudx[i];
-          laus += Dmatrices[n + i * p_Np + 0 * p_Np * p_Np] * s_dudy[i];
-        }
-
-        s_lapu[n] = -(laur + laus);
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        dfloat lau = 0;
-
-        // lift remaining surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          lau += LIFTT[n + i * p_Np] * s_lapuflux[i];
-
-        s_lapu[n] += lau;
-      }
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        const dfloat J = vgeo[e * p_Nvgeo + p_JID];
-
-        dfloat Mlapu = 0;
-
-        // multiply by mass matrix
-#pragma unroll p_Np
-        for(int i = 0; i < p_Np; ++i)
-          Mlapu += MM[n + i * p_Np] * s_lapu[i];
-
-        const int id = e * p_Np + n;
-        rhs[id] -=  J * Mlapu;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCQuad2D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCQuad2D.okl
deleted file mode 100644
index 416ee165a..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCQuad2D.okl
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticRhsBCQuad2D(const dlong Nelements,
-                                 @restrict const dfloat*  ggeo,
-                                 @restrict const dfloat*  sgeo,
-                                 @restrict const dfloat*  D,
-                                 @restrict const dfloat*  S,
-                                 @restrict const dfloat*  MM,
-                                 @restrict const dlong*  vmapM,
-                                 @restrict const dfloat*  sMT,
-                                 const dfloat lambda,
-                                 const dfloat t,
-                                 @restrict const dfloat*  x,
-                                 @restrict const dfloat*  y,
-                                 @restrict const dfloat*  z,
-                                 @restrict const int*  mapB,
-                                 @restrict dfloat*  rhs)
-{
-  for(dlong e = 0; e < Nelements; e++; @outer(0)) {
-    @shared dfloat s_q[p_Nq][p_Nq];
-    @shared dfloat s_ndq[p_Nq][p_Nq];
-    @shared dfloat s_D[p_Nq][p_Nq];
-
-    @exclusive dfloat r_qr[p_Nq], r_qs[p_Nq], r_Aq[p_Nq];
-    @exclusive dfloat r_G00[p_Nq], r_G01[p_Nq], r_G11[p_Nq], r_GwJ[p_Nq];
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        s_q[j][i] = 0.;
-        s_ndq[j][i] = 0.;
-
-        s_D[j][i] = D[j * p_Nq + i];
-      }
-
-    @barrier("local");
-
-    // face 0 & 2
-    for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-      {//face 0
-        const dlong sk = e * p_Nfp * p_Nfaces + 0 * p_Nfp + i;
-        const dlong idM = vmapM[sk];
-
-        const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID];
-        const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID];
-        const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-        dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-        const int bc = mapB[idM];
-        if(bc > 0)
-          ellipticBoundaryConditions2D(bc,
-                                       t,
-                                       x[idM],
-                                       y[idM],
-                                       nx,
-                                       ny,
-                                       0.f,
-                                       0.f,
-                                       0.f,
-                                       uP,
-                                       dudxP,
-                                       dudyP);
-
-        s_q  [0][i] = uP;
-        s_ndq[0][i] -= WsJ * (nx * dudxP + ny * dudyP);
-      }
-      {//face 2
-        const dlong sk = e * p_Nfp * p_Nfaces + 2 * p_Nfp + i;
-        const dlong idM = vmapM[sk];
-
-        const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID];
-        const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID];
-        const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-        dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-        const int bc = mapB[idM];
-        if(bc > 0)
-          ellipticBoundaryConditions2D(bc,
-                                       t,
-                                       x[idM],
-                                       y[idM],
-                                       nx,
-                                       ny,
-                                       0.f,
-                                       0.f,
-                                       0.f,
-                                       uP,
-                                       dudxP,
-                                       dudyP);
-
-        s_q  [p_Nq - 1][i] = uP;
-        s_ndq[p_Nq - 1][i] -= WsJ * (nx * dudxP + ny * dudyP);
-      }
-    }
-
-    @barrier("local");
-
-    // face 1 & 3
-    for(int j = 0; j < p_Nq; ++j; @inner(0)) {
-      {//face 1
-        const dlong sk = e * p_Nfp * p_Nfaces + 1 * p_Nfp + j;
-        const dlong idM = vmapM[sk];
-
-        const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID];
-        const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID];
-        const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-        dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-        const int bc = mapB[idM];
-        if(bc > 0)
-          ellipticBoundaryConditions2D(bc,
-                                       t,
-                                       x[idM],
-                                       y[idM],
-                                       nx,
-                                       ny,
-                                       0.f,
-                                       0.f,
-                                       0.f,
-                                       uP,
-                                       dudxP,
-                                       dudyP);
-
-        s_q  [j][p_Nq - 1] = uP;
-        s_ndq[j][p_Nq - 1] -= WsJ * (nx * dudxP + ny * dudyP);
-      }
-      {//face 2
-        const dlong sk = e * p_Nfp * p_Nfaces + 3 * p_Nfp + j;
-        const dlong idM = vmapM[sk];
-
-        const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID];
-        const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID];
-        const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID];
-
-        dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-        const int bc = mapB[idM];
-        if(bc > 0)
-          ellipticBoundaryConditions2D(bc,
-                                       t,
-                                       x[idM],
-                                       y[idM],
-                                       nx,
-                                       ny,
-                                       0.f,
-                                       0.f,
-                                       0.f,
-                                       uP,
-                                       dudxP,
-                                       dudyP);
-
-        s_q  [j][0] = uP;
-        s_ndq[j][0] -= WsJ * (nx * dudxP + ny * dudyP);
-      }
-    }
-
-    @barrier("local");
-
-    // loop over slabs
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        const dlong base = e * p_Nggeo * p_Np + j * p_Nq + i;
-
-        // assumes w*J built into G entries
-        r_GwJ[j] = ggeo[base + p_GWJID * p_Np];
-
-        r_G00[j] = ggeo[base + p_G00ID * p_Np];
-        r_G01[j] = ggeo[base + p_G01ID * p_Np];
-
-        r_G11[j] = ggeo[base + p_G11ID * p_Np];
-
-        dfloat qr = 0.f, qs = 0.f;
-
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n) {
-          qr += s_D[i][n] * s_q[j][n];
-          qs += s_D[j][n] * s_q[n][i];
-        }
-
-        r_qr[j] = qr;
-        r_qs[j] = qs;
-
-        r_Aq[j] = r_GwJ[j] * lambda * s_q[j][i];
-      }
-    }
-
-    // r term ----->
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0))
-        s_q[j][i] = r_G00[j] * r_qr[j] + r_G01[j] * r_qs[j];
-
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat tmp = 0.f;
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n)
-          tmp += s_D[n][i] * s_q[j][n];
-
-        r_Aq[j] += tmp;
-      }
-    }
-
-    // s term ---->
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j)
-      for(int i = 0; i < p_Nq; ++i; @inner(0))
-        s_q[j][i] = r_G01[j] * r_qr[j] + r_G11[j] * r_qs[j];
-
-    @barrier("local");
-
-    for(int j = 0; j < p_Nq; ++j) {
-      for(int i = 0; i < p_Nq; ++i; @inner(0)) {
-        dfloat tmp = 0.f;
-
-#pragma unroll p_Nq
-        for(int n = 0; n < p_Nq; ++n)
-          tmp += s_D[n][j] * s_q[n][i];
-
-        r_Aq[j] += tmp;
-
-        const dlong id = e * p_Np + j * p_Nq + i;
-        rhs[id] -= r_Aq[j] + s_ndq[j][i];
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCTet3D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCTet3D.okl
deleted file mode 100644
index 26331e71b..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCTet3D.okl
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticRhsBCTet3D(const int Nelements,
-                                @restrict const dfloat*  ggeo,
-                                @restrict const dfloat*  sgeo,
-                                @restrict const dfloat*  Dmatrices,
-                                @restrict const dfloat*  Smatrices,
-                                @restrict const dfloat*  MM,
-                                @restrict const int*  vmapM,
-                                @restrict const dfloat*  sMT,
-                                const dfloat lambda,
-                                const dfloat t,
-                                @restrict const dfloat*  x,
-                                @restrict const dfloat*  y,
-                                @restrict const dfloat*  z,
-                                @restrict const int*  mapB,
-                                @restrict dfloat*  rhs)
-{
-  for(int e = 0; e < Nelements; e++; @outer(0)) {
-    @shared dfloat s_q[p_Np];
-    @shared dfloat s_ndq[p_Nfp * p_Nfaces];
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np)
-        s_q[n] = 0.;
-      if(n < p_NfacesNfp)
-        s_ndq[n] = 0.;
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_NfacesNfp) {
-        const int id  = n + e * p_Nfaces * p_Nfp;
-        const int idM = vmapM[id];
-        const int nid = idM % p_Np;
-
-        const int face = n / p_Nfp;
-
-        dfloat dudxP = 0, dudyP = 0, dudzP = 0, uP = 0;
-
-        // load surface geofactors for this face
-        const int sid = p_Nsgeo * (e * p_Nfaces + face);
-        const dfloat nx = sgeo[sid + p_NXID];
-        const dfloat ny = sgeo[sid + p_NYID];
-        const dfloat nz = sgeo[sid + p_NZID];
-        const dfloat sJ = sgeo[sid + p_SJID];
-
-        const int bc = mapB[idM];
-        if(bc > 0)
-          ellipticBoundaryConditions3D(bc, t, x[idM], y[idM], z[idM], nx, ny, nz, \
-                                       0.f, 0.f, 0.f, 0.f,               \
-                                       uP, dudxP, dudyP, dudzP);
-
-        s_q[nid] = uP;
-        s_ndq[n] = sJ * (nx * dudxP + ny * dudyP + nz * dudzP);
-      }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        //volume Dirichlet data
-        const int id = n + e * p_Np;
-        const int gid = e * p_Nggeo;
-        const dfloat Grr = ggeo[gid + p_G00ID];
-        const dfloat Grs = ggeo[gid + p_G01ID];
-        const dfloat Grt = ggeo[gid + p_G02ID];
-        const dfloat Gss = ggeo[gid + p_G11ID];
-        const dfloat Gst = ggeo[gid + p_G12ID];
-        const dfloat Gtt = ggeo[gid + p_G22ID];
-        const dfloat J   = ggeo[gid + p_GWJID];
-
-        dfloat qrr = 0.;
-        dfloat qrs = 0.;
-        dfloat qrt = 0.;
-        dfloat qss = 0.;
-        dfloat qst = 0.;
-        dfloat qtt = 0.;
-        dfloat qM = 0.;
-
-#pragma unroll p_Np
-        for (int k = 0; k < p_Np; k++) {
-          qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * s_q[k];
-          qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * s_q[k];
-          qrt += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * s_q[k];
-          qss += Smatrices[n + k * p_Np + 3 * p_Np * p_Np] * s_q[k];
-          qst += Smatrices[n + k * p_Np + 4 * p_Np * p_Np] * s_q[k];
-          qtt += Smatrices[n + k * p_Np + 5 * p_Np * p_Np] * s_q[k];
-          qM  += MM[n + k * p_Np] * s_q[k];
-        }
-
-        dfloat Lndq = 0;
-        // surface mass * surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          Lndq += sMT[n + i * p_Np] * s_ndq[i];
-
-        rhs[id] -= Grr * qrr + Grs * qrs + Grt * qrt
-                   + Gss * qss + Gst * qst + Gtt * qtt
-                   + J * lambda * qM - Lndq;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticRhsBCTri2D.okl b/src/libP/solvers/elliptic/okl/ellipticRhsBCTri2D.okl
deleted file mode 100644
index 6afc30acd..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticRhsBCTri2D.okl
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticRhsBCTri2D(const dlong Nelements,
-                                @restrict const dfloat*  ggeo,
-                                @restrict const dfloat*  sgeo,
-                                @restrict const dfloat*  Dmatrices,
-                                @restrict const dfloat*  Smatrices,
-                                @restrict const dfloat*  MM,
-                                @restrict const dlong*  vmapM,
-                                @restrict const dfloat*  sMT,
-                                const dfloat lambda,
-                                const dfloat t,
-                                @restrict const dfloat*  x,
-                                @restrict const dfloat*  y,
-                                @restrict const dfloat*  z,
-                                @restrict const int*  mapB,
-                                @restrict dfloat*  rhs)
-{
-  for(int e = 0; e < Nelements; e++; @outer(0)) {
-    @shared dfloat s_q[p_Np];
-    @shared dfloat s_ndq[p_Nfp * p_Nfaces];
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np)
-        s_q[n] = 0.;
-      if(n < p_NfacesNfp)
-        s_ndq[n] = 0.;
-    }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0))
-      if(n < p_NfacesNfp) {
-        const int id  = n + e * p_Nfaces * p_Nfp;
-        const int idM = vmapM[id];
-        const int nid = idM % p_Np;
-
-        const int face = n / p_Nfp;
-
-        dfloat dudxP = 0, dudyP = 0, uP = 0;
-
-        // load surface geofactors for this face
-        const int sid = p_Nsgeo * (e * p_Nfaces + face);
-        const dfloat nx = sgeo[sid + p_NXID];
-        const dfloat ny = sgeo[sid + p_NYID];
-        const dfloat sJ = sgeo[sid + p_SJID];
-
-        const int bc = mapB[idM];
-        if(bc > 0)
-          ellipticBoundaryConditions2D(bc, t, x[idM], y[idM], nx, ny, \
-                                       0.f, 0.f, 0.f,                \
-                                       uP, dudxP, dudyP);
-
-        s_q[nid] = uP;
-        s_ndq[n] = sJ * (nx * dudxP + ny * dudyP);
-      }
-
-    @barrier("local");
-
-    for(int n = 0; n < p_Nmax; ++n; @inner(0)) {
-      if(n < p_Np) {
-        //volume Dirichlet data
-        const int id = n + e * p_Np;
-        const int gid = e * p_Nggeo;
-        const dfloat Grr = ggeo[gid + p_G00ID];
-        const dfloat Grs = ggeo[gid + p_G01ID];
-        const dfloat Gss = ggeo[gid + p_G11ID];
-        const dfloat J   = ggeo[gid + p_GWJID];
-
-        dfloat qrr = 0.;
-        dfloat qrs = 0.;
-        dfloat qss = 0.;
-        dfloat qM = 0.;
-
-#pragma unroll p_Np
-        for (int k = 0; k < p_Np; k++) {
-          qrr += Smatrices[n + k * p_Np + 0 * p_Np * p_Np] * s_q[k];
-          qrs += Smatrices[n + k * p_Np + 1 * p_Np * p_Np] * s_q[k];
-          qss += Smatrices[n + k * p_Np + 2 * p_Np * p_Np] * s_q[k];
-          qM  += MM[n + k * p_Np] * s_q[k];
-        }
-
-        dfloat Lndq = 0;
-        // surface mass * surface terms
-#pragma unroll p_NfacesNfp
-        for(int i = 0; i < p_NfacesNfp; ++i)
-          Lndq += sMT[n + i * p_Np] * s_ndq[i];
-
-        rhs[id] -= Grr * qrr + Grs * qrs + Gss * qss + J * lambda * qM - Lndq;
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/okl/ellipticThinOasPreconditionerHex3D.okl b/src/libP/solvers/elliptic/okl/ellipticThinOasPreconditionerHex3D.okl
deleted file mode 100644
index a879fca30..000000000
--- a/src/libP/solvers/elliptic/okl/ellipticThinOasPreconditionerHex3D.okl
+++ /dev/null
@@ -1,211 +0,0 @@
-
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-@kernel void ellipticThinOasPreconditionerHex3D(const dlong Nelements,
-                                                @restrict const dlong* oasMapP,
-                                                @restrict const dfloat* oasForward,
-                                                @restrict const dfloat* oasBack,
-                                                @restrict const dfloat* oasDiagOp, // TW: oops need diagonal
-                                                const dfloat* q,
-                                                const dfloat* Pq)
-{
-  for(dlong e = 0; e < Nelements; ++e; @outer(0)) {
-    @shared dfloat s_q[p_oasNq][p_oasNq][p_oasNq];
-    @shared dfloat s_oasF[p_oasNq][p_oasNq];
-    @shared dfloat s_oasB[p_oasNq][p_oasNq];
-
-    @exclusive dfloat r_q[p_oasNq];
-
-    for(int j = 0; j < p_oasNq; ++j)
-      for(int i = 0; i < p_oasNq; ++i)
-        for(int k = 0; k < p_oasNq; ++k)
-          s_q[k][j][i] = 0;
-
-    @barrier("local");
-
-    for(int j = 0; j < p_oasNq; ++j)
-      for(int i = 0; i < p_oasNq; ++i) {
-        if(i < p_Nq && j < p_Nq) { // fix this later
-          // populate local data
-          for(int k = 0; k < p_Nq; ++k) {
-            dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
-            s_q[k + 1][j + 1][i + 1] = q[id];
-          }
-        }
-
-        int t = i + j * p_oasNq;
-        s_oasF[j][i] = oasForward[t];
-        s_oasB[j][i] = oasBackward[t];
-      }
-
-    @barrier("local");
-
-    for(int j = 0; j < p_oasNq; ++j)
-      for(int i = 0; i < p_oasNq; ++i)
-
-        if(i < p_Nq && j < p_Nq) { // fix this later
-          dlong base = e * p_Nfaces * p_Nfp + j * p_Nq + i;
-          // face 0
-          dlong idP = oasMapP[base + 0 * p_Nfp];
-          s_q[0][j + 1][i + 1] = q[idP];
-
-          // face 1
-          idP = oasMapP[base + 1 * p_Nfp];
-          s_q[j + 1][0][i + 1] = q[idP];
-
-          // face 2
-          idP = oasMapP[base + 2 * p_Nfp];
-          s_q[j + 1][i + 1][p_oasNq - 1] = q[idP];
-
-          // face 3
-          idP = oasMapP[base + 3 * p_Nfp];
-          s_q[j + 1][p_oasNq - 1][i + 1] = q[idP];
-
-          // face 4
-          idP = oasMapP[base + 4 * p_Nfp];
-          s_q[j + 1][i + 1][0] = q[idP];
-
-          // face 5
-          idP = oasMapP[base + 5 * p_Nfp];
-          s_q[p_oasNq - 1][j + 1][i + 1] = q[idP];
-        }
-
-    @barrier("local");
-
-    // transform forward in 'r'
-    for(int k = 0; k < p_oasNq; ++k)
-      for(int j = 0; j < p_oasNq; ++j) {
-        for(int i = 0; i < p_oasNq; ++i)
-          r_q[i] = s_q[k][j][i];
-
-        for(int a = 0; a < p_oasNq; ++a) {
-          dfloat res = 0;
-
-          for(int i = 0; i < p_oasNq; ++i)
-            res += s_oasF[a][i] * r_q[i];
-
-          s_q[k][j][a] = res;
-        }
-      }
-
-    @barrier("local");
-
-    // transform forward in 's'
-    for(int k = 0; k < p_oasNq; ++k)
-      for(int a = 0; a < p_oasNq; ++a) {
-        for(int j = 0; j < p_oasNq; ++j)
-          r_q[j] = s_q[k][j][a];
-
-        for(int b = 0; b < p_oasNq; ++b) {
-          dfloat res = 0;
-
-          for(int j = 0; j < p_oasNq; ++j)
-            res += s_oasF[b][j] * r_q[j];
-
-          s_q[k][b][a] = res;
-        }
-      }
-
-
-    @barrier("local");
-
-    // transform forward in 't'
-    for(int b = 0; b < p_Nq; ++b)
-      for(int a = 0; a < p_Nq; ++a) {
-        for(int k = 0; k < p_oasNq; ++k)
-          r_q[k] = s_q[k][b][a];
-
-        for(int c = 0; c < p_oasNq; ++c) {
-          dfloat res = 0;
-
-          for(int k = 0; k < p_oasNq; ++k)
-            res += s_oasF[c][k] * r_q[k];
-
-          dlong id = e * p_oasNp + c * p_oasNq * p_oasNq + b * p_oasNq + a;
-          dfloat W = oasDiagOp[id];
-
-          s_q[c][b][a] = res * W;
-        }
-      }
-
-    @barrier("local");
-
-    // transform forward in 't'
-    for(int b = 0; b < p_Nq; ++b)
-      for(int a = 0; a < p_Nq; ++a) {
-        for(int c = 0; c < p_oasNq; ++c)
-          r_q[c] = s_q[c][b][a];
-
-        for(int k = 0; k < p_oasNq; ++k) {
-          dfloat res = 0;
-
-          for(int c = 0; c < p_oasNq; ++c)
-            res += s_oasB[k][c] * r_q[c];
-
-          s_q[k][b][a] = res;
-        }
-      }
-
-    @barrier("local");
-
-    // transform forward in 's'
-    for(int k = 0; k < p_oasNq; ++k)
-      for(int a = 0; a < p_oasNq; ++a) {
-        for(int b = 0; b < p_oasNq; ++b)
-          r_q[b] = s_q[k][b][a];
-
-        for(int j = 0; j < p_oasNq; ++j) {
-          dfloat res = 0;
-
-          for(int b = 0; b < p_oasNq; ++b)
-            res += s_oasB[j][b] * r_q[b];
-
-          s_q[k][j][a] = res;
-        }
-      }
-
-
-    @barrier("local");
-
-    // transform forward in 'r'
-    for(int k = 0; k < p_oasNq; ++k)
-      for(int j = 0; j < p_oasNq; ++j) {
-        for(int a = 0; a < p_oasNq; ++a)
-          r_q[a] = s_q[k][j][a];
-
-        for(int i = 0; i < p_oasNq; ++i) {
-          dfloat res = 0;
-
-          for(int a = 0; a < p_oasNq; ++a)
-            res += s_oasB[i][a] * r_q[a];
-
-          dlong id = e * p_oasNp + c * p_oasNq * p_oasNq + b * p_oasNq + a;
-          Pq[id] = res;
-        }
-      }
-  }
-}
diff --git a/src/libP/solvers/elliptic/setups/setupHex3D.rc b/src/libP/solvers/elliptic/setups/setupHex3D.rc
deleted file mode 100644
index e40c3863a..000000000
--- a/src/libP/solvers/elliptic/setups/setupHex3D.rc
+++ /dev/null
@@ -1,200 +0,0 @@
-[FORMAT]
-1.0
-
-[BENCHMARK]
-SOLVE
-
-[DATA FILE]
-#data /ellipticSine3D.h
-data / ellipticBlockSine3D.h
-#data /ellipticHomogeneous3D.h
-
-[BOX DOMAIN]
-#TRUE
-FALSE
-
-[BOX NX]
-2
-
-[BOX NY]
-2
-
-[BOX NZ]
-2
-
-[BOX XMIN]
-0.0
-[BOX XMAX]
-1.0
-
-[BOX YMIN]
-0.0
-[BOX YMAX]
-1.0
-
-[BOX ZMIN]
-0.0
-[BOX ZMAX]
-1.0
-
-[MESH FILE]
-#cubeHex .msh
-cubeHexCoarse.msh
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE]
-12
-
-[POLYNOMIAL DEGREE]
-7
-
-[ELEMENT MAP]
-ISOPARAMETRIC
-
-[ELLIPTIC INTEGRATION]
-NODAL
-
-[THREAD MODEL]
-SERIAL
-#CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-1
-
-[COEFFICIENT]
-#CONSTANT
-VARIABLE
-
-[NUMBER OF VECTOR FIELDS]
-2
-
-[LAMBDA0]
-1
-
-[LAMBDA1]
-1
-
-[LAMBDA2]
-1
-
-# can add FLEXIBLE to PCG
-# can add NONBLOCKING (disable FLEXIBLE for the moment)
-[KRYLOV SOLVER]
-PCG
-#PCG +FLEXIBLE+NONBLOCKING
-#+ NONBLOCKING+FLEXIBLE
-#+ FLEXIBLE
-#+ FLEXIBLE+NONBLOCKING
-
-[SOLVER TOLERANCE]
-1.e-9
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, MULTIGRID, or OAS
-# OAS does not currently work with NONBLOCKING PCG
-[PRECONDITIONER]
-#NONE
-JACOBI
-#OAS
-#MULTIGRID
-#SEMFEM
-#FULLALMOND
-
-[MULTIGRID VARIABLE COEFFICIENT]
-FALSE
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-#ALLDEGREES
-HALFDEGREES
-#HALFDOFS
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI + CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-KCYCLE
-#EXACT
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV + DAMPEDJACOBI
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-1
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
-
-# set to 0 (zero) to disable reductions
-[DEBUG ENABLE REDUCTIONS]
-1
-
-# set to 0 (zero) to disable gather-scatter
-[DEBUG ENABLE OGS]
-1
-
-[MAXIMUM ITERATIONS]
-500
-
-[FIXED ITERATION COUNT]
-FALSE
-
-[USE PRECOMPILED BINARIES]
-FALSE
-
-[PARALMOND SMOOTH COARSEST]
-FALSE
diff --git a/src/libP/solvers/elliptic/setups/setupOasHex3D.rc b/src/libP/solvers/elliptic/setups/setupOasHex3D.rc
deleted file mode 100644
index a314cc32a..000000000
--- a/src/libP/solvers/elliptic/setups/setupOasHex3D.rc
+++ /dev/null
@@ -1,151 +0,0 @@
-[FORMAT]
-1.0
-
-[BENCHMARK]
-SOLVE
-#NONE
-#BP5
-
-[DATA FILE]
-data / ellipticSineTest3D.h
-#data /ellipticHomogeneous3D.h
-
-[MESH FILE]
-#. ./../meshes/cubeHexE8Thilina.msh
-#. ./../meshes/cavityHexH025.msh
-#. ./../meshes/cavityHexH0125.msh
-#. ./../meshes/cavityHexH0075.msh
-#. ./../meshes/cubeHexE00008.msh
-#. ./../meshes/cubeHexE00064.msh
-#. ./../meshes/cubeHexE00216.msh
-#. ./../meshes/cubeHexE00512.msh
-#. ./../meshes/cubeHexE01000.msh
-#. ./../meshes/cubeHexE01728.msh
-../../ meshes / cubeHexE04096.msh
-#. ./../meshes/cubeHexE05832.msh
-#. ./../meshes/cubeHexE08000.msh
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE] # number of edges
-12
-
-[POLYNOMIAL DEGREE]
-7
-
-[ELEMENT MAP]
-ISOPARAMETRIC
-#TRILINEAR
-
-[ELLIPTIC INTEGRATION]
-NODAL
-#CUBATURE
-# CUBATURE - WORKING FOR INHOMOGENEOUS DIRICHLET BCS - NOT WORKING FOR NEUMANN YET
-
-[THREAD MODEL]
-#Serial
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-10
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-#IPDG
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, MULTIGRID, or OAS
-[PRECONDITIONER]
-OAS
-#JACOBI
-#MULTIGRID
-#SEMFEM
-#FULLALMOND
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-#ALLDEGREES
-HALFDEGREES
-#HALFDOFS
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI + CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-1
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-KCYCLE
-#EXACT
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV + DAMPEDJACOBI
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-1
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
-#FALSE
-
-# set to 0 (zero) to disable reductions
-[DEBUG ENABLE REDUCTIONS]
-1
-
-# set to 0 (zero) to disable gather-scatter
-[DEBUG ENABLE OGS]
-1
-
-[MAXIMUM ITERATIONS]
-1000
diff --git a/src/libP/solvers/elliptic/setups/setupPeriodicHex3D.rc b/src/libP/solvers/elliptic/setups/setupPeriodicHex3D.rc
deleted file mode 100644
index 1d0b80d08..000000000
--- a/src/libP/solvers/elliptic/setups/setupPeriodicHex3D.rc
+++ /dev/null
@@ -1,186 +0,0 @@
-[FORMAT]
-1.0
-
-[BENCHMARK]
-SOLVE
-
-[DATA FILE]
-data / ellipticSineTest3D.h
-#data /ellipticHomogeneous3D.h
-
-[BOX DOMAIN]
-TRUE
-
-[BOX NX]
-20
-
-[BOX NY]
-20
-
-[BOX NZ]
-20
-
-[BOX XMIN]
-- 1.0
-[BOX XMAX]
-1.0
-
-[BOX YMIN]
-- 1.0
-[BOX YMAX]
-1.0
-
-[BOX ZMIN]
-- 1.0
-[BOX ZMAX]
-1.0
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE] # number of edges
-12
-
-[POLYNOMIAL DEGREE]
-7
-
-[ELEMENT MAP]
-ISOPARAMETRIC
-#TRILINEAR
-
-[ELLIPTIC INTEGRATION]
-NODAL
-#CUBATURE
-# CUBATURE - WORKING FOR INHOMOGENEOUS DIRICHLET BCS - NOT WORKING FOR NEUMANN YET
-
-[THREAD MODEL]
-#Serial
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-0
-
-# can add FLEXIBLE to PCG
-# can add NONBLOCKING (disable FLEXIBLE for the moment)
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-#+ NONBLOCKING
-#+ NONBLOCKING+FLEXIBLE
-#+ FLEXIBLE
-#+ FLEXIBLE+NONBLOCKING
-
-[SOLVER TOLERANCE]
-1.e-7
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-#IPDG
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, MULTIGRID, or OAS
-# OAS does not currently work with NONBLOCKING PCG
-[PRECONDITIONER]
-#OAS
-#JACOBI
-MULTIGRID
-#SEMFEM
-#FULLALMOND
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-#ALLDEGREES
-HALFDEGREES
-#HALFDOFS
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI + CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-1
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-KCYCLE
-#EXACT
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV + DAMPEDJACOBI
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-1
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-# only smooth at bottom of cycle
-[PARALMOND SMOOTH COARSEST]
-TRUE
-
-[PARALMOND SMOOTH COARSEST DEGREE]
-20
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
-#FALSE
-
-# set to 0 (zero) to disable reductions
-[DEBUG ENABLE REDUCTIONS]
-1
-
-# set to 0 (zero) to disable gather-scatter
-[DEBUG ENABLE OGS]
-1
-
-[MAXIMUM ITERATIONS]
-1000
-
-[FIXED ITERATION COUNT]
-FALSE
-
-# TRUE : assume binaries are precompiled
-# FALSE : rely on occa caching
-# NONROOT : rank>0 assume rank==0 compiles binaries or occa binaries are already in cache
-[USE PRECOMPILED BINARIES]
-FALSE
diff --git a/src/libP/solvers/elliptic/setups/setupQuad2D.rc b/src/libP/solvers/elliptic/setups/setupQuad2D.rc
deleted file mode 100644
index 1bb36d904..000000000
--- a/src/libP/solvers/elliptic/setups/setupQuad2D.rc
+++ /dev/null
@@ -1,145 +0,0 @@
-[FORMAT]
-1.0
-
-[DATA FILE]
-#data /ellipticHomogeneous2D.h
-data / ellipticSine2D.h
-
-[BOX DOMAIN]
-TRUE
-
-[MESH DIMENSION]
-2
-
-[MESH FILE]
-../../ meshes / meshSine2D.msh
-
-[ELEMENT TYPE] # number of edges
-4
-
-[BOX NX]
-40
-
-[BOX NY]
-40
-
-[BOX NZ]
-40
-
-[ELEMENT TYPE] # number of edges
-4
-
-[POLYNOMIAL DEGREE]
-4
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-0
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-[ELLIPTIC INTEGRATION]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-#MULTIGRID
-JACOBI
-
-[FIXED ITERATION COUNT]
-FALSE
-
-[ELLIPTIC INTEGRATION]
-NODAL
-
-[SOLVER TOLERANCE]
-1e-8
-
-[MAXIMUM ITERATIONS]
-1000
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDOFS
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI,CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-KCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-[PARALMOND SMOOTH COARSEST]
-FALSE
-
-[PARALMOND SMOOTH COARSEST DEGREE]
-8
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
-
-[DEBUG ENABLE REDUCTIONS]
-1
-
-[DEBUG ENABLE OGS]
-1
diff --git a/src/libP/solvers/elliptic/setups/setupQuad3D.rc b/src/libP/solvers/elliptic/setups/setupQuad3D.rc
deleted file mode 100644
index ef91b1d3a..000000000
--- a/src/libP/solvers/elliptic/setups/setupQuad3D.rc
+++ /dev/null
@@ -1,111 +0,0 @@
-[FORMAT]
-1.0
-
-[DATA FILE]
-data / ellipticHomogeneous3D.h
-
-[MESH FILE]
-../../ meshes / sphereQuadH02.msh
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE] # number of edges
-4
-
-[POLYNOMIAL DEGREE]
-5
-
-[SPHERE RADIUS]
-1.0
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-100
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-CONTINUOUS
-#IPDG
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-MULTIGRID
-#FULLALMOND
-#JACOBI
-#NONE
-#SEMFEM
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDOFS
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI,CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-KCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
diff --git a/src/libP/solvers/elliptic/setups/setupTemplateScanHex3D.rc b/src/libP/solvers/elliptic/setups/setupTemplateScanHex3D.rc
deleted file mode 100644
index 515f1247c..000000000
--- a/src/libP/solvers/elliptic/setups/setupTemplateScanHex3D.rc
+++ /dev/null
@@ -1,110 +0,0 @@
-[FORMAT]
-1.0
-
-[BENCHMARK]
-BK5
-
-[DATA FILE]
-data / ellipticHomogeneous3D.h
-
-[MESH FILE]
-../../ meshes / cavityHexH006.msh,../../ meshes / cavityHexH0075.msh,
-../../ meshes / cavityHexH008.msh,../../ meshes / cavityHexH0125.msh,
-../../ meshes / cavityHexH016.msh,../../ meshes / cavityHexH01.msh,../../ meshes / cavityHexH02.msh,
-../../ meshes / cavityHexH03.msh,../../ meshes / cavityHexH04.msh
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE] # number of edges
-12
-
-[POLYNOMIAL DEGREE]
-1,2,3,4,5,6,7,8
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-0
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-MULTIGRID,JACOBI
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDEGREES
-#HALFDOFS ,HALFDEGREES,ALLDEGREES
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI + CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-VCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
diff --git a/src/libP/solvers/elliptic/setups/setupTemplateScanHex3DBK5.rc b/src/libP/solvers/elliptic/setups/setupTemplateScanHex3DBK5.rc
deleted file mode 100644
index 7786a44b8..000000000
--- a/src/libP/solvers/elliptic/setups/setupTemplateScanHex3DBK5.rc
+++ /dev/null
@@ -1,110 +0,0 @@
-[FORMAT]
-1.0
-
-[BENCHMARK]
-BK5
-
-[DATA FILE]
-data / ellipticHomogeneous3D.h
-
-[MESH FILE]
-../../ meshes / cavityHexH005.msh,../../ meshes / cavityHexH0125.msh,
-../../ meshes / cavityHexH016.msh,../../ meshes / cavityHexH01.msh,../../ meshes / cavityHexH02.msh,
-../../ meshes / cavityHexH03.msh,../../ meshes / cavityHexH04.msh
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE] # number of edges
-12
-
-[POLYNOMIAL DEGREE]
-1,2,3,4,5,6,7,8
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-0
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-JACOBI
-#MULTIGRID ,
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDEGREES
-#HALFDOFS ,HALFDEGREES,ALLDEGREES
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI + CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-VCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
diff --git a/src/libP/solvers/elliptic/setups/setupTemplateScanTet3D.rc b/src/libP/solvers/elliptic/setups/setupTemplateScanTet3D.rc
deleted file mode 100644
index ac8a1b7f8..000000000
--- a/src/libP/solvers/elliptic/setups/setupTemplateScanTet3D.rc
+++ /dev/null
@@ -1,110 +0,0 @@
-[FORMAT]
-1.0
-
-[DATA FILE]
-data / ellipticHomogeneous3D.h
-
-[MESH FILE]
-../../ meshes / cavityTetH0075.msh,../../ meshes / cavityTetH009.msh,
-../../ meshes / cavityTetH01.msh,../../ meshes / cavityTetH02.msh,../../ meshes / cavityTetH04.msh,
-../../ meshes / cavityTetH00625.msh,../../ meshes / cavityTetH008.msh,
-../../ meshes / cavityTetH0125.msh,../../ meshes / cavityTetH025.msh,
-../../ meshes / cavityTetH03.msh,../../ meshes / cavityTetH05.msh
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE] # number of edges
-6
-
-[POLYNOMIAL DEGREE]
-1,2,3,4,5,6,7,8
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-0
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-#IPDG
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-MULTIGRID,JACOBI
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDEGREES
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI + CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-#KCYCLE
-VCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-#DAMPEDJACOBI
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
diff --git a/src/libP/solvers/elliptic/setups/setupTemplateTri2D.rc b/src/libP/solvers/elliptic/setups/setupTemplateTri2D.rc
deleted file mode 100644
index b6a09562c..000000000
--- a/src/libP/solvers/elliptic/setups/setupTemplateTri2D.rc
+++ /dev/null
@@ -1,103 +0,0 @@
-[FORMAT]
-1.0
-
-[DATA FILE]
-data / ellipticHomogeneous2D.h
-
-[MESH FILE]
-../../ meshes / cavityH00125.msh
-
-[MESH DIMENSION]
-2
-
-[ELEMENT TYPE] # number of edges
-3
-
-[POLYNOMIAL DEGREE]
-1,2,3
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-10
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-MULTIGRID
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDOFS
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI,CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-KCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
diff --git a/src/libP/solvers/elliptic/setups/setupTet3D.rc b/src/libP/solvers/elliptic/setups/setupTet3D.rc
deleted file mode 100644
index 218958fc6..000000000
--- a/src/libP/solvers/elliptic/setups/setupTet3D.rc
+++ /dev/null
@@ -1,134 +0,0 @@
-[FORMAT]
-1.0
-
-[BOX DOMAIN]
-FALSE
-
-[DATA FILE]
-data / ellipticSineTest3D.h
-#data /ellipticHomogeneous3D.h
-
-[MESH FILE]
-../../ meshes / cavityTetH01.msh
-
-[MESH DIMENSION]
-3
-
-[ELEMENT TYPE] # number of edges
-6
-
-[POLYNOMIAL DEGREE]
-6
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-0
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-#IPDG
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-[ELLIPTIC INTEGRATION]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-#FULLALMOND
-#JACOBI
-MULTIGRID
-#SEMFEM
-
-[MAXIMUM ITERATIONS]
-1000
-
-[SOLVER TOLERANCE]
-1e-8
-
-[FIXED ITERATION COUNT]
-FALSE
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDEGREES
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI + CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-#KCYCLE
-VCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-#DAMPEDJACOBI
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-[PARALMOND SMOOTH COARSEST]
-FALSE
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
-
-[DEBUG ENABLE OGS]
-1
-
-[DEBUG ENABLE REDUCTIONS]
-1
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/setups/setupTri2D.rc b/src/libP/solvers/elliptic/setups/setupTri2D.rc
deleted file mode 100644
index ae0370054..000000000
--- a/src/libP/solvers/elliptic/setups/setupTri2D.rc
+++ /dev/null
@@ -1,132 +0,0 @@
-[FORMAT]
-1.0
-
-[BENCHMARK]
-SOLVE
-#NONE
-#BP5
-
-[BOX DOMAIN]
-FALSE
-
-[DATA FILE]
-data / ellipticHomogeneous2D.h
-
-[MESH FILE]
-../../ meshes / cavityH00125.msh
-
-[MESH DIMENSION]
-2
-
-[ELEMENT TYPE] # number of edges
-3
-
-[POLYNOMIAL DEGREE]
-6
-
-[THREAD MODEL]
-CUDA
-
-[PLATFORM NUMBER]
-0
-
-[DEVICE NUMBER]
-0
-
-[LAMBDA]
-0
-
-# can add FLEXIBLE to PCG
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-
-# can be IPDG, or CONTINUOUS
-[DISCRETIZATION]
-CONTINUOUS
-
-# can be NODAL or BERN
-[BASIS]
-NODAL
-
-[ELLIPTIC INTEGRATION]
-NODAL
-
-# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID
-[PRECONDITIONER]
-MULTIGRID
-
-[MAXIMUM ITERATIONS]
-1000
-
-[SOLVER TOLERANCE]
-1e-8
-
-[FIXED ITERATION COUNT]
-FALSE
-
-## ## ## ## ## MULTIGRID Options ## ## ## ## ## ## ##
-
-# can be ALLDEGREES, HALFDEGREES, HALFDOFS
-[MULTIGRID COARSENING]
-HALFDOFS
-
-# can be LOCALPATCH, or DAMPEDJACOBI
-# LOCALPATCH smoother can include EXACT
-# can include CHEBYSHEV for smoother acceleration
-[MULTIGRID SMOOTHER]
-DAMPEDJACOBI,CHEBYSHEV
-
-# can be any integer >0
-[MULTIGRID CHEBYSHEV DEGREE]
-2
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-## ## ## ## ## ParAlmond Options ## ## ## ## ## ## ##
-
-# can be KCYCLE, or VCYCLE
-# can add the EXACT and NONSYM option
-[PARALMOND CYCLE]
-KCYCLE
-
-# can be DAMPEDJACOBI or CHEBYSHEV
-[PARALMOND SMOOTHER]
-CHEBYSHEV
-
-# can be any integer >0
-[PARALMOND CHEBYSHEV DEGREE]
-2
-
-# can be STRONGNODES, DISTRIBUTED, SATURATE
-[PARALMOND PARTITION]
-STRONGNODES
-
-# can be DEFAULT or LPSCN
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-#LPSCN
-
-# can be MAX, MIN, or NONE
-[PARALMOND LPSCN ORDERING]
-MAX
-#MIN
-
-[PARALMOND SMOOTH COARSEST]
-FALSE
-
-## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #
-
-[RESTART FROM FILE]
-0
-
-[OUTPUT FILE NAME]
-cavity
-
-[VERBOSE]
-TRUE
-
-[DEBUG ENABLE OGS]
-1
-
-[DEBUG ENABLE REDUCTIONS]
-1
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildContinuous.c b/src/libP/solvers/elliptic/src/ellipticBuildContinuous.c
deleted file mode 100644
index a3811ad19..000000000
--- a/src/libP/solvers/elliptic/src/ellipticBuildContinuous.c
+++ /dev/null
@@ -1,1099 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-// compare on global indices
-int parallelCompareRowColumn(const void* a, const void* b)
-{
-  nonZero_t* fa = (nonZero_t*) a;
-  nonZero_t* fb = (nonZero_t*) b;
-
-  if(fa->row < fb->row) return -1;
-  if(fa->row > fb->row) return +1;
-
-  if(fa->col < fb->col) return -1;
-  if(fa->col > fb->col) return +1;
-
-  return 0;
-}
-
-// void ellipticBuildContinuousTri2D (elliptic_t *elliptic, dfloat lambda, nonZero_t **A, dlong *nnz, ogs_t **ogs, hlong *globalStarts);
-void ellipticBuildContinuousQuad2D(elliptic_t* elliptic,
-                                   nonZero_t** A,
-                                   dlong* nnz,
-                                   ogs_t** ogs,
-                                   hlong* globalStarts);
-// void ellipticBuildContinuousQuad3D(elliptic_t *elliptic, dfloat lambda, nonZero_t **A, dlong *nnz, ogs_t **ogs, hlong *globalStarts);
-// void ellipticBuildContinuousTet3D (elliptic_t *elliptic, dfloat lambda, nonZero_t **A, dlong *nnz, ogs_t **ogs, hlong *globalStarts);
-void ellipticBuildContinuousHex3D (elliptic_t* elliptic,
-                                   nonZero_t** A,
-                                   dlong* nnz,
-                                   ogs_t** ogs,
-                                   hlong* globalStarts);
-
-void ellipticBuildContinuous(elliptic_t* elliptic,
-                             nonZero_t** A,
-                             dlong* nnz,
-                             ogs_t** ogs,
-                             hlong* globalStarts)
-{
-  switch(elliptic->elementType) {
-  // case TRIANGLES:
-  //   ellipticBuildContinuousTri2D(elliptic, lambda, A, nnz, ogs, globalStarts); break;
-  case QUADRILATERALS: {
-    // if(elliptic->dim==2)
-    ellipticBuildContinuousQuad2D(elliptic, A, nnz, ogs, globalStarts);
-    // else
-    // ellipticBuildContinuousQuad3D(elliptic, lambda, A, nnz, ogs, globalStarts);
-    break;
-  }
-  // case TETRAHEDRA:
-  //   ellipticBuildContinuousTet3D(elliptic, lambda, A, nnz, ogs, globalStarts); break;
-  case HEXAHEDRA:
-    ellipticBuildContinuousHex3D(elliptic, A, nnz, ogs, globalStarts);
-    break;
-  }
-}
-
-void ellipticBuildContinuousQuad2D(elliptic_t* elliptic,
-                                   nonZero_t** A,
-                                   dlong* nnz,
-                                   ogs_t** ogs,
-                                   hlong* globalStarts)
-{
-  mesh_t* mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-  // currently constant coefficient case only
-  const dfloat lambda = elliptic->lambda[0];
-
-  int rank = mesh->rank;
-
-  //use the masked gs handle to define a global ordering
-
-  // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = elliptic->ogs->Ngather;
-  dlong Ntotal  = mesh->Np * mesh->Nelements;
-
-  // create a global numbering system
-  hlong* globalIds = (hlong*) calloc(Ngather,sizeof(hlong));
-  int* owner     = (int*) calloc(Ngather,sizeof(int));
-
-  // every gathered degree of freedom has its own global id
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts + 1, 1, MPI_HLONG, mesh->comm);
-  for(int r = 0; r < mesh->size; ++r)
-    globalStarts[r + 1] = globalStarts[r] + globalStarts[r + 1];
-
-  //use the offsets to set a consecutive global numbering
-  for (dlong n = 0; n < elliptic->ogs->Ngather; n++) {
-    globalIds[n] = n + globalStarts[rank];
-    owner[n] = rank;
-  }
-
-  //scatter this numbering to the original nodes
-  hlong* globalNumbering = (hlong*) calloc(Ntotal,sizeof(hlong));
-  int* globalOwners = (int*) calloc(Ntotal,sizeof(int));
-  for (dlong n = 0; n < Ntotal; n++) globalNumbering[n] = -1;
-  ogsScatter(globalNumbering, globalIds, ogsHlong, ogsAdd, elliptic->ogs);
-  ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs);
-
-  free(globalIds);
-  free(owner);
-
-  // 2. Build non-zeros of stiffness matrix (unassembled)
-  dlong nnzLocal = mesh->Np * mesh->Np * mesh->Nelements;
-  nonZero_t* sendNonZeros = (nonZero_t*) calloc(nnzLocal, sizeof(nonZero_t));
-  int* AsendCounts  = (int*) calloc(mesh->size, sizeof(int));
-  int* ArecvCounts  = (int*) calloc(mesh->size, sizeof(int));
-  int* AsendOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
-  int* ArecvOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
-
-  int* mask = (int*) calloc(mesh->Np * mesh->Nelements,sizeof(int));
-  for (dlong n = 0; n < elliptic->Nmasked; n++) mask[elliptic->maskIds[n]] = 1;
-
-  if(mesh->rank == 0) printf("Building full FEM matrix...");
-  fflush(stdout);
-
-  //Build unassembed non-zeros
-  dlong cnt = 0;
-  for (dlong e = 0; e < mesh->Nelements; e++)
-    for (int ny = 0; ny < mesh->Nq; ny++)
-      for (int nx = 0; nx < mesh->Nq; nx++) {
-        if (mask[e * mesh->Np + nx + ny * mesh->Nq]) continue; //skip masked nodes
-        for (int my = 0; my < mesh->Nq; my++)
-          for (int mx = 0; mx < mesh->Nq; mx++) {
-            if (mask[e * mesh->Np + mx + my * mesh->Nq]) continue; //skip masked nodes
-
-            int id;
-            dfloat val = 0.;
-
-            if (ny == my) {
-              for (int k = 0; k < mesh->Nq; k++) {
-                id = k + ny * mesh->Nq;
-                dfloat Grr = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G00ID * mesh->Np];
-
-                val += Grr * mesh->D[nx + k * mesh->Nq] * mesh->D[mx + k * mesh->Nq];
-              }
-            }
-
-            id = mx + ny * mesh->Nq;
-            dfloat Grs = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-            val += Grs * mesh->D[nx + mx * mesh->Nq] * mesh->D[my + ny * mesh->Nq];
-
-            id = nx + my * mesh->Nq;
-            dfloat Gsr = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-            val += Gsr * mesh->D[mx + nx * mesh->Nq] * mesh->D[ny + my * mesh->Nq];
-
-            if (nx == mx) {
-              for (int k = 0; k < mesh->Nq; k++) {
-                id = nx + k * mesh->Nq;
-                dfloat Gss = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G11ID * mesh->Np];
-
-                val += Gss * mesh->D[ny + k * mesh->Nq] * mesh->D[my + k * mesh->Nq];
-              }
-            }
-
-            if ((nx == mx) && (ny == my)) {
-              id = nx + ny * mesh->Nq;
-              dfloat JW = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + GWJID * mesh->Np];
-              val += JW * lambda;
-            }
-
-            dfloat nonZeroThreshold = 1e-7;
-            if (fabs(val) > nonZeroThreshold) {
-              // pack non-zero
-              sendNonZeros[cnt].val = val;
-              sendNonZeros[cnt].row = globalNumbering[e * mesh->Np + nx + ny * mesh->Nq];
-              sendNonZeros[cnt].col = globalNumbering[e * mesh->Np + mx + my * mesh->Nq];
-              sendNonZeros[cnt].ownerRank = globalOwners[e * mesh->Np + nx + ny * mesh->Nq];
-              cnt++;
-            }
-          }
-      }
-
-  // Make the MPI_NONZERO_T data type
-  MPI_Datatype MPI_NONZERO_T;
-  MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
-  int blength[4] = {1, 1, 1, 1};
-  MPI_Aint addr[4], displ[4];
-  MPI_Get_address ( &(sendNonZeros[0]          ), addr + 0);
-  MPI_Get_address ( &(sendNonZeros[0].col      ), addr + 1);
-  MPI_Get_address ( &(sendNonZeros[0].ownerRank), addr + 2);
-  MPI_Get_address ( &(sendNonZeros[0].val      ), addr + 3);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  MPI_Type_create_struct (4, blength, displ, dtype, &MPI_NONZERO_T);
-  MPI_Type_commit (&MPI_NONZERO_T);
-
-  // count how many non-zeros to send to each process
-  for(dlong n = 0; n < cnt; ++n)
-    AsendCounts[sendNonZeros[n].ownerRank]++;
-
-  // sort by row ordering
-  qsort(sendNonZeros, cnt, sizeof(nonZero_t), parallelCompareRowColumn);
-
-  // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh->comm);
-
-  // find send and recv offsets for gather
-  *nnz = 0;
-  for(int r = 0; r < mesh->size; ++r) {
-    AsendOffsets[r + 1] = AsendOffsets[r] + AsendCounts[r];
-    ArecvOffsets[r + 1] = ArecvOffsets[r] + ArecvCounts[r];
-    *nnz += ArecvCounts[r];
-  }
-
-  *A = (nonZero_t*) calloc(*nnz, sizeof(nonZero_t));
-
-  // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, MPI_NONZERO_T,
-                (*A), ArecvCounts, ArecvOffsets, MPI_NONZERO_T,
-                mesh->comm);
-
-  // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  qsort((*A), *nnz, sizeof(nonZero_t), parallelCompareRowColumn);
-
-  // compress duplicates
-  cnt = 0;
-  for(dlong n = 1; n < *nnz; ++n) {
-    if((*A)[n].row == (*A)[cnt].row &&
-       (*A)[n].col == (*A)[cnt].col) {
-      (*A)[cnt].val += (*A)[n].val;
-    }else {
-      ++cnt;
-      (*A)[cnt] = (*A)[n];
-    }
-  }
-  if (*nnz) cnt++;
-  *nnz = cnt;
-
-#if 1
-  // Write matlab dat for postprocess
-  char fname[BUFSIZ];
-  sprintf(fname, "Ax.dat");
-  FILE* fp;
-  fp = fopen(fname, "w");
-
-  for(dlong n = 1; n < *nnz; ++n)
-    fprintf(fp, hlongFormat " " hlongFormat " %.8e\n", (*A)[n].row + 1, (*A)[n].col + 1,
-            (*A)[n].val);
-
-  fclose(fp);
-#endif
-
-  if(mesh->rank == 0) printf("done.\n");
-
-  MPI_Barrier(mesh->comm);
-  MPI_Type_free(&MPI_NONZERO_T);
-
-  free(sendNonZeros);
-  free(globalNumbering);
-  free(globalOwners);
-
-  free(AsendCounts);
-  free(ArecvCounts);
-  free(AsendOffsets);
-  free(ArecvOffsets);
-}
-
-void ellipticBuildContinuousHex3D(elliptic_t* elliptic,
-                                  nonZero_t** A,
-                                  dlong* nnz,
-                                  ogs_t** ogs,
-                                  hlong* globalStarts)
-{
-  mesh2D* mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-  // currently constant coefficient case only
-  const dfloat lambda = elliptic->lambda[0];
-
-  int rank = mesh->rank;
-
-  //use the masked gs handle to define a global ordering
-
-  // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = elliptic->ogs->Ngather;
-  dlong Ntotal  = mesh->Np * mesh->Nelements;
-
-  // create a global numbering system
-  hlong* globalIds = (hlong*) calloc(Ngather,sizeof(hlong));
-  int* owner     = (int*) calloc(Ngather,sizeof(int));
-
-  // every gathered degree of freedom has its own global id
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts + 1, 1, MPI_HLONG, mesh->comm);
-  for(int r = 0; r < mesh->size; ++r)
-    globalStarts[r + 1] = globalStarts[r] + globalStarts[r + 1];
-
-  //use the offsets to set a consecutive global numbering
-  for (dlong n = 0; n < elliptic->ogs->Ngather; n++) {
-    globalIds[n] = n + globalStarts[rank];
-    owner[n] = rank;
-  }
-
-  //scatter this numbering to the original nodes
-  hlong* globalNumbering = (hlong*) calloc(Ntotal,sizeof(hlong));
-  int* globalOwners = (int*) calloc(Ntotal,sizeof(int));
-  for (dlong n = 0; n < Ntotal; n++) globalNumbering[n] = -1;
-  ogsScatter(globalNumbering, globalIds, ogsHlong, ogsAdd, elliptic->ogs);
-  ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs);
-
-  free(globalIds);
-  free(owner);
-
-  // 2. Build non-zeros of stiffness matrix (unassembled)
-  dlong nnzLocal = mesh->Np * mesh->Np * mesh->Nelements;
-  nonZero_t* sendNonZeros = (nonZero_t*) calloc(nnzLocal, sizeof(nonZero_t));
-  int* AsendCounts  = (int*) calloc(mesh->size, sizeof(int));
-  int* ArecvCounts  = (int*) calloc(mesh->size, sizeof(int));
-  int* AsendOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
-  int* ArecvOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
-
-  int* mask = (int*) calloc(mesh->Np * mesh->Nelements,sizeof(int));
-  for (dlong n = 0; n < elliptic->Nmasked; n++) mask[elliptic->maskIds[n]] = 1;
-
-  if(mesh->rank == 0) printf("Building full FEM matrix...");
-  fflush(stdout);
-
-  dlong cnt = 0;
-  for (dlong e = 0; e < mesh->Nelements; e++)
-    for (int nz = 0; nz < mesh->Nq; nz++)
-      for (int ny = 0; ny < mesh->Nq; ny++)
-        for (int nx = 0; nx < mesh->Nq; nx++) {
-          int idn = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-          if (mask[e * mesh->Np + idn]) continue; //skip masked nodes
-
-          for (int mz = 0; mz < mesh->Nq; mz++)
-            for (int my = 0; my < mesh->Nq; my++)
-              for (int mx = 0; mx < mesh->Nq; mx++) {
-                int idm = mx + my * mesh->Nq + mz * mesh->Nq * mesh->Nq;
-                if (mask[e * mesh->Np + idm]) continue; //skip masked nodes
-
-                int id;
-                dfloat val = 0.;
-
-                if ((ny == my) && (nz == mz)) {
-                  for (int k = 0; k < mesh->Nq; k++) {
-                    id = k + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                    dfloat Grr = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G00ID * mesh->Np];
-
-                    val += Grr * mesh->D[nx + k * mesh->Nq] * mesh->D[mx + k * mesh->Nq];
-                  }
-                }
-
-                if (nz == mz) {
-                  id = mx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Grs = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-                  val += Grs * mesh->D[nx + mx * mesh->Nq] * mesh->D[my + ny * mesh->Nq];
-
-                  id = nx + my * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Gsr = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-                  val += Gsr * mesh->D[mx + nx * mesh->Nq] * mesh->D[ny + my * mesh->Nq];
-                }
-
-                if (ny == my) {
-                  id = mx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Grt = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G02ID * mesh->Np];
-                  val += Grt * mesh->D[nx + mx * mesh->Nq] * mesh->D[mz + nz * mesh->Nq];
-
-                  id = nx + ny * mesh->Nq + mz * mesh->Nq * mesh->Nq;
-                  dfloat Gst = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G02ID * mesh->Np];
-                  val += Gst * mesh->D[mx + nx * mesh->Nq] * mesh->D[nz + mz * mesh->Nq];
-                }
-
-                if ((nx == mx) && (nz == mz)) {
-                  for (int k = 0; k < mesh->Nq; k++) {
-                    id = nx + k * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                    dfloat Gss = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G11ID * mesh->Np];
-
-                    val += Gss * mesh->D[ny + k * mesh->Nq] * mesh->D[my + k * mesh->Nq];
-                  }
-                }
-
-                if (nx == mx) {
-                  id = nx + my * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Gst = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G12ID * mesh->Np];
-                  val += Gst * mesh->D[ny + my * mesh->Nq] * mesh->D[mz + nz * mesh->Nq];
-
-                  id = nx + ny * mesh->Nq + mz * mesh->Nq * mesh->Nq;
-                  dfloat Gts = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G12ID * mesh->Np];
-                  val += Gts * mesh->D[my + ny * mesh->Nq] * mesh->D[nz + mz * mesh->Nq];
-                }
-
-                if ((nx == mx) && (ny == my)) {
-                  for (int k = 0; k < mesh->Nq; k++) {
-                    id = nx + ny * mesh->Nq + k * mesh->Nq * mesh->Nq;
-                    dfloat Gtt = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + G22ID * mesh->Np];
-
-                    val += Gtt * mesh->D[nz + k * mesh->Nq] * mesh->D[mz + k * mesh->Nq];
-                  }
-                }
-
-                if ((nx == mx) && (ny == my) && (nz == mz)) {
-                  id = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat JW = mesh->ggeo[e * mesh->Np * mesh->Nggeo + id + GWJID * mesh->Np];
-                  val += JW * lambda;
-                }
-
-                // pack non-zero
-                dfloat nonZeroThreshold = 1e-7;
-                if (fabs(val) >= nonZeroThreshold) {
-                  sendNonZeros[cnt].val = val;
-                  sendNonZeros[cnt].row = globalNumbering[e * mesh->Np + idn];
-                  sendNonZeros[cnt].col = globalNumbering[e * mesh->Np + idm];
-                  sendNonZeros[cnt].ownerRank = globalOwners[e * mesh->Np + idn];
-                  cnt++;
-                }
-              }
-        }
-
-  // Make the MPI_NONZERO_T data type
-  MPI_Datatype MPI_NONZERO_T;
-  MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
-  int blength[4] = {1, 1, 1, 1};
-  MPI_Aint addr[4], displ[4];
-  MPI_Get_address ( &(sendNonZeros[0]          ), addr + 0);
-  MPI_Get_address ( &(sendNonZeros[0].col      ), addr + 1);
-  MPI_Get_address ( &(sendNonZeros[0].ownerRank), addr + 2);
-  MPI_Get_address ( &(sendNonZeros[0].val      ), addr + 3);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  MPI_Type_create_struct (4, blength, displ, dtype, &MPI_NONZERO_T);
-  MPI_Type_commit (&MPI_NONZERO_T);
-
-  // count how many non-zeros to send to each process
-  for(dlong n = 0; n < cnt; ++n)
-    AsendCounts[sendNonZeros[n].ownerRank]++;
-
-  // sort by row ordering
-  qsort(sendNonZeros, cnt, sizeof(nonZero_t), parallelCompareRowColumn);
-
-  // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh->comm);
-
-  // find send and recv offsets for gather
-  *nnz = 0;
-  for(int r = 0; r < mesh->size; ++r) {
-    AsendOffsets[r + 1] = AsendOffsets[r] + AsendCounts[r];
-    ArecvOffsets[r + 1] = ArecvOffsets[r] + ArecvCounts[r];
-    *nnz += ArecvCounts[r];
-  }
-
-  *A = (nonZero_t*) calloc(*nnz, sizeof(nonZero_t));
-
-  // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, MPI_NONZERO_T,
-                (*A), ArecvCounts, ArecvOffsets, MPI_NONZERO_T,
-                mesh->comm);
-
-  // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  qsort((*A), *nnz, sizeof(nonZero_t), parallelCompareRowColumn);
-
-  // compress duplicates
-  cnt = 0;
-  for(dlong n = 1; n < *nnz; ++n) {
-    if((*A)[n].row == (*A)[cnt].row &&
-       (*A)[n].col == (*A)[cnt].col) {
-      (*A)[cnt].val += (*A)[n].val;
-    }else {
-      ++cnt;
-      (*A)[cnt] = (*A)[n];
-    }
-  }
-  if (*nnz) cnt++;
-  *nnz = cnt;
-
-  if(mesh->rank == 0) printf("done.\n");
-
-  MPI_Barrier(mesh->comm);
-  MPI_Type_free(&MPI_NONZERO_T);
-
-  free(sendNonZeros);
-  free(globalNumbering);
-  free(globalOwners);
-
-  free(AsendCounts);
-  free(ArecvCounts);
-  free(AsendOffsets);
-  free(ArecvOffsets);
-}
-
-// void ellipticBuildContinuousTri2D(elliptic_t *elliptic, dfloat lambda, nonZero_t **A, dlong *nnz, ogs_t **ogs, hlong *globalStarts) {
-
-//   mesh2D *mesh = elliptic->mesh;
-//   setupAide options = elliptic->options;
-
-//   int rank = mesh->rank;
-
-//   //use the masked gs handle to define a global ordering
-
-//   // number of degrees of freedom on this rank (after gathering)
-//   hlong Ngather = elliptic->ogs->Ngather;
-//   dlong Ntotal  = mesh->Np*mesh->Nelements;
-
-//   // create a global numbering system
-//   hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong));
-//   int   *owner     = (int *) calloc(Ngather,sizeof(int));
-
-//   // every gathered degree of freedom has its own global id
-//   MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm);
-//   for(int r=0;r<mesh->size;++r)
-//     globalStarts[r+1] = globalStarts[r]+globalStarts[r+1];
-
-//   //use the offsets to set a consecutive global numbering
-//   for (dlong n =0;n<elliptic->ogs->Ngather;n++) {
-//     globalIds[n] = n + globalStarts[rank];
-//     owner[n] = rank;
-//   }
-
-//   //scatter this numbering to the original nodes
-//   hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong));
-//   int *globalOwners = (int *) calloc(Ntotal,sizeof(int));
-//   for (dlong n=0;n<Ntotal;n++) globalNumbering[n] = -1;
-//   ogsScatter(globalNumbering, globalIds, ogsHlong, ogsAdd, elliptic->ogs);
-//   ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs);
-
-//   free(globalIds); free(owner);
-
-//   // Build non-zeros of stiffness matrix (unassembled)
-//   dlong nnzLocal = mesh->Np*mesh->Np*mesh->Nelements;
-
-//   nonZero_t *sendNonZeros = (nonZero_t*) calloc(nnzLocal, sizeof(nonZero_t));
-//   int *AsendCounts  = (int*) calloc(mesh->size, sizeof(int));
-//   int *ArecvCounts  = (int*) calloc(mesh->size, sizeof(int));
-//   int *AsendOffsets = (int*) calloc(mesh->size+1, sizeof(int));
-//   int *ArecvOffsets = (int*) calloc(mesh->size+1, sizeof(int));
-
-//   dfloat *Srr = (dfloat *) calloc(mesh->Np*mesh->Np,sizeof(dfloat));
-//   dfloat *Srs = (dfloat *) calloc(mesh->Np*mesh->Np,sizeof(dfloat));
-//   dfloat *Sss = (dfloat *) calloc(mesh->Np*mesh->Np,sizeof(dfloat));
-//   dfloat *MM  = (dfloat *) calloc(mesh->Np*mesh->Np,sizeof(dfloat));
-
-//   for (int n=0;n<mesh->Np;n++) {
-//     for (int m=0;m<mesh->Np;m++) {
-//       Srr[m+n*mesh->Np] = mesh->Srr[m+n*mesh->Np];
-//       Srs[m+n*mesh->Np] = mesh->Srs[m+n*mesh->Np] + mesh->Ssr[m+n*mesh->Np];
-//       Sss[m+n*mesh->Np] = mesh->Sss[m+n*mesh->Np];
-//       MM[m+n*mesh->Np] = mesh->MM[m+n*mesh->Np];
-//     }
-//   }
-
-//   if(mesh->rank==0) printf("Building full FEM matrix...");fflush(stdout);
-
-//   //Build unassembed non-zeros
-//   dlong cnt =0;
-//   for (dlong e=0;e<mesh->Nelements;e++) {
-//     dfloat Grr = mesh->ggeo[e*mesh->Nggeo + G00ID];
-//     dfloat Grs = mesh->ggeo[e*mesh->Nggeo + G01ID];
-//     dfloat Gss = mesh->ggeo[e*mesh->Nggeo + G11ID];
-//     dfloat J   = mesh->ggeo[e*mesh->Nggeo + GWJID];
-
-//     for (int n=0;n<mesh->Np;n++) {
-//       if (globalNumbering[e*mesh->Np + n]<0) continue; //skip masked nodes
-//       for (int m=0;m<mesh->Np;m++) {
-//         if (globalNumbering[e*mesh->Np + m]<0) continue; //skip masked nodes
-
-//         dfloat val = 0.;
-
-//         val += Grr*Srr[m+n*mesh->Np];
-//         val += Grs*Srs[m+n*mesh->Np];
-//         val += Gss*Sss[m+n*mesh->Np];
-//         val += J*lambda*MM[m+n*mesh->Np];
-
-//         dfloat nonZeroThreshold = 1e-7;
-//         if (fabs(val)>nonZeroThreshold) {
-//           // pack non-zero
-//           sendNonZeros[cnt].val = val;
-//           sendNonZeros[cnt].row = globalNumbering[e*mesh->Np + n];
-//           sendNonZeros[cnt].col = globalNumbering[e*mesh->Np + m];
-//           sendNonZeros[cnt].ownerRank = globalOwners[e*mesh->Np + n];
-//           cnt++;
-//         }
-//       }
-//     }
-//   }
-
-//   // Make the MPI_NONZERO_T data type
-//   MPI_Datatype MPI_NONZERO_T;
-//   MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
-//   int blength[4] = {1, 1, 1, 1};
-//   MPI_Aint addr[4], displ[4];
-//   MPI_Get_address ( &(sendNonZeros[0]          ), addr+0);
-//   MPI_Get_address ( &(sendNonZeros[0].col      ), addr+1);
-//   MPI_Get_address ( &(sendNonZeros[0].ownerRank), addr+2);
-//   MPI_Get_address ( &(sendNonZeros[0].val      ), addr+3);
-//   displ[0] = 0;
-//   displ[1] = addr[1] - addr[0];
-//   displ[2] = addr[2] - addr[0];
-//   displ[3] = addr[3] - addr[0];
-//   MPI_Type_create_struct (4, blength, displ, dtype, &MPI_NONZERO_T);
-//   MPI_Type_commit (&MPI_NONZERO_T);
-
-//   // count how many non-zeros to send to each process
-//   for(dlong n=0;n<cnt;++n)
-//     AsendCounts[sendNonZeros[n].ownerRank]++;
-
-//   // sort by row ordering
-//   qsort(sendNonZeros, cnt, sizeof(nonZero_t), parallelCompareRowColumn);
-
-//   // find how many nodes to expect (should use sparse version)
-//   MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh->comm);
-
-//   // find send and recv offsets for gather
-//   *nnz = 0;
-//   for(int r=0;r<mesh->size;++r){
-//     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
-//     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
-//     *nnz += ArecvCounts[r];
-//   }
-
-//   *A = (nonZero_t*) calloc(*nnz, sizeof(nonZero_t));
-
-//   // determine number to receive
-//   MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, MPI_NONZERO_T,
-//                         (*A), ArecvCounts, ArecvOffsets, MPI_NONZERO_T,
-//                         mesh->comm);
-
-//   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-//   qsort((*A), *nnz, sizeof(nonZero_t), parallelCompareRowColumn);
-
-//   // compress duplicates
-//   cnt = 0;
-//   for(dlong n=1;n<*nnz;++n){
-//     if((*A)[n].row == (*A)[cnt].row &&
-//        (*A)[n].col == (*A)[cnt].col){
-//       (*A)[cnt].val += (*A)[n].val;
-//     }
-//     else{
-//       ++cnt;
-//       (*A)[cnt] = (*A)[n];
-//     }
-//   }
-//   if (*nnz) cnt++;
-//   *nnz = cnt;
-
-//   if(mesh->rank==0) printf("done.\n");
-
-//   MPI_Barrier(mesh->comm);
-//   MPI_Type_free(&MPI_NONZERO_T);
-
-//   free(sendNonZeros);
-//   free(globalNumbering); free(globalOwners);
-
-//   free(AsendCounts);
-//   free(ArecvCounts);
-//   free(AsendOffsets);
-//   free(ArecvOffsets);
-
-//   free(Srr);
-//   free(Srs);
-//   free(Sss);
-//   free(MM );
-// }
-
-// void ellipticBuildContinuousQuad3D(elliptic_t *elliptic, dfloat lambda, nonZero_t **A, dlong *nnz, ogs_t **ogs, hlong *globalStarts) {
-
-//   mesh2D *mesh = elliptic->mesh;
-//   setupAide options = elliptic->options;
-
-//   int rank = mesh->rank;
-
-//   //use the masked gs handle to define a global ordering
-
-//   // number of degrees of freedom on this rank (after gathering)
-//   hlong Ngather = elliptic->ogs->Ngather;
-//   dlong Ntotal  = mesh->Np*mesh->Nelements;
-
-//   // create a global numbering system
-//   hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong));
-//   int   *owner     = (int *) calloc(Ngather,sizeof(int));
-
-//   // every gathered degree of freedom has its own global id
-//   MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm);
-//   for(int r=0;r<mesh->size;++r)
-//     globalStarts[r+1] = globalStarts[r]+globalStarts[r+1];
-
-//   //use the offsets to set a consecutive global numbering
-//   for (dlong n =0;n<elliptic->ogs->Ngather;n++) {
-//     globalIds[n] = n + globalStarts[rank];
-//     owner[n] = rank;
-//   }
-
-//   //scatter this numbering to the original nodes
-//   hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong));
-//   int *globalOwners = (int *) calloc(Ntotal,sizeof(int));
-//   for (dlong n=0;n<Ntotal;n++) globalNumbering[n] = -1;
-//   ogsScatter(globalNumbering, globalIds, ogsHlong, ogsAdd, elliptic->ogs);
-//   ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs);
-
-//   free(globalIds); free(owner);
-
-//   // 2. Build non-zeros of stiffness matrix (unassembled)
-//   dlong nnzLocal = mesh->Np*mesh->Np*mesh->Nelements;
-//   nonZero_t *sendNonZeros = (nonZero_t*) calloc(nnzLocal, sizeof(nonZero_t));
-//   int *AsendCounts  = (int*) calloc(mesh->size, sizeof(int));
-//   int *ArecvCounts  = (int*) calloc(mesh->size, sizeof(int));
-//   int *AsendOffsets = (int*) calloc(mesh->size+1, sizeof(int));
-//   int *ArecvOffsets = (int*) calloc(mesh->size+1, sizeof(int));
-
-//   int *mask = (int *) calloc(mesh->Np*mesh->Nelements,sizeof(int));
-//   for (dlong n=0;n<elliptic->Nmasked;n++) mask[elliptic->maskIds[n]] = 1;
-
-//   if(mesh->rank==0) printf("Building full FEM matrix...");fflush(stdout);
-
-// #if 0
-//   hlong NTf = mesh->Nelements*mesh->Np * mesh->Nelements*mesh->Np ;
-//   dfloat *Af = (dfloat *)calloc(NTf, sizeof(dfloat));
-// #endif
-
-//   //Build unassembed non-zeros
-//   dlong cnt =0;
-//   for (dlong e=0;e<mesh->Nelements;e++) {
-//     for (int ny=0;ny<mesh->Nq;ny++) {
-//       for (int nx=0;nx<mesh->Nq;nx++) {
-//         if (mask[e*mesh->Np + nx+ny*mesh->Nq]) continue; //skip masked nodes
-//         for (int my=0;my<mesh->Nq;my++) {
-//           for (int mx=0;mx<mesh->Nq;mx++) {
-//             if (mask[e*mesh->Np + mx+my*mesh->Nq]) continue; //skip masked nodes
-
-//             int id;
-//             dfloat val = 0.;
-
-//              if (ny==my) {
-//               for (int k=0;k<mesh->Nq;k++) {
-//                 id = k+ny*mesh->Nq;
-//                 dfloat Grr = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G00ID*mesh->Np];
-
-//                 val += Grr*mesh->D[nx+k*mesh->Nq]*mesh->D[mx+k*mesh->Nq];
-//               }
-//             }
-
-//             id = mx+ny*mesh->Nq;
-//             dfloat Grs = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G01ID*mesh->Np];
-//             val += Grs*mesh->D[nx+mx*mesh->Nq]*mesh->D[my+ny*mesh->Nq];
-
-//             id = nx+my*mesh->Nq;
-//             dfloat Gsr = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G01ID*mesh->Np];
-//             val += Gsr*mesh->D[mx+nx*mesh->Nq]*mesh->D[ny+my*mesh->Nq];
-
-//             // id = mx+ny*mesh->Nq;
-//             // dfloat Grt = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G02ID*mesh->Np];
-//             // val += Grt*mesh->D[nx+mx*mesh->Nq];
-
-//             // id = nx+my*mesh->Nq;
-//             // dfloat Gtr = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G02ID*mesh->Np];
-//             // val += Gtr*mesh->D[mx+nx*mesh->Nq];
-
-//             if (nx==mx) {
-//               for (int k=0;k<mesh->Nq;k++) {
-//                 id = nx+k*mesh->Nq;
-//                 dfloat Gss = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G11ID*mesh->Np];
-
-//                 val += Gss*mesh->D[ny+k*mesh->Nq]*mesh->D[my+k*mesh->Nq];
-//               }
-//             }
-
-//             // double check following two: AK
-//             // id = nx+my*mesh->Nq;
-//             // dfloat Gst = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G12ID*mesh->Np];
-//             // val += Gst*mesh->D[ny+my*mesh->Nq];
-
-//             // id = mx+ny*mesh->Nq;
-//             // dfloat Gts = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G12ID*mesh->Np];
-//             // val += Gts*mesh->D[my+ny*mesh->Nq];
-
-//             if ((nx==mx)&&(ny==my)) {
-//               id = nx + ny*mesh->Nq;
-
-//               // dfloat Gtt = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + G22ID*mesh->Np];
-//               // val += Gtt;
-
-//               dfloat JW = mesh->ggeo[e*mesh->Np*mesh->Nggeo + id + GWJID*mesh->Np];
-//               val += JW*lambda;
-//             }
-
-// #if 0
-//             const hlong rowid = e*mesh->Np + nx + ny*mesh->Nq;
-//             const hlong colid = e*mesh->Np + mx + my*mesh->Nq;
-
-//             Af[rowid*mesh->Nelements*mesh->Np + colid] = val;
-// #endif
-
-//             dfloat nonZeroThreshold = 1e-7;
-//             if (fabs(val)>nonZeroThreshold) {
-//               // pack non-zero
-//               sendNonZeros[cnt].val = val;
-//               sendNonZeros[cnt].row = globalNumbering[e*mesh->Np + nx+ny*mesh->Nq];
-//               sendNonZeros[cnt].col = globalNumbering[e*mesh->Np + mx+my*mesh->Nq];
-//               sendNonZeros[cnt].ownerRank = globalOwners[e*mesh->Np + nx+ny*mesh->Nq];
-//               cnt++;
-//             }
-//           }
-//         }
-//       }
-//     }
-//   }
-
-// #if 0
-//  // Write matlab dat for postprocess
-//   char fname[BUFSIZ];
-//   sprintf(fname, "Ax.dat");
-//   FILE *fp;
-//   fp = fopen(fname, "w");
-
-//   for(hlong row = 0; row<(mesh->Nelements*mesh->Np); row++){
-//     for(hlong col = 0; col<(mesh->Nelements*mesh->Np); col++){
-//       dfloat val = Af[row*mesh->Nelements*mesh->Np + col];
-//       fprintf(fp,"%.8e ", val);
-//     }
-//     fprintf(fp,"\n");
-//   }
-
-//  fclose(fp);
-
-// #endif
-
-//   // Make the MPI_NONZERO_T data type
-//   MPI_Datatype MPI_NONZERO_T;
-//   MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
-//   int blength[4] = {1, 1, 1, 1};
-//   MPI_Aint addr[4], displ[4];
-//   MPI_Get_address ( &(sendNonZeros[0]          ), addr+0);
-//   MPI_Get_address ( &(sendNonZeros[0].col      ), addr+1);
-//   MPI_Get_address ( &(sendNonZeros[0].ownerRank), addr+2);
-//   MPI_Get_address ( &(sendNonZeros[0].val      ), addr+3);
-//   displ[0] = 0;
-//   displ[1] = addr[1] - addr[0];
-//   displ[2] = addr[2] - addr[0];
-//   displ[3] = addr[3] - addr[0];
-//   MPI_Type_create_struct (4, blength, displ, dtype, &MPI_NONZERO_T);
-//   MPI_Type_commit (&MPI_NONZERO_T);
-
-//   // count how many non-zeros to send to each process
-//   for(dlong n=0;n<cnt;++n)
-//     AsendCounts[sendNonZeros[n].ownerRank]++;
-
-//   // sort by row ordering
-//   qsort(sendNonZeros, cnt, sizeof(nonZero_t), parallelCompareRowColumn);
-
-//   // find how many nodes to expect (should use sparse version)
-//   MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh->comm);
-
-//   // find send and recv offsets for gather
-//   *nnz = 0;
-//   for(int r=0;r<mesh->size;++r){
-//     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
-//     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
-//     *nnz += ArecvCounts[r];
-//   }
-
-//   *A = (nonZero_t*) calloc(*nnz, sizeof(nonZero_t));
-
-//   // determine number to receive
-//   MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, MPI_NONZERO_T,
-//                         (*A), ArecvCounts, ArecvOffsets, MPI_NONZERO_T,
-//                         mesh->comm);
-
-//   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-//   qsort((*A), *nnz, sizeof(nonZero_t), parallelCompareRowColumn);
-
-//   // compress duplicates
-//   cnt = 0;
-//   for(dlong n=1;n<*nnz;++n){
-//     if((*A)[n].row == (*A)[cnt].row &&
-//        (*A)[n].col == (*A)[cnt].col){
-//       (*A)[cnt].val += (*A)[n].val;
-//     }
-//     else{
-//       ++cnt;
-//       (*A)[cnt] = (*A)[n];
-//     }
-//   }
-//   if (*nnz) cnt++;
-//   *nnz = cnt;
-
-// #if 0
-//   // Write matlab dat for postprocess
-//   char fname[BUFSIZ];
-//   sprintf(fname, "Ax.dat");
-//   FILE *fp;
-//   fp = fopen(fname, "w");
-
-//   for(dlong n=1;n<*nnz;++n){
-//       fprintf(fp,"%d %d %.8e\n", (*A)[n].row+1, (*A)[n].col+1, (*A)[n].val);
-//   }
-
-//  fclose(fp);
-// #endif
-
-//   if(mesh->rank==0) printf("done.\n");
-
-//   MPI_Barrier(mesh->comm);
-//   MPI_Type_free(&MPI_NONZERO_T);
-
-//   free(sendNonZeros);
-//   free(globalNumbering); free(globalOwners);
-
-//   free(AsendCounts);
-//   free(ArecvCounts);
-//   free(AsendOffsets);
-//   free(ArecvOffsets);
-// }
-
-// void ellipticBuildContinuousTet3D(elliptic_t *elliptic, dfloat lambda, nonZero_t **A, dlong *nnz, ogs_t **ogs, hlong *globalStarts) {
-
-//   mesh2D *mesh = elliptic->mesh;
-//   setupAide options = elliptic->options;
-
-//   int rank = mesh->rank;
-
-//   //use the masked gs handle to define a global ordering
-
-//   // number of degrees of freedom on this rank (after gathering)
-//   hlong Ngather = elliptic->ogs->Ngather;
-//   dlong Ntotal  = mesh->Np*mesh->Nelements;
-
-//   // create a global numbering system
-//   hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong));
-//   int   *owner     = (int *) calloc(Ngather,sizeof(int));
-
-//   // every gathered degree of freedom has its own global id
-//   MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm);
-//   for(int r=0;r<mesh->size;++r)
-//     globalStarts[r+1] = globalStarts[r]+globalStarts[r+1];
-
-//   //use the offsets to set a consecutive global numbering
-//   for (dlong n =0;n<elliptic->ogs->Ngather;n++) {
-//     globalIds[n] = n + globalStarts[rank];
-//     owner[n] = rank;
-//   }
-
-//   //scatter this numbering to the original nodes
-//   hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong));
-//   int *globalOwners = (int *) calloc(Ntotal,sizeof(int));
-//   for (dlong n=0;n<Ntotal;n++) globalNumbering[n] = -1;
-//   ogsScatter(globalNumbering, globalIds, ogsHlong, ogsAdd, elliptic->ogs);
-//   ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs);
-
-//   free(globalIds); free(owner);
-
-//   // Build non-zeros of stiffness matrix (unassembled)
-//   dlong nnzLocal = mesh->Np*mesh->Np*mesh->Nelements;
-
-//   nonZero_t *sendNonZeros = (nonZero_t*) calloc(nnzLocal, sizeof(nonZero_t));
-//   int *AsendCounts  = (int*) calloc(mesh->size, sizeof(int));
-//   int *ArecvCounts  = (int*) calloc(mesh->size, sizeof(int));
-//   int *AsendOffsets = (int*) calloc(mesh->size+1, sizeof(int));
-//   int *ArecvOffsets = (int*) calloc(mesh->size+1, sizeof(int));
-
-//   int *mask = (int *) calloc(mesh->Np*mesh->Nelements,sizeof(int));
-//   for (dlong n=0;n<elliptic->Nmasked;n++) mask[elliptic->maskIds[n]] = 1;
-
-//   //Build unassembed non-zeros
-//   if(mesh->rank==0) printf("Building full FEM matrix...");fflush(stdout);
-
-//   dlong cnt =0;
-//   #pragma omp parallel for
-//   for (dlong e=0;e<mesh->Nelements;e++) {
-
-//     dfloat Grr = mesh->ggeo[e*mesh->Nggeo + G00ID];
-//     dfloat Grs = mesh->ggeo[e*mesh->Nggeo + G01ID];
-//     dfloat Grt = mesh->ggeo[e*mesh->Nggeo + G02ID];
-//     dfloat Gss = mesh->ggeo[e*mesh->Nggeo + G11ID];
-//     dfloat Gst = mesh->ggeo[e*mesh->Nggeo + G12ID];
-//     dfloat Gtt = mesh->ggeo[e*mesh->Nggeo + G22ID];
-//     dfloat J   = mesh->ggeo[e*mesh->Nggeo + GWJID];
-
-//     for (int n=0;n<mesh->Np;n++) {
-//       if (mask[e*mesh->Np + n]) continue; //skip masked nodes
-//       for (int m=0;m<mesh->Np;m++) {
-//         if (mask[e*mesh->Np + m]) continue; //skip masked nodes
-//         dfloat val = 0.;
-
-//         val += Grr*mesh->Srr[m+n*mesh->Np];
-//         val += Grs*mesh->Srs[m+n*mesh->Np];
-//         val += Grt*mesh->Srt[m+n*mesh->Np];
-//         val += Grs*mesh->Ssr[m+n*mesh->Np];
-//         val += Gss*mesh->Sss[m+n*mesh->Np];
-//         val += Gst*mesh->Sst[m+n*mesh->Np];
-//         val += Grt*mesh->Str[m+n*mesh->Np];
-//         val += Gst*mesh->Sts[m+n*mesh->Np];
-//         val += Gtt*mesh->Stt[m+n*mesh->Np];
-//         val += J*lambda*mesh->MM[m+n*mesh->Np];
-
-//         dfloat nonZeroThreshold = 1e-7;
-//         if (fabs(val)>nonZeroThreshold) {
-//           #pragma omp critical
-//           {
-//             // pack non-zero
-//             sendNonZeros[cnt].val = val;
-//             sendNonZeros[cnt].row = globalNumbering[e*mesh->Np + n];
-//             sendNonZeros[cnt].col = globalNumbering[e*mesh->Np + m];
-//             sendNonZeros[cnt].ownerRank = globalOwners[e*mesh->Np + n];
-//             cnt++;
-//           }
-//         }
-//       }
-//     }
-//   }
-
-//   // Make the MPI_NONZERO_T data type
-//   MPI_Datatype MPI_NONZERO_T;
-//   MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
-//   int blength[4] = {1, 1, 1, 1};
-//   MPI_Aint addr[4], displ[4];
-//   MPI_Get_address ( &(sendNonZeros[0]          ), addr+0);
-//   MPI_Get_address ( &(sendNonZeros[0].col      ), addr+1);
-//   MPI_Get_address ( &(sendNonZeros[0].ownerRank), addr+2);
-//   MPI_Get_address ( &(sendNonZeros[0].val      ), addr+3);
-//   displ[0] = 0;
-//   displ[1] = addr[1] - addr[0];
-//   displ[2] = addr[2] - addr[0];
-//   displ[3] = addr[3] - addr[0];
-//   MPI_Type_create_struct (4, blength, displ, dtype, &MPI_NONZERO_T);
-//   MPI_Type_commit (&MPI_NONZERO_T);
-
-//   // count how many non-zeros to send to each process
-//   for(dlong n=0;n<cnt;++n)
-//     AsendCounts[sendNonZeros[n].ownerRank] += 1;
-
-//   // sort by row ordering
-//   qsort(sendNonZeros, cnt, sizeof(nonZero_t), parallelCompareRowColumn);
-
-//   // find how many nodes to expect (should use sparse version)
-//   MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh->comm);
-
-//   // find send and recv offsets for gather
-//   *nnz = 0;
-//   for(int r=0;r<mesh->size;++r){
-//     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
-//     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
-//     *nnz += ArecvCounts[r];
-//   }
-
-//   *A = (nonZero_t*) calloc(*nnz, sizeof(nonZero_t));
-
-//   // determine number to receive
-//   MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, MPI_NONZERO_T,
-//                         (*A), ArecvCounts, ArecvOffsets, MPI_NONZERO_T,
-//                         mesh->comm);
-
-//   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-//   qsort((*A), *nnz, sizeof(nonZero_t), parallelCompareRowColumn);
-
-//   // compress duplicates
-//   cnt = 0;
-//   for(dlong n=1;n<*nnz;++n){
-//     if((*A)[n].row == (*A)[cnt].row &&
-//        (*A)[n].col == (*A)[cnt].col){
-//       (*A)[cnt].val += (*A)[n].val;
-//     }
-//     else{
-//       ++cnt;
-//       (*A)[cnt] = (*A)[n];
-//     }
-//   }
-//   if (*nnz) cnt++;
-//   *nnz = cnt;
-
-//   if(mesh->rank==0) printf("done.\n");
-
-//   MPI_Barrier(mesh->comm);
-//   MPI_Type_free(&MPI_NONZERO_T);
-
-//   free(sendNonZeros);
-//   free(globalNumbering); free(globalOwners);
-
-//   free(AsendCounts);
-//   free(ArecvCounts);
-//   free(AsendOffsets);
-//   free(ArecvOffsets);
-
-//   free(mask);
-// }
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildLocalPatches.c b/src/libP/solvers/elliptic/src/ellipticBuildLocalPatches.c
deleted file mode 100644
index b9a634b7b..000000000
--- a/src/libP/solvers/elliptic/src/ellipticBuildLocalPatches.c
+++ /dev/null
@@ -1,1520 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-//returns the ipdg patch A matrix for element eM
-void BuildLocalIpdgPatchAxTri2D(elliptic_t* elliptic,
-                                mesh_t* mesh,
-                                int basisNp,
-                                dfloat* basis,
-                                dfloat lambda,
-                                dfloat* MS,
-                                dlong eM,
-                                dfloat* A);
-void BuildLocalIpdgPatchAxQuad2D(elliptic_t* elliptic,
-                                 mesh_t* mesh,
-                                 dfloat lambda,
-                                 dfloat* B,
-                                 dfloat* Br,
-                                 dfloat* Bs,
-                                 dlong eM,
-                                 dfloat* A);
-void BuildLocalIpdgPatchAxTet3D(elliptic_t* elliptic,
-                                mesh_t* mesh,
-                                dfloat lambda,
-                                dfloat* MS,
-                                dlong eM,
-                                dfloat* A);
-void BuildLocalIpdgPatchAxHex3D(elliptic_t* elliptic,
-                                mesh_t* mesh,
-                                dfloat lambda,
-                                dfloat* B,
-                                dfloat* Br,
-                                dfloat* Bs,
-                                dfloat* Bt,
-                                dlong eM,
-                                dfloat* A);
-
-//returns the C0FEM patch A matrix for element eM
-void BuildLocalContinuousPatchAxTri2D(elliptic_t* elliptic,
-                                      mesh_t* mesh,
-                                      dfloat lambda,
-                                      dlong eM,
-                                      dfloat* A);
-void BuildLocalContinuousPatchAxQuad2D(elliptic_t* elliptic,
-                                       mesh_t* mesh,
-                                       dfloat lambda,
-                                       dlong eM,
-                                       dfloat* B,
-                                       dfloat* Br,
-                                       dfloat* Bs,
-                                       dfloat* A);
-void BuildLocalContinuousPatchAxTet3D(elliptic_t* elliptic,
-                                      mesh_t* mesh,
-                                      dfloat lambda,
-                                      dlong eM,
-                                      dfloat* A);
-void BuildLocalContinuousPatchAxHex3D(elliptic_t* elliptic,
-                                      mesh_t* mesh,
-                                      dfloat lambda,
-                                      dlong eM,
-                                      dfloat* B,
-                                      dfloat* Br,
-                                      dfloat* Bs,
-                                      dfloat* Bt,
-                                      dfloat* A);
-
-void ellipticBuildLocalPatchesTri2D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                    dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA);
-void ellipticBuildLocalPatchesQuad2D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                     dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA);
-void ellipticBuildLocalPatchesTet3D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                    dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA);
-void ellipticBuildLocalPatchesHex3D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                    dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA);
-
-void ellipticBuildLocalPatches(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                               dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA)
-{
-  switch(elliptic->elementType) {
-  case TRIANGLES:
-    ellipticBuildLocalPatchesTri2D(elliptic,
-                                   lambda,
-                                   rateTolerance,
-                                   Npatches,
-                                   patchesIndex,
-                                   patchesInvA);
-    break;
-  case QUADRILATERALS:
-    ellipticBuildLocalPatchesQuad2D(elliptic,
-                                    lambda,
-                                    rateTolerance,
-                                    Npatches,
-                                    patchesIndex,
-                                    patchesInvA);
-    break;
-  case TETRAHEDRA:
-    ellipticBuildLocalPatchesTet3D(elliptic,
-                                   lambda,
-                                   rateTolerance,
-                                   Npatches,
-                                   patchesIndex,
-                                   patchesInvA);
-    break;
-  case HEXAHEDRA:
-    ellipticBuildLocalPatchesHex3D(elliptic,
-                                   lambda,
-                                   rateTolerance,
-                                   Npatches,
-                                   patchesIndex,
-                                   patchesInvA);
-    break;
-  }
-}
-
-void ellipticBuildLocalPatchesTri2D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                    dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA)
-{
-  mesh_t* mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  // surface mass matrices MS = MM*LIFT
-  dfloat* MS = (dfloat*) calloc(mesh->Nfaces * mesh->Nfp * mesh->Nfp,sizeof(dfloat));
-  for (int f = 0; f < mesh->Nfaces; f++)
-    for (int n = 0; n < mesh->Nfp; n++) {
-      int fn = mesh->faceNodes[f * mesh->Nfp + n];
-
-      for (int m = 0; m < mesh->Nfp; m++) {
-        dfloat MSnm = 0;
-
-        for (int i = 0; i < mesh->Np; i++)
-          MSnm += mesh->MM[fn + i * mesh->Np] *
-                  mesh->LIFT[i * mesh->Nfp * mesh->Nfaces + f * mesh->Nfp + m];
-
-        MS[m + n * mesh->Nfp + f * mesh->Nfp * mesh->Nfp]  = MSnm;
-      }
-    }
-
-  //patch inverse storage
-  *patchesInvA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  *patchesIndex = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  //temp patch storage
-  dfloat* patchA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* invRefAA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-
-  (*Npatches) = 1;
-  dlong refPatches = 0;
-
-  //build a mini mesh struct for the reference patch
-  mesh_t* refMesh = (mesh_t*) calloc(1,sizeof(mesh_t));
-  memcpy(refMesh,mesh,sizeof(mesh_t));
-
-  //vertices of reference patch
-  dfloat V1x = -1., V2x = 1., V3x =        0.;
-  dfloat V1y =  0., V2y = 0., V3y =  sqrt(3.);
-
-  refMesh->Nelements = 1;
-
-  refMesh->EX = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-  refMesh->EY = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-
-  refMesh->EX[0] = V1x;
-  refMesh->EY[0] = V1y;
-  refMesh->EX[1] = V2x;
-  refMesh->EY[1] = V2y;
-  refMesh->EX[2] = V3x;
-  refMesh->EY[2] = V3y;
-
-  refMesh->EToV = (hlong*) calloc(mesh->Nverts, sizeof(hlong));
-
-  refMesh->EToV[0] = 0;
-  refMesh->EToV[1] = 1;
-  refMesh->EToV[2] = 2;
-
-  refMesh->EToB = (int*) calloc(mesh->Nfaces,sizeof(int));
-  for (int n = 0; n < mesh->Nfaces; n++) refMesh->EToB[n] = 0;
-
-  meshConnect(refMesh);
-  meshLoadReferenceNodesTri2D(refMesh, mesh->N);
-  meshPhysicalNodesTri2D(refMesh);
-  meshGeometricFactorsTri2D(refMesh);
-  meshConnectFaceNodes2D(refMesh);
-  meshSurfaceGeometricFactorsTri2D(refMesh);
-
-  int basisNp = mesh->Np;
-  dfloat* basis;
-  if(options.compareArgs("BASIS","BERN")) {
-    basis = mesh->VB;
-  } else {// default to degree N Lagrange basis
-    basis = (dfloat*) calloc(basisNp * basisNp, sizeof(dfloat));
-    for(int n = 0; n < basisNp; ++n)
-      basis[n + n * basisNp] = 1;
-  }
-
-  //start with reference patch
-  dfloat* refPatchInvA = *patchesInvA;
-  if (options.compareArgs("DISCRETIZATION","IPDG"))
-    BuildLocalIpdgPatchAxTri2D(elliptic, refMesh, basisNp, basis, lambda, MS, 0, refPatchInvA);
-  else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-    BuildLocalContinuousPatchAxTri2D(elliptic, refMesh, lambda, 0, refPatchInvA);
-
-  matrixInverse(mesh->Np, refPatchInvA);
-
-  // loop over all elements
-  for(dlong eM = 0; eM < mesh->Nelements; ++eM) {
-    //build the patch A matrix for this element
-    if (options.compareArgs("DISCRETIZATION","IPDG"))
-      BuildLocalIpdgPatchAxTri2D(elliptic, mesh, basisNp, basis, lambda, MS, eM, patchA);
-    else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-      BuildLocalContinuousPatchAxTri2D(elliptic, mesh, lambda, eM, refPatchInvA);
-
-    dlong eP0 = mesh->EToE[eM * mesh->Nfaces + 0];
-    dlong eP1 = mesh->EToE[eM * mesh->Nfaces + 1];
-    dlong eP2 = mesh->EToE[eM * mesh->Nfaces + 2];
-
-    if(eP0 >= 0 && eP1 >= 0 && eP2 >= 0) { //check if this is an interior patch
-      refPatchInvA = *patchesInvA;
-
-      //hit the patch with the reference inverse
-      for(int n = 0; n < mesh->Np; ++n)
-        for(int m = 0; m < mesh->Np; ++m) {
-          invRefAA[n * mesh->Np + m] = 0.;
-          for (int k = 0; k < mesh->Np; k++)
-            invRefAA[n * mesh->Np + m] += refPatchInvA[n * mesh->Np + k] * patchA[k * mesh->Np + m];
-        }
-
-      dfloat cond = matrixConditionNumber(mesh->Np,invRefAA);
-      dfloat rate = (sqrt(cond) - 1.) / (sqrt(cond) + 1.);
-
-      // printf("Element %d's conditioned patch reports cond = %g and rate = %g \n", eM, cond, rate);
-
-      if (rate < rateTolerance) {
-        (*patchesIndex)[eM] = 0;
-        refPatches++;
-        continue;
-      }
-    }
-    ++(*Npatches);
-    *patchesInvA = (dfloat*) realloc(*patchesInvA,
-                                     (*Npatches) * mesh->Np * mesh->Np * sizeof(dfloat));
-
-    matrixInverse(mesh->Np, patchA);
-
-    //copy inverse into patchesInvA
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        dlong id = ((*Npatches) - 1) * mesh->Np * mesh->Np + n * mesh->Np + m;
-        (*patchesInvA)[id] = patchA[n * mesh->Np + m];
-      }
-
-    (*patchesIndex)[eM] = (*Npatches) - 1;
-  }
-
-  printf("saving " dlongFormat " full patches\n",*Npatches);
-  printf("using " dlongFormat " reference patches\n", refPatches);
-
-  free(refMesh);
-  free(patchA);
-  free(invRefAA);
-  free(MS);
-}
-
-void ellipticBuildLocalPatchesQuad2D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                     dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA)
-{
-  mesh_t* mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  // build some monolithic basis arrays
-  dfloat* B  = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* Br = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* Bs = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-
-  int mode = 0;
-  for(int nj = 0; nj < mesh->N + 1; ++nj)
-    for(int ni = 0; ni < mesh->N + 1; ++ni) {
-      int node = 0;
-
-      for(int j = 0; j < mesh->N + 1; ++j)
-        for(int i = 0; i < mesh->N + 1; ++i) {
-          if(nj == j && ni == i)
-            B[mode * mesh->Np + node] = 1;
-          if(nj == j)
-            Br[mode * mesh->Np + node] = mesh->D[ni + mesh->Nq * i];
-          if(ni == i)
-            Bs[mode * mesh->Np + node] = mesh->D[nj + mesh->Nq * j];
-
-          ++node;
-        }
-      ++mode;
-    }
-
-  //patch inverse storage
-  *patchesInvA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  *patchesIndex = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  //temp patch storage
-  dfloat* patchA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* invRefAA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-
-  (*Npatches) = 1;
-  dlong refPatches = 0;
-
-  //build a mini mesh struct for the reference patch
-  mesh_t* refMesh = (mesh_t*) calloc(1,sizeof(mesh_t));
-  memcpy(refMesh,mesh,sizeof(mesh_t));
-
-  //vertices of reference patch
-  dfloat V1x = -1., V2x =  1., V3x =  1., V4x = -1.;
-  dfloat V1y = -1., V2y = -1., V3y =  1., V4y =  1.;
-
-  refMesh->Nelements = 1;
-
-  refMesh->EX = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-  refMesh->EY = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-
-  refMesh->EX[0] = V1x;
-  refMesh->EY[0] = V1y;
-  refMesh->EX[1] = V2x;
-  refMesh->EY[1] = V2y;
-  refMesh->EX[2] = V3x;
-  refMesh->EY[2] = V3y;
-  refMesh->EX[3] = V4x;
-  refMesh->EY[3] = V4y;
-
-  refMesh->EToV = (hlong*) calloc(mesh->Nverts, sizeof(hlong));
-
-  refMesh->EToV[0] = 0;
-  refMesh->EToV[1] = 1;
-  refMesh->EToV[2] = 2;
-  refMesh->EToV[3] = 3;
-
-  refMesh->EToB = (int*) calloc(mesh->Nfaces,sizeof(int));
-  for (int n = 0; n < mesh->Nfaces; n++) refMesh->EToB[n] = 0;
-
-  meshConnect(refMesh);
-  meshLoadReferenceNodesQuad2D(refMesh, mesh->N);
-  meshPhysicalNodesQuad2D(refMesh);
-  meshGeometricFactorsQuad2D(refMesh);
-  meshConnectFaceNodes2D(refMesh);
-  meshSurfaceGeometricFactorsQuad2D(refMesh);
-
-  //start with reference patch
-  dfloat* refPatchInvA = *patchesInvA;
-  if (options.compareArgs("DISCRETIZATION","IPDG"))
-    BuildLocalIpdgPatchAxQuad2D(elliptic, refMesh, lambda, B,Br,Bs, 0, refPatchInvA);
-  else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-    BuildLocalContinuousPatchAxQuad2D(elliptic, refMesh, lambda, 0, B,Br,Bs, refPatchInvA);
-
-  matrixInverse(mesh->Np, refPatchInvA);
-
-  // loop over all elements
-  for(dlong eM = 0; eM < mesh->Nelements; ++eM) {
-    //build the patch A matrix for this element
-    if (options.compareArgs("DISCRETIZATION","IPDG"))
-      BuildLocalIpdgPatchAxQuad2D(elliptic, mesh, lambda, B,Br,Bs, eM, patchA);
-    else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-      BuildLocalContinuousPatchAxQuad2D(elliptic, mesh, lambda, eM, B,Br,Bs, refPatchInvA);
-
-    dlong eP0 = mesh->EToE[eM * mesh->Nfaces + 0];
-    dlong eP1 = mesh->EToE[eM * mesh->Nfaces + 1];
-    dlong eP2 = mesh->EToE[eM * mesh->Nfaces + 2];
-    dlong eP3 = mesh->EToE[eM * mesh->Nfaces + 3];
-
-    int fP0 = mesh->EToF[eM * mesh->Nfaces + 0];
-    int fP1 = mesh->EToF[eM * mesh->Nfaces + 1];
-    int fP2 = mesh->EToF[eM * mesh->Nfaces + 2];
-    int fP3 = mesh->EToF[eM * mesh->Nfaces + 3];
-
-    if(eP0 >= 0 && eP1 >= 0 && eP2 >= 0 && eP3 >= 0) { //check if this is an interior patch
-      refPatchInvA = *patchesInvA;
-
-      //hit the patch with the reference inverse
-      for(int n = 0; n < mesh->Np; ++n)
-        for(int m = 0; m < mesh->Np; ++m) {
-          invRefAA[n * mesh->Np + m] = 0.;
-          for (int k = 0; k < mesh->Np; k++)
-            invRefAA[n * mesh->Np + m] += refPatchInvA[n * mesh->Np + k] * patchA[k * mesh->Np + m];
-        }
-
-      dfloat cond = matrixConditionNumber(mesh->Np,invRefAA);
-      dfloat rate = (sqrt(cond) - 1.) / (sqrt(cond) + 1.);
-
-      // printf("Element %d's conditioned patch reports cond = %g and rate = %g \n", eM, cond, rate);
-
-      if (rate < rateTolerance) {
-        (*patchesIndex)[eM] = 0;
-        refPatches++;
-        continue;
-      }
-    }
-    ++(*Npatches);
-    *patchesInvA = (dfloat*) realloc(*patchesInvA,
-                                     (*Npatches) * mesh->Np * mesh->Np * sizeof(dfloat));
-
-    matrixInverse(mesh->Np, patchA);
-
-    //copy inverse into patchesInvA
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        dlong id = ((*Npatches) - 1) * mesh->Np * mesh->Np + n * mesh->Np + m;
-        (*patchesInvA)[id] = patchA[n * mesh->Np + m];
-      }
-
-    (*patchesIndex)[eM] = (*Npatches) - 1;
-  }
-
-  printf("saving " dlongFormat " full patches\n",*Npatches);
-  printf("using " dlongFormat " reference patches\n", refPatches);
-
-  free(refMesh);
-  free(patchA);
-  free(invRefAA);
-  free(B);
-  free(Br);
-  free(Bs);
-}
-
-void ellipticBuildLocalPatchesTet3D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                    dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA)
-{
-  mesh_t* mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  // surface mass matrices MS = MM*LIFT
-  dfloat* MS = (dfloat*) calloc(mesh->Nfaces * mesh->Nfp * mesh->Nfp,sizeof(dfloat));
-  for (int f = 0; f < mesh->Nfaces; f++)
-    for (int n = 0; n < mesh->Nfp; n++) {
-      int fn = mesh->faceNodes[f * mesh->Nfp + n];
-
-      for (int m = 0; m < mesh->Nfp; m++) {
-        dfloat MSnm = 0;
-
-        for (int i = 0; i < mesh->Np; i++)
-          MSnm += mesh->MM[fn + i * mesh->Np] *
-                  mesh->LIFT[i * mesh->Nfp * mesh->Nfaces + f * mesh->Nfp + m];
-
-        MS[m + n * mesh->Nfp + f * mesh->Nfp * mesh->Nfp]  = MSnm;
-      }
-    }
-
-  (*Npatches) = 1;
-  dlong refPatches = 0;
-
-  //build a mini mesh struct for the reference patch
-  mesh_t* refMesh = (mesh_t*) calloc(1,sizeof(mesh_t));
-  memcpy(refMesh,mesh,sizeof(mesh_t));
-
-  //vertices of reference patch
-  dfloat V1x = -1., V2x = 1., V3x =        0., V4x = 0;
-  dfloat V1y =  0., V2y = 0., V3y =  sqrt(3.), V4y = 1. / sqrt(3.);
-  dfloat V1z =  0., V2z = 0., V3z =        0., V4z = 2 * sqrt(6.) / 3.;
-
-  refMesh->Nelements = 1;
-
-  refMesh->EX = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-  refMesh->EY = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-  refMesh->EZ = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-
-  refMesh->EX[0] = V1x;
-  refMesh->EY[0] = V1y;
-  refMesh->EZ[0] = V1z;
-  refMesh->EX[1] = V2x;
-  refMesh->EY[1] = V2y;
-  refMesh->EZ[1] = V2z;
-  refMesh->EX[2] = V3x;
-  refMesh->EY[2] = V3y;
-  refMesh->EZ[2] = V3z;
-  refMesh->EX[3] = V4x;
-  refMesh->EY[3] = V4y;
-  refMesh->EZ[3] = V4z;
-
-  refMesh->EToV = (hlong*) calloc(mesh->Nverts, sizeof(hlong));
-
-  refMesh->EToV[0] = 0;
-  refMesh->EToV[1] = 1;
-  refMesh->EToV[2] = 2;
-  refMesh->EToV[3] = 3;
-
-  refMesh->EToB = (int*) calloc(mesh->Nfaces,sizeof(int));
-  for (int n = 0; n < mesh->Nfaces; n++) refMesh->EToB[n] = 0;
-
-  meshConnect(refMesh);
-  meshLoadReferenceNodesTet3D(refMesh, mesh->N);
-  meshPhysicalNodesTet3D(refMesh);
-  meshGeometricFactorsTet3D(refMesh);
-  meshConnectFaceNodes3D(refMesh);
-  meshSurfaceGeometricFactorsTet3D(refMesh);
-
-  //patch inverse storage
-  *patchesInvA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  *patchesIndex = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  //temp patch storage
-  dfloat* patchA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* invRefAA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-
-  //start with reference patch
-  dfloat* refPatchInvA = *patchesInvA;
-  if (options.compareArgs("DISCRETIZATION","IPDG"))
-    BuildLocalIpdgPatchAxTet3D(elliptic, refMesh, lambda, MS, 0, refPatchInvA);
-  else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-    BuildLocalContinuousPatchAxTet3D(elliptic, refMesh, lambda, 0, refPatchInvA);
-
-  matrixInverse(mesh->Np, refPatchInvA);
-
-  dfloat maxRate = 0.;
-  dfloat maxCond = 0.;
-
-  // loop over all elements
-  for(dlong eM = 0; eM < mesh->Nelements; ++eM) {
-    //build the patch A matrix for this element
-    if (options.compareArgs("DISCRETIZATION","IPDG"))
-      BuildLocalIpdgPatchAxTet3D(elliptic, mesh, lambda, MS, eM, refPatchInvA);
-    else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-      BuildLocalContinuousPatchAxTet3D(elliptic, refMesh, lambda, eM, refPatchInvA);
-
-    dlong eP0 = mesh->EToE[eM * mesh->Nfaces + 0];
-    dlong eP1 = mesh->EToE[eM * mesh->Nfaces + 1];
-    dlong eP2 = mesh->EToE[eM * mesh->Nfaces + 2];
-    dlong eP3 = mesh->EToE[eM * mesh->Nfaces + 3];
-
-    int fP0 = mesh->EToF[eM * mesh->Nfaces + 0];
-    int fP1 = mesh->EToF[eM * mesh->Nfaces + 1];
-    int fP2 = mesh->EToF[eM * mesh->Nfaces + 2];
-    int fP3 = mesh->EToF[eM * mesh->Nfaces + 3];
-
-    if(eP0 >= 0 && eP1 >= 0 && eP2 >= 0 && eP3 >= 0) { //check if this is an interior patch
-      refPatchInvA = *patchesInvA;
-
-      //hit the patch with the reference inverse
-      for(int n = 0; n < mesh->Np; ++n)
-        for(int m = 0; m < mesh->Np; ++m) {
-          invRefAA[n * mesh->Np + m] = 0.;
-          for (int k = 0; k < mesh->Np; k++)
-            invRefAA[n * mesh->Np + m] += refPatchInvA[n * mesh->Np + k] * patchA[k * mesh->Np + m];
-        }
-
-      dfloat cond = matrixConditionNumber(mesh->Np,invRefAA);
-      dfloat rate = (sqrt(cond) - 1.) / (sqrt(cond) + 1.);
-
-      //printf("Element %d's conditioned patch reports cond = %g and rate = %g \n", eM, cond, rate);
-      maxRate = mymax(rate,maxRate);
-      maxCond = mymax(cond,maxCond);
-
-      if (rate < rateTolerance) {
-        (*patchesIndex)[eM] = 0;
-        refPatches++;
-        continue;
-      }
-    }
-    ++(*Npatches);
-    *patchesInvA = (dfloat*) realloc(*patchesInvA,
-                                     (*Npatches) * mesh->Np * mesh->Np * sizeof(dfloat));
-
-    matrixInverse(mesh->Np, patchA);
-
-    //copy inverse into patchesInvA
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        int id = ((*Npatches) - 1) * mesh->Np * mesh->Np + n * mesh->Np + m;
-        (*patchesInvA)[id] = patchA[n * mesh->Np + m];
-      }
-
-    (*patchesIndex)[eM] = (*Npatches) - 1;
-  }
-
-  printf("saving " dlongFormat " full patches\n",*Npatches);
-  printf("using " dlongFormat " reference patches\n", refPatches);
-  printf("Max condition number = %g, and slowest CG convergence rate = %g\n", maxCond, maxRate);
-
-  free(refMesh);
-  free(patchA);
-  free(invRefAA);
-  free(MS);
-}
-
-void ellipticBuildLocalPatchesHex3D(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance,
-                                    dlong* Npatches, dlong** patchesIndex, dfloat** patchesInvA)
-{
-  mesh_t* mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  // build some monolithic basis arrays
-  dfloat* B  = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* Br = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* Bs = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* Bt = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-
-  int mode = 0;
-  for(int nk = 0; nk < mesh->N + 1; ++nk)
-    for(int nj = 0; nj < mesh->N + 1; ++nj)
-      for(int ni = 0; ni < mesh->N + 1; ++ni) {
-        int node = 0;
-
-        for(int k = 0; k < mesh->N + 1; ++k)
-          for(int j = 0; j < mesh->N + 1; ++j)
-            for(int i = 0; i < mesh->N + 1; ++i) {
-              if(nk == k && nj == j && ni == i)
-                B[mode * mesh->Np + node] = 1;
-              if(nj == j && nk == k)
-                Br[mode * mesh->Np + node] = mesh->D[ni + mesh->Nq * i];
-              if(ni == i && nk == k)
-                Bs[mode * mesh->Np + node] = mesh->D[nj + mesh->Nq * j];
-              if(ni == i && nj == j)
-                Bt[mode * mesh->Np + node] = mesh->D[nk + mesh->Nq * k];
-
-              ++node;
-            }
-
-        ++mode;
-      }
-
-  //patch inverse storage
-  *patchesInvA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  *patchesIndex = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  //temp patch storage
-  dfloat* patchA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* invRefAA = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-
-  (*Npatches) = 1;
-  dlong refPatches = 0;
-
-  //build a mini mesh struct for the reference patch
-  mesh_t* refMesh = (mesh_t*) calloc(1,sizeof(mesh_t));
-  memcpy(refMesh,mesh,sizeof(mesh_t));
-
-  //vertices of reference patch
-  dfloat V1x = -1., V2x =  1., V3x =  1., V4x = -1., V5x = -1., V6x =  1., V7x =  1., V8x = -1.;
-  dfloat V1y = -1., V2y = -1., V3y =  1., V4y =  1., V5y = -1., V6y = -1., V7y =  1., V8y =  1.;
-  dfloat V1z = -1., V2z = -1., V3z = -1., V4z = -1., V5z =  1., V6z =  1., V7z =  1., V8z =  1.;
-
-  refMesh->Nelements = 1;
-
-  refMesh->EX = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-  refMesh->EY = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-  refMesh->EZ = (dfloat*) calloc(mesh->Nverts,sizeof(dfloat));
-
-  refMesh->EX[0] = V1x;
-  refMesh->EY[0] = V1y;
-  refMesh->EZ[0] = V1z;
-  refMesh->EX[1] = V2x;
-  refMesh->EY[1] = V2y;
-  refMesh->EZ[1] = V2z;
-  refMesh->EX[2] = V3x;
-  refMesh->EY[2] = V3y;
-  refMesh->EZ[2] = V3z;
-  refMesh->EX[3] = V4x;
-  refMesh->EY[3] = V4y;
-  refMesh->EZ[3] = V4z;
-  refMesh->EX[4] = V5x;
-  refMesh->EY[4] = V5y;
-  refMesh->EZ[4] = V5z;
-  refMesh->EX[5] = V6x;
-  refMesh->EY[5] = V6y;
-  refMesh->EZ[5] = V6z;
-  refMesh->EX[6] = V7x;
-  refMesh->EY[6] = V7y;
-  refMesh->EZ[6] = V7z;
-  refMesh->EX[7] = V8x;
-  refMesh->EY[7] = V8y;
-  refMesh->EZ[7] = V8z;
-
-  refMesh->EToV = (hlong*) calloc(mesh->Nverts, sizeof(hlong));
-
-  refMesh->EToV[0] = 0;
-  refMesh->EToV[1] = 1;
-  refMesh->EToV[2] = 2;
-  refMesh->EToV[3] = 3;
-  refMesh->EToV[4] = 4;
-  refMesh->EToV[5] = 5;
-  refMesh->EToV[6] = 6;
-  refMesh->EToV[7] = 7;
-
-  refMesh->EToB = (int*) calloc(mesh->Nfaces,sizeof(int));
-  for (int n = 0; n < mesh->Nfaces; n++) refMesh->EToB[n] = 0;
-
-  meshConnect(refMesh);
-  meshLoadReferenceNodesHex3D(refMesh, mesh->N);
-  meshPhysicalNodesHex3D(refMesh);
-  meshGeometricFactorsHex3D(refMesh);
-  meshConnectFaceNodes3D(refMesh);
-  meshSurfaceGeometricFactorsHex3D(refMesh);
-
-  //start with reference patch
-  dfloat* refPatchInvA = *patchesInvA;
-  if (options.compareArgs("DISCRETIZATION","IPDG"))
-    BuildLocalIpdgPatchAxHex3D(elliptic, refMesh, lambda, B,Br,Bs,Bt, 0, refPatchInvA);
-  else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-    BuildLocalContinuousPatchAxHex3D(elliptic, refMesh, lambda,  0, B,Br,Bs,Bt, refPatchInvA);
-  matrixInverse(mesh->Np, refPatchInvA);
-
-  // loop over all elements
-  for(dlong eM = 0; eM < mesh->Nelements; ++eM) {
-    //build the patch A matrix for this element
-    if (options.compareArgs("DISCRETIZATION","IPDG"))
-      BuildLocalIpdgPatchAxHex3D(elliptic, mesh, lambda, B,Br,Bs,Bt, eM,  refPatchInvA);
-    else if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-      BuildLocalContinuousPatchAxHex3D(elliptic, mesh, lambda,  eM, B,Br,Bs,Bt, refPatchInvA);
-
-    dlong eP0 = mesh->EToE[eM * mesh->Nfaces + 0];
-    dlong eP1 = mesh->EToE[eM * mesh->Nfaces + 1];
-    dlong eP2 = mesh->EToE[eM * mesh->Nfaces + 2];
-    dlong eP3 = mesh->EToE[eM * mesh->Nfaces + 3];
-    dlong eP4 = mesh->EToE[eM * mesh->Nfaces + 4];
-    dlong eP5 = mesh->EToE[eM * mesh->Nfaces + 5];
-
-    int fP0 = mesh->EToF[eM * mesh->Nfaces + 0];
-    int fP1 = mesh->EToF[eM * mesh->Nfaces + 1];
-    int fP2 = mesh->EToF[eM * mesh->Nfaces + 2];
-    int fP3 = mesh->EToF[eM * mesh->Nfaces + 3];
-    int fP4 = mesh->EToF[eM * mesh->Nfaces + 4];
-    int fP5 = mesh->EToF[eM * mesh->Nfaces + 5];
-
-    if(eP0 >= 0 && eP1 >= 0 && eP2 >= 0 && eP3 >= 0 && eP4 >= 0 && eP5 >= 0) { //check if this is an interior patch
-      refPatchInvA = *patchesInvA;
-
-      //hit the patch with the reference inverse
-      for(int n = 0; n < mesh->Np; ++n)
-        for(int m = 0; m < mesh->Np; ++m) {
-          invRefAA[n * mesh->Np + m] = 0.;
-          for (int k = 0; k < mesh->Np; k++)
-            invRefAA[n * mesh->Np + m] += refPatchInvA[n * mesh->Np + k] * patchA[k * mesh->Np + m];
-        }
-
-      dfloat cond = matrixConditionNumber(mesh->Np,invRefAA);
-      dfloat rate = (sqrt(cond) - 1.) / (sqrt(cond) + 1.);
-
-      // printf("Element %d's conditioned patch reports cond = %g and rate = %g \n", eM, cond, rate);
-
-      if (rate < rateTolerance) {
-        (*patchesIndex)[eM] = 0;
-        refPatches++;
-        continue;
-      }
-    }
-    ++(*Npatches);
-    *patchesInvA = (dfloat*) realloc(*patchesInvA,
-                                     (*Npatches) * mesh->Np * mesh->Np * sizeof(dfloat));
-
-    matrixInverse(mesh->Np, patchA);
-
-    //copy inverse into patchesInvA
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        dlong id = ((*Npatches) - 1) * mesh->Np * mesh->Np + n * mesh->Np + m;
-        (*patchesInvA)[id] = patchA[n * mesh->Np + m];
-      }
-
-    (*patchesIndex)[eM] = (*Npatches) - 1;
-  }
-
-  printf("saving " dlongFormat " full patches\n",*Npatches);
-  printf("using " dlongFormat " reference patches\n", refPatches);
-
-  free(refMesh);
-  free(patchA);
-  free(invRefAA);
-  free(B);
-  free(Br);
-  free(Bs);
-}
-
-//returns the ipdg patch A matrix for element eM
-void BuildLocalIpdgPatchAxTri2D(elliptic_t* elliptic,
-                                mesh_t* mesh,
-                                int basisNp,
-                                dfloat* basis,
-                                dfloat lambda,
-                                dfloat* MS,
-                                dlong eM,
-                                dfloat* A)
-{
-  dlong vbase = eM * mesh->Nvgeo;
-  dfloat drdx = mesh->vgeo[vbase + RXID];
-  dfloat drdy = mesh->vgeo[vbase + RYID];
-  dfloat dsdx = mesh->vgeo[vbase + SXID];
-  dfloat dsdy = mesh->vgeo[vbase + SYID];
-  dfloat J = mesh->vgeo[vbase + JID];
-
-  dfloat* Ae = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-
-  /* start with stiffness matrix  */
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Np; ++m) {
-      Ae[n * mesh->Np + m]  = J * lambda * mesh->MM[n * mesh->Np + m];
-      Ae[n * mesh->Np + m] += J * drdx * drdx * mesh->Srr[n * mesh->Np + m];
-      Ae[n * mesh->Np + m] += J * drdx * dsdx * mesh->Srs[n * mesh->Np + m];
-      Ae[n * mesh->Np + m] += J * dsdx * drdx * mesh->Ssr[n * mesh->Np + m];
-      Ae[n * mesh->Np + m] += J * dsdx * dsdx * mesh->Sss[n * mesh->Np + m];
-
-      Ae[n * mesh->Np + m] += J * drdy * drdy * mesh->Srr[n * mesh->Np + m];
-      Ae[n * mesh->Np + m] += J * drdy * dsdy * mesh->Srs[n * mesh->Np + m];
-      Ae[n * mesh->Np + m] += J * dsdy * drdy * mesh->Ssr[n * mesh->Np + m];
-      Ae[n * mesh->Np + m] += J * dsdy * dsdy * mesh->Sss[n * mesh->Np + m];
-    }
-
-  //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m)
-        Ae[n * mesh->Np + m] += elliptic->allNeumannPenalty * elliptic->allNeumannScale *
-                                elliptic->allNeumannScale;
-  }
-
-  for (int fM = 0; fM < mesh->Nfaces; fM++) {
-    // load surface geofactors for this face
-    dlong sid = mesh->Nsgeo * (eM * mesh->Nfaces + fM);
-    dfloat nx = mesh->sgeo[sid + NXID];
-    dfloat ny = mesh->sgeo[sid + NYID];
-    dfloat sJ = mesh->sgeo[sid + SJID];
-    dfloat hinv = mesh->sgeo[sid + IHID];
-
-    int bc = mesh->EToB[fM + mesh->Nfaces * eM]; //raw boundary flag
-
-    dfloat penalty = elliptic->tau * hinv;
-
-    int bcD = 0, bcN = 0;
-    int bcType = 0;
-
-    if(bc > 0) bcType = elliptic->BCType[bc];        //find its type (Dirichlet/Neumann)
-
-    // this needs to be double checked (and the code where these are used)
-    if(bcType == 1) { // Dirichlet
-      bcD = 1;
-      bcN = 0;
-    } else if(bcType == 2) { // Neumann
-      bcD = 0;
-      bcN = 1;
-    }
-
-    // mass matrix for this face
-    dfloat* MSf = MS + fM * mesh->Nfp * mesh->Nfp;
-
-    // penalty term just involves face nodes
-    for(int n = 0; n < mesh->Nfp; ++n)
-      for(int m = 0; m < mesh->Nfp; ++m) {
-        int nM = mesh->faceNodes[fM * mesh->Nfp + n];
-        int mM = mesh->faceNodes[fM * mesh->Nfp + m];
-
-        // OP11 = OP11 + 0.5*( gtau*mmE )
-        dfloat MSfnm = sJ * MSf[n * mesh->Nfp + m];
-        Ae[nM * mesh->Np + mM] += 0.5 * (1. - bcN) * (1. + bcD) * penalty * MSfnm;
-      }
-
-    // now add differential surface terms
-    for(int n = 0; n < mesh->Nfp; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        int nM = mesh->faceNodes[fM * mesh->Nfp + n];
-
-        for(int i = 0; i < mesh->Nfp; ++i) {
-          int iM = mesh->faceNodes[fM * mesh->Nfp + i];
-
-          dfloat MSfni = sJ * MSf[n * mesh->Nfp + i]; // surface Jacobian built in
-
-          dfloat DxMim = drdx * mesh->Dr[iM * mesh->Np + m] + dsdx * mesh->Ds[iM * mesh->Np + m];
-          dfloat DyMim = drdy * mesh->Dr[iM * mesh->Np + m] + dsdy * mesh->Ds[iM * mesh->Np + m];
-
-          // OP11 = OP11 + 0.5*( - mmE*Dn1)
-          Ae[nM * mesh->Np + m] += -0.5 * nx * (1 + bcD) * (1 - bcN) * MSfni * DxMim;
-          Ae[nM * mesh->Np + m] += -0.5 * ny * (1 + bcD) * (1 - bcN) * MSfni * DyMim;
-        }
-      }
-
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Nfp; ++m) {
-        int mM = mesh->faceNodes[fM * mesh->Nfp + m];
-
-        for(int i = 0; i < mesh->Nfp; ++i) {
-          int iM = mesh->faceNodes[fM * mesh->Nfp + i];
-
-          dfloat MSfim = sJ * MSf[i * mesh->Nfp + m];
-
-          dfloat DxMin = drdx * mesh->Dr[iM * mesh->Np + n] + dsdx * mesh->Ds[iM * mesh->Np + n];
-          dfloat DyMin = drdy * mesh->Dr[iM * mesh->Np + n] + dsdy * mesh->Ds[iM * mesh->Np + n];
-
-          // OP11 = OP11 + (- Dn1'*mmE );
-          Ae[n * mesh->Np + mM] +=  -0.5 * nx * (1 + bcD) * (1 - bcN) * DxMin * MSfim;
-          Ae[n * mesh->Np + mM] +=  -0.5 * ny * (1 + bcD) * (1 - bcN) * DyMin * MSfim;
-        }
-      }
-  }
-
-  for(int j = 0; j < basisNp; ++j)
-    for(int i = 0; i < basisNp; ++i) {
-      dfloat val = 0;
-      for (int n = 0; n < mesh->Np; n++)
-        for (int m = 0; m < mesh->Np; m++)
-          val += basis[n * basisNp + j] * Ae[n * mesh->Np + m] * basis[m * basisNp + i];
-
-      A[i + j * basisNp] = val;
-    }
-
-  free(Ae);
-}
-
-//returns the continuous C0 patch A matrix for element eM
-void BuildLocalContinuousPatchAxTri2D(elliptic_t* elliptic,
-                                      mesh_t* mesh,
-                                      dfloat lambda,
-                                      dlong eM,
-                                      dfloat* A)
-{
-  dlong gbase = eM * mesh->Nggeo;
-  dfloat Grr = mesh->ggeo[gbase + G00ID];
-  dfloat Grs = mesh->ggeo[gbase + G01ID];
-  dfloat Gss = mesh->ggeo[gbase + G11ID];
-  dfloat J   = mesh->ggeo[gbase + GWJID];
-
-  /* start with stiffness matrix  */
-  for(int n = 0; n < mesh->Np; ++n) {
-    if (elliptic->mapB[n + eM * mesh->Np] != 1) { //dont fill rows for masked nodes
-      for(int m = 0; m < mesh->Np; ++m) {
-        if (elliptic->mapB[m + eM * mesh->Np] != 1) {//dont fill rows for masked nodes
-          A[n * mesh->Np + m] = J * lambda * mesh->MM[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grr * mesh->Srr[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grs * mesh->Srs[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grs * mesh->Ssr[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Gss * mesh->Sss[m + n * mesh->Np];
-        } else {
-          A[n * mesh->Np + m] = 0;
-        }
-      }
-    } else {
-      A[n + n * mesh->Np] = 1; //just put a 1 so A is invertable
-    }
-  }
-
-  //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
-    for(int n = 0; n < mesh->Np; ++n)
-      if (elliptic->mapB[n + eM * mesh->Np] != 1) { //dont fill rows for masked nodes
-        for(int m = 0; m < mesh->Np; ++m) {
-          if (elliptic->mapB[m + eM * mesh->Np] == 1) continue; //skip masked nodes
-          A[n * mesh->Np + m] += elliptic->allNeumannPenalty * elliptic->allNeumannScale *
-                                 elliptic->allNeumannScale;
-        }
-      }
-  }
-}
-
-//returns the patch A matrix for element eM
-void BuildLocalIpdgPatchAxQuad2D(elliptic_t* elliptic, mesh_t* mesh, dfloat lambda,
-                                 dfloat* B, dfloat* Br, dfloat* Bs, dlong eM, dfloat* A)
-{
-  /* start with stiffness matrix  */
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Np; ++m) {
-      A[n * mesh->Np + m] = 0;
-
-      // (grad phi_n, grad phi_m)_{D^e}
-      for(int i = 0; i < mesh->Np; ++i) {
-        dlong base = eM * mesh->Np * mesh->Nvgeo + i;
-        dfloat drdx = mesh->vgeo[base + mesh->Np * RXID];
-        dfloat drdy = mesh->vgeo[base + mesh->Np * RYID];
-        dfloat dsdx = mesh->vgeo[base + mesh->Np * SXID];
-        dfloat dsdy = mesh->vgeo[base + mesh->Np * SYID];
-        dfloat JW   = mesh->vgeo[base + mesh->Np * JWID];
-
-        int idn = n * mesh->Np + i;
-        int idm = m * mesh->Np + i;
-        dfloat dlndx = drdx * Br[idn] + dsdx * Bs[idn];
-        dfloat dlndy = drdy * Br[idn] + dsdy * Bs[idn];
-        dfloat dlmdx = drdx * Br[idm] + dsdx * Bs[idm];
-        dfloat dlmdy = drdy * Br[idm] + dsdy * Bs[idm];
-        A[n * mesh->Np + m] += JW * (dlndx * dlmdx + dlndy * dlmdy);
-        A[n * mesh->Np + m] += lambda * JW * B[idn] * B[idm];
-      }
-
-      for (int fM = 0; fM < mesh->Nfaces; fM++)
-        // accumulate flux terms for negative and positive traces
-        for(int i = 0; i < mesh->Nfp; ++i) {
-          int vidM = mesh->faceNodes[i + fM * mesh->Nfp];
-
-          // grab vol geofacs at surface nodes
-          dlong baseM = eM * mesh->Np * mesh->Nvgeo + vidM;
-          dfloat drdxM = mesh->vgeo[baseM + mesh->Np * RXID];
-          dfloat drdyM = mesh->vgeo[baseM + mesh->Np * RYID];
-          dfloat dsdxM = mesh->vgeo[baseM + mesh->Np * SXID];
-          dfloat dsdyM = mesh->vgeo[baseM + mesh->Np * SYID];
-
-          // grab surface geometric factors
-          dlong base = mesh->Nsgeo * (eM * mesh->Nfp * mesh->Nfaces + fM * mesh->Nfp + i);
-          dfloat nx = mesh->sgeo[base + NXID];
-          dfloat ny = mesh->sgeo[base + NYID];
-          dfloat wsJ = mesh->sgeo[base + WSJID];
-          dfloat hinv = mesh->sgeo[base + IHID];
-
-          // form negative trace terms in IPDG
-          int idnM = n * mesh->Np + vidM;
-          int idmM = m * mesh->Np + vidM;
-
-          dfloat dlndxM = drdxM * Br[idnM] + dsdxM * Bs[idnM];
-          dfloat dlndyM = drdyM * Br[idnM] + dsdyM * Bs[idnM];
-          dfloat ndotgradlnM = nx * dlndxM + ny * dlndyM;
-          dfloat lnM = B[idnM];
-
-          dfloat dlmdxM = drdxM * Br[idmM] + dsdxM * Bs[idmM];
-          dfloat dlmdyM = drdyM * Br[idmM] + dsdyM * Bs[idmM];
-          dfloat ndotgradlmM = nx * dlmdxM + ny * dlmdyM;
-          dfloat lmM = B[idmM];
-
-          dfloat penalty = elliptic->tau * hinv;
-          int bc = mesh->EToB[fM + mesh->Nfaces * eM]; //raw boundary flag
-
-          int bcD = 0, bcN = 0;
-          int bcType = 0;
-
-          if(bc > 0) bcType = elliptic->BCType[bc];        //find its type (Dirichlet/Neumann)
-
-          // this needs to be double checked (and the code where these are used)
-          if(bcType == 1) { // Dirichlet
-            bcD = 1;
-            bcN = 0;
-          } else if(bcType == 2) { // Neumann
-            bcD = 0;
-            bcN = 1;
-          }
-
-          A[n * mesh->Np + m] += -0.5 * (1 + bcD) * (1 - bcN) * wsJ * lnM * ndotgradlmM;  // -(ln^-, N.grad lm^-)
-          A[n * mesh->Np + m] += -0.5 * (1 + bcD) * (1 - bcN) * wsJ * ndotgradlnM * lmM;  // -(N.grad ln^-, lm^-)
-          A[n * mesh->Np + m] += +0.5 * (1 + bcD) * (1 - bcN) * wsJ * penalty * lnM * lmM; // +((tau/h)*ln^-,lm^-)
-        }
-    }
-}
-
-void BuildLocalContinuousPatchAxQuad2D(elliptic_t* elliptic, mesh_t* mesh, dfloat lambda,
-                                       dlong eM, dfloat* B, dfloat* Br, dfloat* Bs, dfloat* A)
-{
-  for (int ny = 0; ny < mesh->Nq; ny++)
-    for (int nx = 0; nx < mesh->Nq; nx++) {
-      if (elliptic->mapB[nx + ny * mesh->Nq + eM * mesh->Np] != 1) {
-        for (int my = 0; my < mesh->Nq; my++)
-          for (int mx = 0; mx < mesh->Nq; mx++) {
-            if (elliptic->mapB[mx + my * mesh->Nq + eM * mesh->Np] == 1) continue;
-
-            int id;
-            int iid = (nx + ny * mesh->Nq) * mesh->Np + mx + my * mesh->Nq;
-            A[iid] = 0;
-
-            if (ny == my) {
-              for (int k = 0; k < mesh->Nq; k++) {
-                id = k + ny * mesh->Nq;
-                dfloat Grr = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G00ID * mesh->Np];
-
-                A[iid] += Grr * mesh->D[nx + k * mesh->Nq] * mesh->D[mx + k * mesh->Nq];
-              }
-            }
-
-            id = mx + ny * mesh->Nq;
-            dfloat Grs = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-            A[iid] += Grs * mesh->D[nx + mx * mesh->Nq] * mesh->D[my + ny * mesh->Nq];
-
-            id = nx + my * mesh->Nq;
-            dfloat Gsr = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-            A[iid] += Gsr * mesh->D[mx + nx * mesh->Nq] * mesh->D[ny + my * mesh->Nq];
-
-            if (nx == mx) {
-              for (int k = 0; k < mesh->Nq; k++) {
-                id = nx + k * mesh->Nq;
-                dfloat Gss = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G11ID * mesh->Np];
-
-                A[iid] += Gss * mesh->D[ny + k * mesh->Nq] * mesh->D[my + k * mesh->Nq];
-              }
-            }
-
-            if ((nx == mx) && (ny == my)) {
-              id = nx + ny * mesh->Nq;
-              dfloat JW = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + GWJID * mesh->Np];
-              A[iid] += JW * lambda;
-            }
-          }
-      } else {
-        int iid = (nx + ny * mesh->Nq) * mesh->Np + nx + ny * mesh->Nq;
-        A[iid] = 1; //just put a 1 so A is invertable
-      }
-    }
-
-  //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
-    for(int n = 0; n < mesh->Np; ++n)
-      if (elliptic->mapB[n + eM * mesh->Np] != 1) { //dont fill rows for masked nodes
-        for(int m = 0; m < mesh->Np; ++m) {
-          if (elliptic->mapB[m + eM * mesh->Np] == 1) continue; //skip masked nodes
-          A[n * mesh->Np + m] += elliptic->allNeumannPenalty * elliptic->allNeumannScale *
-                                 elliptic->allNeumannScale;
-        }
-      }
-  }
-}
-
-//returns the patch A matrix for element eM
-void BuildLocalIpdgPatchAxTet3D(elliptic_t* elliptic,
-                                mesh_t* mesh,
-                                dfloat lambda,
-                                dfloat* MS,
-                                dlong eM,
-                                dfloat* A)
-{
-  dlong vbase = eM * mesh->Nvgeo;
-  dfloat drdx = mesh->vgeo[vbase + RXID];
-  dfloat drdy = mesh->vgeo[vbase + RYID];
-  dfloat drdz = mesh->vgeo[vbase + RZID];
-  dfloat dsdx = mesh->vgeo[vbase + SXID];
-  dfloat dsdy = mesh->vgeo[vbase + SYID];
-  dfloat dsdz = mesh->vgeo[vbase + SZID];
-  dfloat dtdx = mesh->vgeo[vbase + TXID];
-  dfloat dtdy = mesh->vgeo[vbase + TYID];
-  dfloat dtdz = mesh->vgeo[vbase + TZID];
-  dfloat J = mesh->vgeo[vbase + JID];
-
-  dfloat G00 = drdx * drdx + drdy * drdy + drdz * drdz;
-  dfloat G01 = drdx * dsdx + drdy * dsdy + drdz * dsdz;
-  dfloat G02 = drdx * dtdx + drdy * dtdy + drdz * dtdz;
-
-  dfloat G10 = dsdx * drdx + dsdy * drdy + dsdz * drdz;
-  dfloat G11 = dsdx * dsdx + dsdy * dsdy + dsdz * dsdz;
-  dfloat G12 = dsdx * dtdx + dsdy * dtdy + dsdz * dtdz;
-
-  dfloat G20 = dtdx * drdx + dtdy * drdy + dtdz * drdz;
-  dfloat G21 = dtdx * dsdx + dtdy * dsdy + dtdz * dsdz;
-  dfloat G22 = dtdx * dtdx + dtdy * dtdy + dtdz * dtdz;
-
-  /* start with stiffness matrix  */
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Np; ++m) {
-      A[n * mesh->Np + m]  = J * lambda * mesh->MM[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G00 * mesh->Srr[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G01 * mesh->Srs[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G02 * mesh->Srt[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G10 * mesh->Ssr[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G11 * mesh->Sss[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G12 * mesh->Sst[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G20 * mesh->Str[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G21 * mesh->Sts[n * mesh->Np + m];
-      A[n * mesh->Np + m] += J * G22 * mesh->Stt[n * mesh->Np + m];
-    }
-
-  //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m)
-        A[n * mesh->Np + m] += elliptic->allNeumannPenalty * elliptic->allNeumannScale *
-                               elliptic->allNeumannScale;
-  }
-
-  for (int fM = 0; fM < mesh->Nfaces; fM++) {
-    // load surface geofactors for this face
-    dlong sid = mesh->Nsgeo * (eM * mesh->Nfaces + fM);
-    dfloat nx = mesh->sgeo[sid + NXID];
-    dfloat ny = mesh->sgeo[sid + NYID];
-    dfloat nz = mesh->sgeo[sid + NZID];
-    dfloat sJ = mesh->sgeo[sid + SJID];
-    dfloat hinv = mesh->sgeo[sid + IHID];
-
-    int bc = mesh->EToB[fM + mesh->Nfaces * eM]; //raw boundary flag
-
-    dfloat penalty = elliptic->tau * hinv;
-
-    int bcD = 0, bcN = 0;
-    int bcType = 0;
-
-    if(bc > 0) bcType = elliptic->BCType[bc];        //find its type (Dirichlet/Neumann)
-
-    // this needs to be double checked (and the code where these are used)
-    if(bcType == 1) { // Dirichlet
-      bcD = 1;
-      bcN = 0;
-    } else if(bcType == 2) { // Neumann
-      bcD = 0;
-      bcN = 1;
-    }
-
-    // mass matrix for this face
-    dfloat* MSf = MS + fM * mesh->Nfp * mesh->Nfp;
-
-    // penalty term just involves face nodes
-    for(int n = 0; n < mesh->Nfp; ++n)
-      for(int m = 0; m < mesh->Nfp; ++m) {
-        int nM = mesh->faceNodes[fM * mesh->Nfp + n];
-        int mM = mesh->faceNodes[fM * mesh->Nfp + m];
-
-        // OP11 = OP11 + 0.5*( gtau*mmE )
-        dfloat MSfnm = sJ * MSf[n * mesh->Nfp + m];
-        A[nM * mesh->Np + mM] += 0.5 * (1. - bcN) * (1. + bcD) * penalty * MSfnm;
-      }
-
-    // now add differential surface terms
-    for(int n = 0; n < mesh->Nfp; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        int nM = mesh->faceNodes[fM * mesh->Nfp + n];
-
-        for(int i = 0; i < mesh->Nfp; ++i) {
-          int iM = mesh->faceNodes[fM * mesh->Nfp + i];
-
-          dfloat MSfni = sJ * MSf[n * mesh->Nfp + i]; // surface Jacobian built in
-
-          dfloat DxMim = drdx * mesh->Dr[iM * mesh->Np + m] + dsdx * mesh->Ds[iM * mesh->Np + m] +
-                         dtdx * mesh->Dt[iM * mesh->Np + m];
-          dfloat DyMim = drdy * mesh->Dr[iM * mesh->Np + m] + dsdy * mesh->Ds[iM * mesh->Np + m] +
-                         dtdy * mesh->Dt[iM * mesh->Np + m];
-          dfloat DzMim = drdz * mesh->Dr[iM * mesh->Np + m] + dsdz * mesh->Ds[iM * mesh->Np + m] +
-                         dtdz * mesh->Dt[iM * mesh->Np + m];
-
-          // OP11 = OP11 + 0.5*( - mmE*Dn1)
-          A[nM * mesh->Np + m] += -0.5 * nx * (1 + bcD) * (1 - bcN) * MSfni * DxMim;
-          A[nM * mesh->Np + m] += -0.5 * ny * (1 + bcD) * (1 - bcN) * MSfni * DyMim;
-          A[nM * mesh->Np + m] += -0.5 * nz * (1 + bcD) * (1 - bcN) * MSfni * DzMim;
-        }
-      }
-
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Nfp; ++m) {
-        int mM = mesh->faceNodes[fM * mesh->Nfp + m];
-
-        for(int i = 0; i < mesh->Nfp; ++i) {
-          int iM = mesh->faceNodes[fM * mesh->Nfp + i];
-
-          dfloat MSfim = sJ * MSf[i * mesh->Nfp + m];
-
-          dfloat DxMin = drdx * mesh->Dr[iM * mesh->Np + n] + dsdx * mesh->Ds[iM * mesh->Np + n] +
-                         dtdx * mesh->Dt[iM * mesh->Np + n];
-          dfloat DyMin = drdy * mesh->Dr[iM * mesh->Np + n] + dsdy * mesh->Ds[iM * mesh->Np + n] +
-                         dtdy * mesh->Dt[iM * mesh->Np + n];
-          dfloat DzMin = drdz * mesh->Dr[iM * mesh->Np + n] + dsdz * mesh->Ds[iM * mesh->Np + n] +
-                         dtdz * mesh->Dt[iM * mesh->Np + n];
-
-          // OP11 = OP11 + (- Dn1'*mmE );
-          A[n * mesh->Np + mM] +=  -0.5 * nx * (1 + bcD) * (1 - bcN) * DxMin * MSfim;
-          A[n * mesh->Np + mM] +=  -0.5 * ny * (1 + bcD) * (1 - bcN) * DyMin * MSfim;
-          A[n * mesh->Np + mM] +=  -0.5 * nz * (1 + bcD) * (1 - bcN) * DzMin * MSfim;
-        }
-      }
-  }
-}
-
-//returns the continuous C0 patch A matrix for element eM
-void BuildLocalContinuousPatchAxTet3D(elliptic_t* elliptic,
-                                      mesh_t* mesh,
-                                      dfloat lambda,
-                                      dlong eM,
-                                      dfloat* A)
-{
-  dlong gbase = eM * mesh->Nggeo;
-  dfloat Grr = mesh->ggeo[gbase + G00ID];
-  dfloat Grs = mesh->ggeo[gbase + G01ID];
-  dfloat Grt = mesh->ggeo[gbase + G02ID];
-  dfloat Gss = mesh->ggeo[gbase + G11ID];
-  dfloat Gst = mesh->ggeo[gbase + G12ID];
-  dfloat Gtt = mesh->ggeo[gbase + G22ID];
-  dfloat J   = mesh->ggeo[gbase + GWJID];
-
-  /* start with stiffness matrix  */
-  for(int n = 0; n < mesh->Np; ++n) {
-    if (elliptic->mapB[n + eM * mesh->Np] != 1) { //dont fill rows for masked nodes
-      for(int m = 0; m < mesh->Np; ++m) {
-        if (elliptic->mapB[m + eM * mesh->Np] != 1) {//dont fill rows for masked nodes
-          A[n * mesh->Np + m] = J * lambda * mesh->MM[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grr * mesh->Srr[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grs * mesh->Srs[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grt * mesh->Srt[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grs * mesh->Ssr[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Gss * mesh->Sss[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Gst * mesh->Sst[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Grt * mesh->Str[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Gst * mesh->Sts[m + n * mesh->Np];
-          A[n * mesh->Np + m] += Gtt * mesh->Stt[m + n * mesh->Np];
-        } else {
-          A[n * mesh->Np + m] = 0;
-        }
-      }
-    } else {
-      A[n + n * mesh->Np] = 1; //just put a 1 so A is invertable
-    }
-  }
-
-  //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
-    for(int n = 0; n < mesh->Np; ++n)
-      if (elliptic->mapB[n + eM * mesh->Np] != 1) { //dont fill rows for masked nodes
-        for(int m = 0; m < mesh->Np; ++m) {
-          if (elliptic->mapB[m + eM * mesh->Np] == 1) continue; //skip masked nodes
-          A[n * mesh->Np + m] += elliptic->allNeumannPenalty * elliptic->allNeumannScale *
-                                 elliptic->allNeumannScale;
-        }
-      }
-  }
-}
-
-//returns the patch A matrix for element eM
-void BuildLocalIpdgPatchAxHex3D(elliptic_t* elliptic, mesh_t* mesh, dfloat lambda,
-                                dfloat* B, dfloat* Br, dfloat* Bs, dfloat* Bt, dlong eM, dfloat* A)
-{
-  /* start with stiffness matrix  */
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Np; ++m) {
-      A[n * mesh->Np + m] = 0;
-
-      // (grad phi_n, grad phi_m)_{D^e}
-      for(int i = 0; i < mesh->Np; ++i) {
-        dlong base = eM * mesh->Np * mesh->Nvgeo + i;
-        dfloat drdx = mesh->vgeo[base + mesh->Np * RXID];
-        dfloat drdy = mesh->vgeo[base + mesh->Np * RYID];
-        dfloat drdz = mesh->vgeo[base + mesh->Np * RZID];
-        dfloat dsdx = mesh->vgeo[base + mesh->Np * SXID];
-        dfloat dsdy = mesh->vgeo[base + mesh->Np * SYID];
-        dfloat dsdz = mesh->vgeo[base + mesh->Np * SZID];
-        dfloat dtdx = mesh->vgeo[base + mesh->Np * TXID];
-        dfloat dtdy = mesh->vgeo[base + mesh->Np * TYID];
-        dfloat dtdz = mesh->vgeo[base + mesh->Np * TZID];
-        dfloat JW   = mesh->vgeo[base + mesh->Np * JWID];
-
-        int idn = n * mesh->Np + i;
-        int idm = m * mesh->Np + i;
-        dfloat dlndx = drdx * Br[idn] + dsdx * Bs[idn] + dtdx * Bt[idn];
-        dfloat dlndy = drdy * Br[idn] + dsdy * Bs[idn] + dtdy * Bt[idn];
-        dfloat dlndz = drdz * Br[idn] + dsdz * Bs[idn] + dtdz * Bt[idn];
-        dfloat dlmdx = drdx * Br[idm] + dsdx * Bs[idm] + dtdx * Bt[idm];
-        dfloat dlmdy = drdy * Br[idm] + dsdy * Bs[idm] + dtdy * Bt[idm];
-        dfloat dlmdz = drdz * Br[idm] + dsdz * Bs[idm] + dtdz * Bt[idm];
-        A[n * mesh->Np + m] += JW * (dlndx * dlmdx + dlndy * dlmdy + dlndz * dlmdz);
-        A[n * mesh->Np + m] += lambda * JW * B[idn] * B[idm];
-      }
-
-      for (int fM = 0; fM < mesh->Nfaces; fM++)
-        // accumulate flux terms for negative and positive traces
-        for(int i = 0; i < mesh->Nfp; ++i) {
-          int vidM = mesh->faceNodes[i + fM * mesh->Nfp];
-
-          // grab vol geofacs at surface nodes
-          dlong baseM = eM * mesh->Np * mesh->Nvgeo + vidM;
-          dfloat drdxM = mesh->vgeo[baseM + mesh->Np * RXID];
-          dfloat drdyM = mesh->vgeo[baseM + mesh->Np * RYID];
-          dfloat drdzM = mesh->vgeo[baseM + mesh->Np * RZID];
-          dfloat dsdxM = mesh->vgeo[baseM + mesh->Np * SXID];
-          dfloat dsdyM = mesh->vgeo[baseM + mesh->Np * SYID];
-          dfloat dsdzM = mesh->vgeo[baseM + mesh->Np * SZID];
-          dfloat dtdxM = mesh->vgeo[baseM + mesh->Np * TXID];
-          dfloat dtdyM = mesh->vgeo[baseM + mesh->Np * TYID];
-          dfloat dtdzM = mesh->vgeo[baseM + mesh->Np * TZID];
-
-          // grab surface geometric factors
-          dlong base = mesh->Nsgeo * (eM * mesh->Nfp * mesh->Nfaces + fM * mesh->Nfp + i);
-          dfloat nx = mesh->sgeo[base + NXID];
-          dfloat ny = mesh->sgeo[base + NYID];
-          dfloat nz = mesh->sgeo[base + NZID];
-          dfloat wsJ = mesh->sgeo[base + WSJID];
-          dfloat hinv = mesh->sgeo[base + IHID];
-
-          // form negative trace terms in IPDG
-          int idnM = n * mesh->Np + vidM;
-          int idmM = m * mesh->Np + vidM;
-
-          dfloat dlndxM = drdxM * Br[idnM] + dsdxM * Bs[idnM] + dtdxM * Bt[idnM];
-          dfloat dlndyM = drdyM * Br[idnM] + dsdyM * Bs[idnM] + dtdyM * Bt[idnM];
-          dfloat dlndzM = drdzM * Br[idnM] + dsdzM * Bs[idnM] + dtdzM * Bt[idnM];
-          dfloat ndotgradlnM = nx * dlndxM + ny * dlndyM + nz * dlndzM;
-          dfloat lnM = B[idnM];
-
-          dfloat dlmdxM = drdxM * Br[idmM] + dsdxM * Bs[idmM] + dtdxM * Bt[idmM];
-          dfloat dlmdyM = drdyM * Br[idmM] + dsdyM * Bs[idmM] + dtdyM * Bt[idmM];
-          dfloat dlmdzM = drdzM * Br[idmM] + dsdzM * Bs[idmM] + dtdzM * Bt[idmM];
-          dfloat ndotgradlmM = nx * dlmdxM + ny * dlmdyM + nz * dlmdzM;
-          dfloat lmM = B[idmM];
-
-          dfloat penalty = elliptic->tau * hinv;
-          int bc = mesh->EToB[fM + mesh->Nfaces * eM]; //raw boundary flag
-
-          int bcD = 0, bcN = 0;
-          int bcType = 0;
-
-          if(bc > 0) bcType = elliptic->BCType[bc];        //find its type (Dirichlet/Neumann)
-
-          // this needs to be double checked (and the code where these are used)
-          if(bcType == 1) { // Dirichlet
-            bcD = 1;
-            bcN = 0;
-          } else if(bcType == 2) { // Neumann
-            bcD = 0;
-            bcN = 1;
-          }
-
-          A[n * mesh->Np + m] += -0.5 * (1 + bcD) * (1 - bcN) * wsJ * lnM * ndotgradlmM;  // -(ln^-, N.grad lm^-)
-          A[n * mesh->Np + m] += -0.5 * (1 + bcD) * (1 - bcN) * wsJ * ndotgradlnM * lmM;  // -(N.grad ln^-, lm^-)
-          A[n * mesh->Np + m] += +0.5 * (1 + bcD) * (1 - bcN) * wsJ * penalty * lnM * lmM; // +((tau/h)*ln^-,lm^-)
-        }
-    }
-}
-
-void BuildLocalContinuousPatchAxHex3D(elliptic_t* elliptic,
-                                      mesh_t* mesh,
-                                      dfloat lambda,
-                                      dlong eM,
-                                      dfloat* B,
-                                      dfloat* Br,
-                                      dfloat* Bs,
-                                      dfloat* Bt,
-                                      dfloat* A)
-{
-  for (int nz = 0; nz < mesh->Nq; nz++)
-    for (int ny = 0; ny < mesh->Nq; ny++)
-      for (int nx = 0; nx < mesh->Nq; nx++) {
-        int idn = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-        if (elliptic->mapB[idn + eM * mesh->Np] != 1) {
-          for (int mz = 0; mz < mesh->Nq; mz++)
-            for (int my = 0; my < mesh->Nq; my++)
-              for (int mx = 0; mx < mesh->Nq; mx++) {
-                int idm = mx + my * mesh->Nq + mz * mesh->Nq * mesh->Nq;
-                int iid = idn * mesh->Np + idm;
-                if (elliptic->mapB[idm + eM * mesh->Np] == 1) continue;
-
-                int id;
-                A[iid] = 0;
-
-                if ((ny == my) && (nz == mz)) {
-                  for (int k = 0; k < mesh->Nq; k++) {
-                    id = k + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                    dfloat Grr = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G00ID * mesh->Np];
-
-                    A[iid] += Grr * mesh->D[nx + k * mesh->Nq] * mesh->D[mx + k * mesh->Nq];
-                  }
-                }
-
-                if (nz == mz) {
-                  id = mx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Grs = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-                  A[iid] += Grs * mesh->D[nx + mx * mesh->Nq] * mesh->D[my + ny * mesh->Nq];
-
-                  id = nx + my * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Gsr = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G01ID * mesh->Np];
-                  A[iid] += Gsr * mesh->D[mx + nx * mesh->Nq] * mesh->D[ny + my * mesh->Nq];
-                }
-
-                if (ny == my) {
-                  id = mx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Grt = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G02ID * mesh->Np];
-                  A[iid] += Grt * mesh->D[nx + mx * mesh->Nq] * mesh->D[mz + nz * mesh->Nq];
-
-                  id = nx + ny * mesh->Nq + mz * mesh->Nq * mesh->Nq;
-                  dfloat Gst = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G02ID * mesh->Np];
-                  A[iid] += Gst * mesh->D[mx + nx * mesh->Nq] * mesh->D[nz + mz * mesh->Nq];
-                }
-
-                if ((nx == mx) && (nz == mz)) {
-                  for (int k = 0; k < mesh->Nq; k++) {
-                    id = nx + k * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                    dfloat Gss = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G11ID * mesh->Np];
-
-                    A[iid] += Gss * mesh->D[ny + k * mesh->Nq] * mesh->D[my + k * mesh->Nq];
-                  }
-                }
-
-                if (nx == mx) {
-                  id = nx + my * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat Gst = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G12ID * mesh->Np];
-                  A[iid] += Gst * mesh->D[ny + my * mesh->Nq] * mesh->D[mz + nz * mesh->Nq];
-
-                  id = nx + ny * mesh->Nq + mz * mesh->Nq * mesh->Nq;
-                  dfloat Gts = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G12ID * mesh->Np];
-                  A[iid] += Gts * mesh->D[my + ny * mesh->Nq] * mesh->D[nz + mz * mesh->Nq];
-                }
-
-                if ((nx == mx) && (ny == my)) {
-                  for (int k = 0; k < mesh->Nq; k++) {
-                    id = nx + ny * mesh->Nq + k * mesh->Nq * mesh->Nq;
-                    dfloat Gtt = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + G22ID * mesh->Np];
-
-                    A[iid] += Gtt * mesh->D[nz + k * mesh->Nq] * mesh->D[mz + k * mesh->Nq];
-                  }
-                }
-
-                if ((nx == mx) && (ny == my) && (nz == mz)) {
-                  id = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq;
-                  dfloat JW = mesh->ggeo[eM * mesh->Np * mesh->Nggeo + id + GWJID * mesh->Np];
-                  A[iid] += JW * lambda;
-                }
-              }
-        } else {
-          int iid = idn * mesh->Np + idn;
-          A[iid] = 1; //just put a 1 so A is invertable
-        }
-      }
-
-  //add the rank boost for the allNeumann Poisson problem
-  if (elliptic->allNeumann) {
-    for(int n = 0; n < mesh->Np; ++n)
-      if (elliptic->mapB[n + eM * mesh->Np] != 1) { //dont fill rows for masked nodes
-        for(int m = 0; m < mesh->Np; ++m) {
-          if (elliptic->mapB[m + eM * mesh->Np] == 1) continue; //skip masked nodes
-          A[n * mesh->Np + m] += elliptic->allNeumannPenalty * elliptic->allNeumannScale *
-                                 elliptic->allNeumannScale;
-        }
-      }
-  }
-}
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevelFine.c b/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevelFine.c
deleted file mode 100644
index 11a6c9e99..000000000
--- a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevelFine.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-elliptic_t* ellipticBuildMultigridLevelFine(elliptic_t* baseElliptic)
-{
-  elliptic_t* elliptic = new elliptic_t();
-  memcpy(elliptic, baseElliptic, sizeof(*baseElliptic));
-
-  const int serial = baseElliptic->options.compareArgs("THREAD MODEL", "SERIAL");
-
-  elliptic->var_coeff = 0;
-  elliptic->lambda = (dfloat*) calloc(elliptic->Nfields, sizeof(dfloat)); // enforce lambda = 0
-
-  mesh_t* mesh = elliptic->mesh;
-
-  if(!strstr(pfloatString,dfloatString)) {
-    mesh->o_ggeoPfloat = mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(pfloat));
-    mesh->o_DmatricesPfloat = mesh->device.malloc(mesh->Nq * mesh->Nq*sizeof(pfloat));
-    mesh->o_SmatricesPfloat = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(pfloat));
-
-    elliptic->copyDfloatToPfloatKernel(mesh->Nelements * mesh->Np * mesh->Nggeo,
-      elliptic->mesh->o_ggeoPfloat,
-      mesh->o_ggeo);
-    elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq,
-      elliptic->mesh->o_DmatricesPfloat,
-      mesh->o_Dmatrices);
-    elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq,
-      elliptic->mesh->o_SmatricesPfloat,
-      mesh->o_Smatrices);
-  }
-
-  char* suffix;
-  occa::properties kernelInfo = ellipticKernelInfo(mesh);
-
-  if(elliptic->elementType == TRIANGLES) {
-    if(elliptic->dim == 2)
-      suffix = strdup("Tri2D");
-    else
-      suffix = strdup("Tri3D");
-  }
-  if(elliptic->elementType == QUADRILATERALS) {
-    if(elliptic->dim == 2)
-      suffix = strdup("Quad2D");
-    else
-      suffix = strdup("Quad3D");
-  }
-  if(elliptic->elementType == TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(elliptic->elementType == HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
-
-  for (int r = 0; r < 2; r++) {
-    if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
-      kernelInfo["defines/" "p_blockSize"] = blockSize;
-
-      // add custom defines
-      kernelInfo["defines/" "p_NpP"] = (mesh->Np + mesh->Nfp * mesh->Nfaces);
-      kernelInfo["defines/" "p_Nverts"] = mesh->Nverts;
-
-      int Nmax = mymax(mesh->Np, mesh->Nfaces * mesh->Nfp);
-      kernelInfo["defines/" "p_Nmax"] = Nmax;
-
-      int maxNodes = mymax(mesh->Np, (mesh->Nfp * mesh->Nfaces));
-      kernelInfo["defines/" "p_maxNodes"] = maxNodes;
-
-      int NblockV = mymax(1,maxNthreads / mesh->Np); // works for CUDA
-      kernelInfo["defines/" "p_NblockV"] = NblockV;
-
-      int one = 1; //set to one for now. TODO: try optimizing over these
-      kernelInfo["defines/" "p_NnodesV"] = one;
-
-      int NblockS = mymax(1,maxNthreads / maxNodes); // works for CUDA
-      kernelInfo["defines/" "p_NblockS"] = NblockS;
-
-      int NblockP = mymax(1,maxNthreads / (4 * mesh->Np)); // get close to maxNthreads threads
-      kernelInfo["defines/" "p_NblockP"] = NblockP;
-
-      int NblockG;
-      if(mesh->Np <= 32) NblockG = ( 32 / mesh->Np );
-      else NblockG = mymax(1,maxNthreads / mesh->Np);
-      kernelInfo["defines/" "p_NblockG"] = NblockG;
-
-      kernelInfo["defines/" "p_eNfields"] = elliptic->Nfields;
-      kernelInfo["defines/p_Nalign"] = USE_OCCA_MEM_BYTE_ALIGN;
-
-/*
-      //add standard boundary functions
-      char* boundaryHeaderFileName;
-      if (elliptic->dim == 2)
-        boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h");
-      else if (elliptic->dim == 3)
-        boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h");
-      kernelInfo["includes"] += boundaryHeaderFileName;
-*/
-
-      occa::properties AxKernelInfo = kernelInfo;
-
-      sprintf(fileName, DELLIPTIC "/okl/ellipticAx%s.okl", suffix);
-      sprintf(kernelName, "ellipticAx%s", suffix);
-      if(serial) {
-        AxKernelInfo["okl/enabled"] = false;
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticSerialAx%s.c", suffix);
-      }
-      elliptic->AxKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
-
-      if(!strstr(pfloatString,dfloatString)){
-        AxKernelInfo["defines/" "dfloat"] = pfloatString;
-        sprintf(kernelName, "ellipticAx%s", suffix);
-        elliptic->AxPfloatKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
-        AxKernelInfo["defines/" "dfloat"] = dfloatString;
-      }
-
-      if(elliptic->elementType != HEXAHEDRA) {
-        sprintf(kernelName, "ellipticPartialAx%s", suffix);
-      }else{
-        if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR"))
-          sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix);
-        else
-          sprintf(kernelName, "ellipticPartialAx%s", suffix);
-      }
-      if(!serial) {
-        elliptic->partialAxKernel = mesh->device.buildKernel(fileName,kernelName,AxKernelInfo);
-        if(!strstr(pfloatString,dfloatString)) {
-          AxKernelInfo["defines/" "dfloat"] = pfloatString;
-          elliptic->partialAxPfloatKernel = mesh->device.buildKernel(fileName, kernelName, AxKernelInfo);
-          AxKernelInfo["defines/" "dfloat"] = dfloatString;
-        }
-      }
-
-/*
-      // only for Hex3D - cubature Ax
-      if(elliptic->elementType == HEXAHEDRA) {
-        sprintf(fileName,  DELLIPTIC "/okl/ellipticCubatureAx%s.okl", suffix);
-
-        sprintf(kernelName, "ellipticCubaturePartialAx%s", suffix);
-        elliptic->partialCubatureAxKernel = mesh->device.buildKernel(fileName,
-                                                                     kernelName,
-                                                                     AxKernelInfo);
-      }
-*/
-    }
-
-    MPI_Barrier(mesh->comm);
-  }
-
-  return elliptic;
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticBuildOneRing.c b/src/libP/solvers/elliptic/src/ellipticBuildOneRing.c
deleted file mode 100644
index 666c8a972..000000000
--- a/src/libP/solvers/elliptic/src/ellipticBuildOneRing.c
+++ /dev/null
@@ -1,704 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-typedef struct
-{
-  hlong vertex;
-  hlong element;
-  hlong rank;
-  hlong rankN;    // neighbor rank
-  hlong sortTag;
-}vertex_t;
-
-// generic comparator
-int compareSortTag(const void* a,
-                   const void* b)
-{
-  vertex_t* va = (vertex_t*) a;
-  vertex_t* vb = (vertex_t*) b;
-
-  if(va->sortTag < vb->sortTag) return -1;
-  if(va->sortTag > vb->sortTag) return +1;
-
-  return 0;
-}
-
-// use this to sort list of elements to send to each neighbor rank
-int compareRankNElement(const void* a,
-                        const void* b)
-{
-  vertex_t* va = (vertex_t*) a;
-  vertex_t* vb = (vertex_t*) b;
-
-  if(va->rankN < vb->rankN) return -1;
-  if(va->rankN > vb->rankN) return +1;
-
-  if(va->element < vb->element) return -1;
-  if(va->element > vb->element) return +1;
-
-  return 0;
-}
-
-// start one ring exchange (for q)
-void ellipticOneRingExchangeStart(MPI_Comm &comm,
-                                  size_t Nbytes, // message size per element
-                                  hlong NoneRingSendTotal,
-                                  int* NoneRingSend,
-                                  void* sendBuffer, // temporary buffer
-                                  MPI_Request* sendRequests,
-                                  int* NsendMessages,
-                                  hlong NoneRingRecvTotal,
-                                  int* NoneRingRecv,
-                                  void* recvBuffer,
-                                  MPI_Request* recvRequests,
-                                  int* NrecvMessages)
-{
-  // WATCH OUT - LOOPING OVER ALL RANKS BAD
-  if(NoneRingRecvTotal + NoneRingSendTotal > 0) {
-    // MPI info
-    int rank, size;
-    MPI_Comm_rank(comm, &rank);
-    MPI_Comm_size(comm, &size);
-
-    // count outgoing and incoming meshes
-    int tag = 999;
-
-    // initiate immediate send  and receives to each other process as needed
-    int sendOffset = 0, recvOffset = 0, sendMessage = 0, recvMessage = 0;
-    for(int r = 0; r < size; ++r)
-      if(r != rank) {
-        size_t recvCount = NoneRingRecv[r] * Nbytes;
-        if(recvCount) {
-          MPI_Irecv(((char*)recvBuffer) + recvOffset, recvCount, MPI_CHAR, r, tag,
-                    comm, recvRequests + recvMessage);
-          recvOffset += recvCount;
-          ++recvMessage;
-        }
-
-        size_t sendCount = NoneRingSend[r] * Nbytes;
-        if(sendCount) {
-          MPI_Isend(((char*)sendBuffer) + sendOffset, sendCount, MPI_CHAR, r, tag,
-                    comm, sendRequests + sendMessage);
-
-          sendOffset += sendCount;
-          ++sendMessage;
-        }
-      }
-
-    *NsendMessages = sendMessage;
-    *NrecvMessages = recvMessage;
-  }
-}
-
-void ellipticOneRingExchangeFinish(MPI_Comm &comm,
-                                   int NsendMessages,
-                                   MPI_Request* sendRequests,
-                                   int NrecvMessages,
-                                   MPI_Request* recvRequests)
-{
-  if(NrecvMessages) {
-    // Wait for all sent messages to have left and received messages to have arrived
-    MPI_Status* recvStatus = (MPI_Status*) calloc(NrecvMessages, sizeof(MPI_Status));
-
-    MPI_Waitall(NrecvMessages, recvRequests, recvStatus);
-
-    free(recvStatus);
-  }
-
-  if(NsendMessages) {
-    MPI_Status* sendStatus = (MPI_Status*) calloc(NsendMessages, sizeof(MPI_Status));
-
-    MPI_Waitall(NsendMessages, sendRequests, sendStatus);
-
-    free(sendStatus);
-  }
-}
-
-void ellipticOneRingExchange(MPI_Comm &comm,
-                             hlong Nelements,
-                             size_t Nbytes, // message size per element
-                             void* q,
-                             hlong NoneRingSendTotal,
-                             hlong* oneRingSendList,
-                             hlong* NoneRingSend,
-                             void* sendBuffer, // temporary buffer
-                             MPI_Request* sendRequests,
-                             hlong NoneRingRecvTotal,
-                             hlong* NoneRingRecv,
-                             MPI_Request* recvRequests,
-                             void* qOneRing)
-{
-  // do oneRing extract
-  for(hlong n = 0; n < NoneRingSendTotal; ++n) {
-    hlong e = oneRingSendList[n];
-    memcpy((char*)sendBuffer + n * Nbytes, (char*)q + e * Nbytes, Nbytes);
-  }
-
-  void* recvBuffer = (char*)qOneRing + Nelements * Nbytes; // fix later
-
-  int NsendMessages, NrecvMessages;
-
-  ellipticOneRingExchangeStart(comm,
-                               Nbytes,
-                               NoneRingSendTotal,
-                               NoneRingSend,
-                               sendBuffer,
-                               sendRequests,
-                               &NsendMessages,
-                               NoneRingRecvTotal,
-                               NoneRingRecv,
-                               recvBuffer,
-                               recvRequests,
-                               &NrecvMessages);
-
-  // copy from q to qOneRing while data in transit
-  memcpy(qOneRing, q, Nbytes * Nelements);
-
-  ellipticOneRingExchangeFinish(comm,
-                                NsendMessages, sendRequests,
-                                NrecvMessages, recvRequests);
-}
-
-// occa memory version
-void ellipticOneRingExchange(elliptic_t* elliptic,
-                             elliptic_t* elliptic1,
-                             size_t Nbytes, // message size per element
-                             occa::memory &o_q,
-                             occa::memory &o_qOneRing)
-{
-  // extract from original mesh
-  mesh_t* mesh = elliptic->mesh;
-  mesh_t* mesh1 = elliptic1->mesh;
-
-  precon_t* precon = elliptic->precon;
-
-  hlong NelementSend = precon->NoneRingSendTotal;
-  hlong NelementRecv = precon->NoneRingRecvTotal;
-
-  if(NelementSend + NelementRecv > 0) {
-    mesh->device.finish();
-    mesh1->device.finish();
-    mesh->haloExtractKernel(NelementSend, mesh->Np,
-                            precon->o_oneRingSendList,
-                            o_q, precon->o_oneRingSendBuffer);
-
-    precon->o_oneRingSendBuffer.copyTo(precon->oneRingSendBuffer, Nbytes * NelementSend, 0);
-
-    dfloat* sendBuffer = (dfloat*) precon->oneRingSendBuffer;
-    dfloat* recvBuffer = (dfloat*) precon->oneRingRecvBuffer;
-
-    MPI_Request* sendRequests = precon->oneRingSendRequests;
-    MPI_Request* recvRequests = precon->oneRingRecvRequests;
-
-    // do exchange via MPI
-
-    // count outgoing and incoming meshes
-    int tag = 999;
-
-    // initiate immediate send  and receives to each other process as needed
-    int sendOffset = 0, recvOffset = 0, sendMessage = 0, recvMessage = 0;
-    for(int r = 0; r < mesh->size; ++r)
-      if(r != mesh->rank) {
-        size_t recvCount = precon->NoneRingRecv[r] * Nbytes;
-        if(recvCount) {
-          MPI_Irecv(((char*)recvBuffer) + recvOffset,
-                    recvCount, MPI_CHAR, r, tag, mesh->comm, recvRequests + recvMessage);
-
-          recvOffset += recvCount;
-          ++recvMessage;
-        }
-
-        size_t sendCount = precon->NoneRingSend[r] * Nbytes;
-        if(sendCount) {
-          MPI_Isend(((char*)sendBuffer) + sendOffset,
-                    sendCount, MPI_CHAR, r, tag, mesh->comm, sendRequests + sendMessage);
-
-          sendOffset += sendCount;
-          ++sendMessage;
-        }
-      }
-
-    MPI_Status* sendStatus = (MPI_Status*) calloc(sendMessage, sizeof(MPI_Status));
-    MPI_Status* recvStatus = (MPI_Status*) calloc(recvMessage, sizeof(MPI_Status));
-
-    MPI_Waitall(recvMessage, recvRequests, recvStatus);
-    MPI_Waitall(sendMessage, sendRequests, sendStatus);
-
-    free(recvStatus);
-    free(sendStatus);
-
-    mesh1->device.finish();
-
-    // copy incoming to end of o_qOneRing
-    o_qOneRing.copyFrom(recvBuffer, Nbytes * NelementRecv, Nbytes * mesh->Nelements); // offset into end of oneRing
-  }
-
-  mesh1->device.finish();
-
-  // copy core
-  o_qOneRing.copyFrom(o_q, Nbytes * mesh->Nelements, 0);
-}
-
-// build one ring including MPI exchange information
-
-void ellipticBuildOneRing(elliptic_t* elliptic, dfloat lambda, occa::properties &kernelInfo)
-{
-  mesh_t* mesh = elliptic->mesh;
-
-  //  occa::properties kernelInfo = ellipticKernelInfo(mesh);
-
-  vertex_t* vertexSendList = (vertex_t*) calloc(mesh->Nelements * mesh->Nverts, sizeof(vertex_t));
-
-  hlong* vertexSendCounts = (hlong*) calloc(mesh->size, sizeof(hlong));
-  hlong* vertexRecvCounts = (hlong*) calloc(mesh->size, sizeof(hlong));
-
-  hlong cnt = 0;
-  for(hlong e = 0; e < mesh->Nelements; ++e)
-    for(int v = 0; v < mesh->Nverts; ++v) {
-      vertexSendList[cnt].vertex = mesh->EToV[e * mesh->Nverts + v];
-      vertexSendList[cnt].element = e;
-      vertexSendList[cnt].rank = mesh->rank;
-      vertexSendList[cnt].rankN = mesh->rank;
-
-      vertexSendList[cnt].sortTag = vertexSendList[cnt].vertex % mesh->size;
-      ++vertexSendCounts[vertexSendList[cnt].sortTag];
-      ++cnt;
-    }
-
-  // sort based on sortTag (=vertex%size)
-  qsort(vertexSendList, cnt, sizeof(vertex_t), compareSortTag);
-
-  // send sortTagCounts (hackety)
-  MPI_Alltoall(vertexSendCounts, 1, MPI_HLONG,
-               vertexRecvCounts, 1, MPI_HLONG,
-               mesh->comm);
-
-  // exchange vertices
-  hlong* vertexSendDispls = (hlong*) calloc(mesh->size + 1, sizeof(hlong));
-  hlong* vertexRecvDispls = (hlong*) calloc(mesh->size + 1, sizeof(hlong));
-  hlong NvertexSend = 0;
-  hlong NvertexRecv = 0;
-  for(int r = 0; r < mesh->size; ++r) {
-    NvertexSend += vertexSendCounts[r];
-    NvertexRecv += vertexRecvCounts[r];
-
-    vertexSendCounts[r] *= sizeof(vertex_t); // hack-hack-hack
-    vertexRecvCounts[r] *= sizeof(vertex_t); // hack-hack-hack
-
-    vertexSendDispls[r + 1] = vertexSendDispls[r] + vertexSendCounts[r];
-    vertexRecvDispls[r + 1] = vertexRecvDispls[r] + vertexRecvCounts[r];
-  }
-
-  // hack-hack-hack
-  vertex_t* vertexRecvList = (vertex_t*) calloc(NvertexRecv, sizeof(vertex_t));
-
-  MPI_Alltoallv(vertexSendList, vertexSendCounts, vertexSendDispls, MPI_CHAR,
-                vertexRecvList, vertexRecvCounts, vertexRecvDispls, MPI_CHAR,
-                mesh->comm);
-
-  for(int v = 0; v < NvertexRecv; ++v)
-    vertexRecvList[v].sortTag = vertexRecvList[v].vertex;
-
-  // sort received vertex based on sortTag (=vertex number)
-  qsort(vertexRecvList, NvertexRecv, sizeof(vertex_t), compareSortTag);
-
-  // count number of unique received vertices
-  hlong NvertexUniqueRecv = (NvertexRecv > 0) ? 1:0;
-  for(hlong n = 1; n < NvertexRecv; ++n)
-    if(compareSortTag(vertexRecvList + n, vertexRecvList + n - 1) != 0) // new vertex
-      ++NvertexUniqueRecv;
-
-  // find offset of the start of each new unique vertex  in sorted list
-  hlong* vertexUniqueRecvOffsets = (hlong*) calloc(NvertexUniqueRecv + 1, sizeof(hlong));
-
-  cnt = 1;
-  vertexUniqueRecvOffsets[0] = 0;
-  for(hlong n = 1; n < NvertexRecv; ++n)
-    if(compareSortTag(vertexRecvList + n, vertexRecvList + n - 1) != 0) { // new vertex
-      vertexUniqueRecvOffsets[cnt] = n;
-      ++cnt;
-    }
-  vertexUniqueRecvOffsets[cnt] = NvertexRecv; // cap at end
-
-  // now count how many vertices to send to each rank
-  hlong* vertexOneRingSendCounts = (hlong*) calloc(mesh->size, sizeof(hlong));
-  hlong Ntotal = 0;
-  for(hlong n = 0; n < NvertexUniqueRecv; ++n) {
-    hlong start = vertexUniqueRecvOffsets[n];
-    hlong end   = vertexUniqueRecvOffsets[n + 1];
-
-    int NuniqueRecvMultiplicity = end - start;
-    for(hlong m = start; m < end; ++m) {
-      vertexOneRingSendCounts[vertexRecvList[m].rank]
-        += NuniqueRecvMultiplicity; // watch out for this
-      Ntotal += NuniqueRecvMultiplicity;
-    }
-  }
-
-  vertex_t* vertexOneRingSendList = (vertex_t*) calloc(Ntotal, sizeof(vertex_t));
-  cnt = 0;
-  for(hlong n = 0; n < NvertexUniqueRecv; ++n) {
-    hlong start = vertexUniqueRecvOffsets[n];
-    hlong end   = vertexUniqueRecvOffsets[n + 1];
-
-    for(hlong v1 = start; v1 < end; ++v1) // vertex v1 to be sent back with list of conns
-      for(hlong v2 = start; v2 < end; ++v2) {
-        vertexOneRingSendList[cnt] = vertexRecvList[v1];
-        vertexOneRingSendList[cnt].rankN    = vertexRecvList[v2].rank;
-
-        vertexOneRingSendList[cnt].sortTag  = vertexRecvList[v1].rank;
-        ++cnt;
-      }
-  }
-
-  hlong NvertexOneRingSend = cnt;
-
-  // sort OneRing send list based on sort rank (=destination tag)
-  qsort(vertexOneRingSendList, NvertexOneRingSend, sizeof(vertex_t), compareSortTag);   // check qsort counts
-
-  // now figure out how many oneRing vertices to expect
-  hlong* vertexOneRingRecvCounts = (hlong*) calloc(mesh->size, sizeof(hlong));
-  MPI_Alltoall(vertexOneRingSendCounts, 1, MPI_HLONG,
-               vertexOneRingRecvCounts, 1, MPI_HLONG,
-               mesh->comm);
-
-  // find displacements for
-  hlong* vertexOneRingSendDispls = (hlong*) calloc(mesh->size + 1, sizeof(hlong));
-  hlong* vertexOneRingRecvDispls = (hlong*) calloc(mesh->size + 1, sizeof(hlong));
-  hlong NvertexOneRingRecv = 0;
-
-  for(int r = 0; r < mesh->size; ++r) {
-    NvertexOneRingRecv += vertexOneRingRecvCounts[r];
-    vertexOneRingSendCounts[r] *= sizeof(vertex_t);
-    vertexOneRingRecvCounts[r] *= sizeof(vertex_t);
-    vertexOneRingSendDispls[r + 1] = vertexOneRingSendDispls[r] + vertexOneRingSendCounts[r];
-    vertexOneRingRecvDispls[r + 1] = vertexOneRingRecvDispls[r] + vertexOneRingRecvCounts[r];
-  }
-
-  vertex_t* vertexOneRingRecvList =
-    (vertex_t*) calloc(NvertexOneRingRecv, sizeof(vertex_t)); // hack-hack-hack
-
-  // send element lists to the relevant ranks
-  MPI_Alltoallv(vertexOneRingSendList, vertexOneRingSendCounts, vertexOneRingSendDispls, MPI_CHAR,
-                vertexOneRingRecvList, vertexOneRingRecvCounts, vertexOneRingRecvDispls, MPI_CHAR,
-                mesh->comm);
-
-  // finally we now have a list of all elements that we need to send to form the 1-ring (to rule them all)
-  vertex_t* vertexOneRingOut  = (vertex_t*) calloc(NvertexOneRingRecv, sizeof(vertex_t));
-  memcpy(vertexOneRingOut,  vertexOneRingRecvList, NvertexOneRingRecv * sizeof(vertex_t));
-
-  // sort the list by "neighbor rank then element"
-  qsort(vertexOneRingOut,  NvertexOneRingRecv, sizeof(vertex_t), compareRankNElement);
-
-  // remove elements connected to this rank from oneRing list
-  cnt = 0;
-  for(hlong v = 0; v < NvertexOneRingRecv; ++v)
-    if(vertexOneRingOut[v].rankN != mesh->rank) // only connect connections with off rank elements
-      vertexOneRingOut[cnt++] = vertexOneRingOut[v];
-
-  hlong NvertexOneRingOut = cnt;
-
-  // remove duplicate connections from oneRingInOut list
-  if(NvertexOneRingOut) {
-    cnt = 1; // assumes at least one oneRing element
-    for(hlong v = 1; v < NvertexOneRingOut; ++v)
-      if(!(vertexOneRingOut[v].element == vertexOneRingOut[cnt - 1].element
-           && vertexOneRingOut[v].rank == vertexOneRingOut[cnt - 1].rank
-           && vertexOneRingOut[v].rankN == vertexOneRingOut[cnt - 1].rankN
-           ))
-        vertexOneRingOut[cnt++] = vertexOneRingOut[v];
-    NvertexOneRingOut = cnt;
-  }
-
-  printf("NvertexOneRingOut = %d, Nelements = %d\n", NvertexOneRingOut, mesh->Nelements);
-
-  // next: put new stuff in elliptic
-  //-1. count how many elements send to each rankN
-  // 0. send count to each rankN
-  // 1. populate NoneRingExchanges[0:size),
-  // 4. adapt halo exchange to oneRingExchange
-  // 5. oneRingExchange: globalNumbers for gs stuff
-  // 3. set up the gs info  using exchange globalNumbers [ need to understand how to populate from the local elements on each rank to the oneRing ]
-  // 6. oneRingExchange: geofacs (ggeo)
-  // 7. build local continuous numbering and local global continuous numbering (see meshParallelConnectNodes)
-  // 8. o_qOneRing
-  // 9. how to precondition patch problem ?
-
-  hlong NoneRingSendTotal = NvertexOneRingOut; // should rename things above
-  hlong* oneRingSendList = (hlong*) calloc(NoneRingSendTotal + 1, sizeof(hlong));
-  hlong* NoneRingSend = (hlong*) calloc(mesh->size, sizeof(hlong));
-  hlong* NoneRingRecv = (hlong*) calloc(mesh->size, sizeof(hlong));
-
-  for(hlong e = 0; e < NoneRingSendTotal; ++e) {
-    vertex_t v = vertexOneRingOut[e];
-    oneRingSendList[e] = v.element;
-    ++NoneRingSend[v.rankN];
-  }
-
-  MPI_Alltoall(NoneRingSend, 1, MPI_HLONG,
-               NoneRingRecv, 1, MPI_HLONG, mesh->comm);
-
-  hlong NoneRingRecvTotal = 0;
-  for(int r = 0; r < mesh->size; ++r)
-    NoneRingRecvTotal += NoneRingRecv[r];
-
-  int maxNbytes = mesh->Np * sizeof(dfloat); // fingers crossed.
-  char* sendBuffer = (char*) calloc(maxNbytes * NoneRingSendTotal, sizeof(char));
-  char* recvBuffer = (char*) calloc(maxNbytes * NoneRingRecvTotal, sizeof(char));
-
-  MPI_Request* sendRequests = (MPI_Request*) calloc(mesh->size, sizeof(MPI_Request));
-  MPI_Request* recvRequests = (MPI_Request*) calloc(mesh->size, sizeof(MPI_Request));
-
-  //  mesh_t *mesh1 = new mesh_t[1];
-
-  mesh_t* mesh1 = (mesh_t*) calloc(1, sizeof(mesh_t)); // check
-
-  // single process communicator for mesh1
-  MPI_Comm_split(mesh->comm, mesh->rank, mesh->rank, &mesh1->comm);
-
-  MPI_Comm_rank(mesh1->comm, &mesh1->rank);
-  MPI_Comm_size(mesh1->comm, &mesh1->size);
-
-  mesh1->dim = mesh->dim;
-  mesh1->Nverts = mesh->Nverts;
-  mesh1->Nfaces = mesh->Nfaces;
-  mesh1->NfaceVertices = mesh->NfaceVertices;
-  mesh1->Nnodes = mesh->Nnodes;
-  mesh1->Nfields = mesh->Nfields;
-
-  mesh1->N   = mesh->N;
-
-  mesh1->faceVertices =
-    (int*) calloc(mesh1->NfaceVertices * mesh1->Nfaces, sizeof(int));
-
-  memcpy(mesh1->faceVertices, mesh->faceVertices, mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  mesh1->Nelements = mesh->Nelements + NoneRingRecvTotal;
-
-  mesh1->EX = (dfloat*) calloc(mesh1->Nelements * mesh1->Nverts, sizeof(dfloat));
-  mesh1->EY = (dfloat*) calloc(mesh1->Nelements * mesh1->Nverts, sizeof(dfloat));
-  mesh1->EZ = (dfloat*) calloc(mesh1->Nelements * mesh1->Nverts, sizeof(dfloat));
-  ellipticOneRingExchange(mesh->comm,
-                          mesh->Nelements, mesh->Nverts * sizeof(dfloat), mesh->EX,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, mesh1->EX);
-
-  ellipticOneRingExchange(mesh->comm,
-                          mesh->Nelements, mesh->Nverts * sizeof(dfloat), mesh->EY,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, mesh1->EY);
-
-  ellipticOneRingExchange(mesh->comm,
-                          mesh->Nelements, mesh->Nverts * sizeof(dfloat), mesh->EZ,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, mesh1->EZ);
-
-  mesh1->NboundaryFaces = mesh->NboundaryFaces;
-  mesh1->boundaryInfo =
-    (hlong*) calloc(mesh1->NboundaryFaces * (mesh1->NfaceVertices + 1), sizeof(hlong));
-  memcpy(mesh1->boundaryInfo,
-         mesh->boundaryInfo,
-         mesh1->NboundaryFaces * (mesh1->NfaceVertices + 1) * sizeof(hlong));
-
-  mesh1->EToV = (hlong*) calloc(mesh1->Nelements * mesh1->Nverts, sizeof(hlong));
-  ellipticOneRingExchange(mesh->comm,
-                          mesh->Nelements, mesh->Nverts * sizeof(hlong), mesh->EToV,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, mesh1->EToV);
-
-  meshParallelConnect(mesh1);
-
-  meshConnectBoundary(mesh1);
-
-  // correct bcs (replaces unconnected faces with Dirichlet)
-  for(hlong e = 0; e < mesh1->Nelements; ++e)
-    for(int f = 0; f < mesh1->Nfaces; ++f) {
-      hlong id = e * mesh1->Nfaces + f;
-      if(mesh1->EToE[id] == -1 &&
-         mesh1->EToB[id] == -1) {
-        mesh1->EToB[id] = 1; // hack to 1 assume Dirichlet
-        mesh1->EToE[id] = e; // hack to 1 assume Dirichlet
-      }
-    }
-
-  meshLoadReferenceNodesHex3D(mesh1, mesh1->N);
-
-  mesh1->x = (dfloat*) calloc(mesh1->Nelements * mesh1->Np, sizeof(dfloat));
-  mesh1->y = (dfloat*) calloc(mesh1->Nelements * mesh1->Np, sizeof(dfloat));
-  mesh1->z = (dfloat*) calloc(mesh1->Nelements * mesh1->Np, sizeof(dfloat));
-
-  ellipticOneRingExchange(mesh->comm, mesh->Nelements, mesh->Np * sizeof(dfloat), mesh->x,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, mesh1->x);
-
-  ellipticOneRingExchange(mesh->comm, mesh->Nelements, mesh->Np * sizeof(dfloat), mesh->y,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, mesh1->y);
-
-  ellipticOneRingExchange(mesh->comm,  mesh->Nelements, mesh->Np * sizeof(dfloat), mesh->z,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, mesh1->z);
-
-  // this is all vanilla HEX --->
-  meshGeometricFactorsHex3D(mesh1);
-
-  meshHaloSetup(mesh1); // nada
-
-  meshConnectFaceNodes3D(mesh1);
-
-  meshSurfaceGeometricFactorsHex3D(mesh1);
-
-  meshParallelConnectNodes(mesh1); // data
-  // <------
-
-  setupAide options1 = elliptic->options; // check this
-
-  // manually specify preconditioner for oneRing grid
-  //  options1.setArgs(string("PRECONDITIONER"),     string("MULTIGRID"));
-  // options1.setArgs(string("PRECONDITIONER"),     string("JACOBI"));/
-  //  options1.setArgs(string("MAXIMUM ITERATIONS"), string("1"));
-  //  options1.setArgs(string("FIXED ITERATION COUNT"), string("1"));
-
-  //options1.setArgs(string("PRECONDITIONER"), string("SEMFEM"));
-  //  options1.setArgs(string("POLYNOMIAL DEGREE"),  string("1"));
-  //options1.setArgs("PRECONDITIONER", "JACOBI");
-
-  //  occa::properties kernelInfo1 = kernelInfo;
-
-  options1.setArgs(string("KRYLOV SOLVER"),        string("PCG+FLEXIBLE"));
-  options1.setArgs(string("PRECONDITIONER"),       string("MULTIGRID"));
-  //  options1.setArgs(string("PRECONDITIONER"),       string("JACOBI"));
-  options1.setArgs(string("MAXIMUM ITERATIONS"),   string("2"));
-  options1.setArgs(string("FIXED ITERATION COUNT"),string("TRUE"));
-  options1.setArgs(string("PARALMOND CYCLE"),string("KCYCLE"));
-
-  mesh1->device = mesh->device; // check this
-#if 1
-  mesh1->defaultStream = mesh->defaultStream;
-  mesh1->dataStream = mesh->dataStream;
-  mesh1->computeStream = mesh->computeStream;
-  mesh1->device.setStream(mesh->defaultStream);
-#endif
-
-  meshOccaPopulateDevice3D(mesh1, options1, kernelInfo);
-
-  // set up
-  elliptic_t* elliptic1 = ellipticSetup(mesh1, lambda, kernelInfo, options1);
-
-  cout << "options1: " << elliptic1->options << endl;
-  //  cout << "options: " << elliptic->options << endl;
-
-  dfloat* ggeoNoJW = (dfloat*) calloc(mesh1->Np * mesh1->Nelements * 6,sizeof(dfloat));
-  for(int e = 0; e < mesh1->Nelements; ++e)
-    for(int n = 0; n < mesh1->Np; ++n) {
-      ggeoNoJW[e * mesh1->Np * 6 + n + 0 *
-               mesh1->Np] = mesh1->ggeo[e * mesh1->Np * mesh1->Nggeo + n + G00ID * mesh1->Np];
-      ggeoNoJW[e * mesh1->Np * 6 + n + 1 *
-               mesh1->Np] = mesh1->ggeo[e * mesh1->Np * mesh1->Nggeo + n + G01ID * mesh1->Np];
-      ggeoNoJW[e * mesh1->Np * 6 + n + 2 *
-               mesh1->Np] = mesh1->ggeo[e * mesh1->Np * mesh1->Nggeo + n + G02ID * mesh1->Np];
-      ggeoNoJW[e * mesh1->Np * 6 + n + 3 *
-               mesh1->Np] = mesh1->ggeo[e * mesh1->Np * mesh1->Nggeo + n + G11ID * mesh1->Np];
-      ggeoNoJW[e * mesh1->Np * 6 + n + 4 *
-               mesh1->Np] = mesh1->ggeo[e * mesh1->Np * mesh1->Nggeo + n + G12ID * mesh1->Np];
-      ggeoNoJW[e * mesh1->Np * 6 + n + 5 *
-               mesh1->Np] = mesh1->ggeo[e * mesh1->Np * mesh1->Nggeo + n + G22ID * mesh1->Np];
-    }
-
-  elliptic1->o_ggeoNoJW = mesh1->device.malloc(mesh1->Np * mesh1->Nelements * 6 * sizeof(dfloat),
-                                               ggeoNoJW);
-
-  // store the extended patch ring
-  elliptic->precon->ellipticOneRing = (void*) elliptic1;
-
-  elliptic->precon->NoneRingSendTotal = NoneRingSendTotal;
-  elliptic->precon->NoneRingRecvTotal = NoneRingRecvTotal;
-
-  elliptic->precon->oneRingSendList = oneRingSendList;
-
-  elliptic->precon->NoneRingSend = NoneRingSend;
-  elliptic->precon->NoneRingRecv = NoneRingRecv;
-
-  elliptic->precon->oneRingSendBuffer = sendBuffer;
-  elliptic->precon->oneRingRecvBuffer = recvBuffer;
-
-  elliptic->precon->oneRingSendRequests = sendRequests;
-  elliptic->precon->oneRingRecvRequests = recvRequests;
-
-#if 0
-
-  ellipticOneRingDiagnostics(elliptic, elliptic1, lambda);
-
-#endif
-
-  // build gs op to gather all contributions
-  hlong* globalNums = (hlong*) calloc(mesh1->Nelements * mesh1->Np, sizeof(hlong));
-
-  ellipticOneRingExchange(mesh->comm,
-                          mesh->Nelements, mesh->Np * sizeof(hlong), mesh->globalIds,
-                          NoneRingSendTotal, oneRingSendList, NoneRingSend, sendBuffer,
-                          sendRequests,
-                          NoneRingRecvTotal, NoneRingRecv, recvRequests, globalNums);
-
-  elliptic->precon->oasOgs = ogsSetup(mesh1->Nelements * mesh1->Np,
-                                      globalNums,
-                                      mesh->comm,
-                                      1,
-                                      mesh->device);
-
-  elliptic->precon->o_oneRingSendList =
-    mesh->device.malloc(elliptic->precon->NoneRingSendTotal * sizeof(hlong),
-                        elliptic->precon->oneRingSendList);
-
-  elliptic->precon->o_oneRingSendBuffer =
-    mesh->device.malloc(elliptic->precon->NoneRingSendTotal * mesh->Np * sizeof(dfloat));
-
-  elliptic->precon->o_oneRingRecvBuffer =
-    mesh->device.malloc(elliptic->precon->NoneRingRecvTotal * mesh->Np * sizeof(dfloat));
-
-  free(vertexSendList);
-  free(vertexSendCounts);
-  free(vertexRecvCounts);
-  free(vertexSendDispls);
-  free(vertexRecvDispls);
-  free(vertexRecvList);
-  free(vertexUniqueRecvOffsets);
-  free(vertexOneRingSendCounts);
-  free(vertexOneRingSendList);
-  free(vertexOneRingRecvCounts);
-  free(vertexOneRingSendDispls);
-  free(vertexOneRingRecvDispls);
-  free(vertexOneRingRecvList);
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticMain.c b/src/libP/solvers/elliptic/src/ellipticMain.c
deleted file mode 100644
index bb92580e2..000000000
--- a/src/libP/solvers/elliptic/src/ellipticMain.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-int main(int argc, char** argv)
-{
-  // start up MPI
-  MPI_Init(&argc, &argv);
-
-  if(argc != 2) {
-    printf("usage: ./ellipticMain setupfile\n");
-
-    MPI_Finalize();
-    exit(-1);
-  }
-
-  // if argv > 2 then should load input data from argv
-  setupAide options(argv[1]);
-
-  // set up mesh stuff
-  string fileName;
-  int N, dim, elementType;
-
-  options.getArgs("POLYNOMIAL DEGREE", N);
-  options.getArgs("ELEMENT TYPE", elementType);
-  options.getArgs("MESH DIMENSION", dim);
-
-  mesh_t* mesh;
-
-  switch(elementType) {
-  case QUADRILATERALS: {
-    if(dim == 2) {
-      if(options.compareArgs("BOX DOMAIN", "TRUE"))
-        mesh = meshSetupBoxQuad2D(N, options);
-      else if(options.getArgs("MESH FILE", fileName))
-        mesh = meshSetupQuad2D((char*)fileName.c_str(), N);
-    }
-    break;
-  }
-  case HEXAHEDRA:
-    if(options.compareArgs("BOX DOMAIN", "TRUE"))
-      mesh = meshSetupBoxHex3D(N, options);
-    else if(options.getArgs("MESH FILE", fileName))
-      mesh = meshSetupHex3D((char*)fileName.c_str(), N);
-    break;
-  }
-
-  // set up
-  occa::properties kernelInfo;
-  kernelInfo["defines"].asObject();
-  kernelInfo["includes"].asArray();
-  kernelInfo["header"].asArray();
-  kernelInfo["flags"].asObject();
-
-  if(dim == 3) {
-    if(elementType == TRIANGLES)
-      meshOccaSetupTri3D(mesh, options, kernelInfo);
-    else if(elementType == QUADRILATERALS)
-      meshOccaSetupQuad3D(mesh, options, kernelInfo);
-    else
-      meshOccaSetup3D(mesh, options, kernelInfo);
-  }else {
-    meshOccaSetup2D(mesh, options, kernelInfo);
-  }
-
-  elliptic_t* elliptic;
-
-  elliptic = ellipticSetup(mesh, kernelInfo, options);
-
-#if 1
-  dfloat tol = 1e-8;
-
-  ellipticSolve(elliptic, tol, elliptic->o_r, elliptic->o_x);
-
-  elliptic->o_x.copyTo(elliptic->x);
-
-  for(int fld = 0; fld < elliptic->Nfields; fld++) {
-    dfloat maxError = 0;
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      for(int n = 0; n < mesh->Np; ++n) {
-        dlong id = e * mesh->Np + n;
-        dfloat xn = mesh->x[id];
-        dfloat yn = mesh->y[id];
-        dfloat zn = mesh->z[id];
-
-        double exact = sin(M_PI * xn) * sin(M_PI * yn) * sin(M_PI * zn);
-        double error = fabs(exact - elliptic->x[id + fld * elliptic->Ntotal]);
-        maxError     = mymax(maxError, error);
-      }
-
-    dfloat globalMaxError = 0;
-    MPI_Allreduce(&maxError, &globalMaxError, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-    if(mesh->rank == 0)
-      printf("Inf Error[%d] = %.8e\n", fld, globalMaxError);
-  }
-
-#else  /* if 1 */
-  {
-    occa::memory o_r = mesh->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat),
-                                           elliptic->o_r);
-    occa::memory o_x = mesh->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat),
-                                           elliptic->o_x);
-
-    // convergence tolerance
-    dfloat tol = 1e-8;
-
-    int it;
-
-    MPI_Barrier(mesh->comm);
-
-    occa::streamTag startTag = mesh->device.tagStream();
-    int Ntests = 1;
-    it = 0;
-    for(int test = 0; test < Ntests; ++test) {
-      o_r.copyTo(elliptic->o_r);
-      o_x.copyTo(elliptic->o_x);
-      it += ellipticSolve(elliptic, lambda, tol, elliptic->o_r, elliptic->o_x);
-    }
-
-    MPI_Barrier(mesh->comm);
-
-    occa::streamTag stopTag = mesh->device.tagStream();
-    mesh->device.finish();
-
-    double elapsed = mesh->device.timeBetween(startTag, stopTag);
-
-    double globalElapsed;
-    hlong globalNelements, localNelements = mesh->Nelements;
-
-    MPI_Reduce(&elapsed, &globalElapsed, 1, MPI_DOUBLE, MPI_MAX, 0, mesh->comm);
-    MPI_Reduce(&localNelements, &globalNelements, 1, MPI_HLONG, MPI_SUM, 0, mesh->comm);
-
-    printf("elapsed = %lf, globalElapsed = %lf, globalNelements = %lld\n",
-           elapsed,
-           globalElapsed,
-           globalNelements);
-
-    if (mesh->rank == 0)
-      printf(
-        "%d, %d, %g, %d, %g, %g; \%\%global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n",
-        mesh->N,
-        globalNelements * mesh->Np,
-        globalElapsed,
-        it,
-        globalElapsed / (mesh->Np * globalNelements),
-        globalNelements * (it * mesh->Np / globalElapsed),
-        (char*) options.getArgs("PRECONDITIONER").c_str());
-
-    if (options.compareArgs("VERBOSE", "TRUE")) {
-      fflush(stdout);
-      MPI_Barrier(mesh->comm);
-      printf("rank %d has %d internal elements and %d non-internal elements\n",
-             mesh->rank,
-             mesh->NinternalElements,
-             mesh->NnotInternalElements);
-      MPI_Barrier(mesh->comm);
-    }
-
-    if(options.compareArgs("DISCRETIZATION","CONTINUOUS") &&
-       !(elliptic->dim == 3 && elliptic->elementType == QUADRILATERALS)) {
-      dfloat zero = 0.;
-      elliptic->addBCKernel(mesh->Nelements,
-                            zero,
-                            mesh->o_x,
-                            mesh->o_y,
-                            mesh->o_z,
-                            elliptic->o_mapB,
-                            elliptic->o_x);
-    }
-
-    // copy solution from DEVICE to HOST
-    elliptic->o_x.copyTo(mesh->q);
-
-    if (options.compareArgs("BASIS","BERN"))
-      meshApplyElementMatrix(mesh,mesh->VB,mesh->q,mesh->q);
-
-    dfloat maxError = 0;
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      for(int n = 0; n < mesh->Np; ++n) {
-        dlong id = e * mesh->Np + n;
-        dfloat xn = mesh->x[id];
-        dfloat yn = mesh->y[id];
-        dfloat zn = mesh->z[id];
-
-        dfloat exact;
-        if (elliptic->dim == 2) {
-          exact = sin(M_PI * xn) * sin(M_PI * yn);
-        }else{
-          if(elliptic->elementType == QUADRILATERALS) {
-            dfloat a = 1, b = 2, c = 3;
-            exact = sin(a * xn) * sin(b * yn) * sin(c * zn);
-          }else {
-            double mode = 1.0;
-            exact = cos(mode * M_PI * xn) * cos(mode * M_PI * yn) * cos(mode * M_PI * zn);
-          }
-        }
-
-        dfloat error = fabs(exact - mesh->q[id]);
-
-        //  mesh->q[id] -= exact;
-
-        // store error
-        // mesh->q[id] = fabs(mesh->q[id] - exact);
-        maxError = mymax(maxError, error);
-      }
-
-    dfloat globalMaxError = 0;
-    MPI_Allreduce(&maxError, &globalMaxError, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-    if(mesh->rank == 0)
-      printf("globalMaxError = %g\n", globalMaxError);
-
-    char fname[BUFSIZ];
-    string outName;
-    options.getArgs("OUTPUT FILE NAME", outName);
-    sprintf(fname, "%s_%04d.vtu",(char*)outName.c_str(), mesh->rank);
-    if(elliptic->dim == 3)
-      meshPlotVTU3D(mesh, fname, 0);
-    else
-      meshPlotVTU2D(mesh, fname, 0);
-  }
-
-  // cout << kernelInfo;
-
-  // build one-ring ( to rule them all )
-  // ellipticBuildOneRing(elliptic, kernelInfo);
-
-#endif
-  // close down MPI
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticOasSetup.c b/src/libP/solvers/elliptic/src/ellipticOasSetup.c
deleted file mode 100644
index 6fc50d22f..000000000
--- a/src/libP/solvers/elliptic/src/ellipticOasSetup.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-void ellipticOasSetup(elliptic_t* elliptic, dfloat lambda,
-                      occa::properties &kernelInfo)
-{
-  mesh_t* mesh = elliptic->mesh;
-
-  setupAide options = elliptic->options;
-
-  /* STAGE 1: build overlapping extended partition problem */
-
-  /* build one ring patch extension using a single process MPI sub-communicator
-     and store in elliptic->precon->ellipticOneRing */
-  ellipticBuildOneRing(elliptic, lambda, kernelInfo);
-
-  /* STAGE 2: build coarse problem */
-  //set up the base level
-  int Nc = 1;
-  int Nf = mesh->N;
-
-  // build coarsener
-  int NqFine   = Nf + 1;
-  int NqCoarse = Nc + 1;
-
-  int NpFine   = (Nf + 1) * (Nf + 1) * (Nf + 1);
-  int NpCoarse = (Nc + 1) * (Nc + 1) * (Nc + 1);
-
-  int NblockVFine = maxNthreads / NpFine;
-  int NblockVCoarse = maxNthreads / NpCoarse;
-
-  elliptic_t* ellipticOasCoarse;
-
-  mesh_t* meshN1 = new mesh_t[1];
-
-  if (mesh->N > 1) { // assume
-    // TW: should create   meshCloneHex3D(mesh_t *mesh)
-    printf("=============BUILDING OAS COARSE LEVEL OF DEGREE %d==================\n", Nc);
-    //    ellipticOasCoarse = ellipticBuildMultigridLevel(elliptic,Nc,%m    mesh_t *mesh1 = (mesh_t*) calloc(1, sizeof(mesh_t)); // check
-    meshN1->N   = 1;
-
-    meshN1->comm = mesh->comm;
-    meshN1->rank = mesh->rank;
-    meshN1->size = mesh->size;
-
-    meshN1->dim = mesh->dim;
-    meshN1->Nverts = mesh->Nverts;
-    meshN1->Nfaces = mesh->Nfaces;
-    meshN1->NfaceVertices = mesh->NfaceVertices;
-    meshN1->Nnodes = mesh->Nnodes;
-    meshN1->Nfields = mesh->Nfields;
-
-    meshN1->faceVertices = mesh->faceVertices;
-    meshN1->Nelements = mesh->Nelements;
-    meshN1->EX = mesh->EX;
-    meshN1->EY = mesh->EY;
-    meshN1->EZ = mesh->EZ;
-
-    meshN1->NboundaryFaces = mesh->NboundaryFaces;
-    meshN1->boundaryInfo = mesh->boundaryInfo;
-    meshN1->EToV = mesh->EToV;
-
-    meshParallelConnect(meshN1);
-
-    meshConnectBoundary(meshN1);
-
-    meshLoadReferenceNodesHex3D(meshN1, meshN1->N); // degree 1
-
-    meshPhysicalNodesHex3D(meshN1); // rely on trilinear map for hexes
-
-    meshGeometricFactorsHex3D(meshN1);
-
-    meshHaloSetup(meshN1); // nada
-
-    meshConnectFaceNodes3D(meshN1);
-
-    meshSurfaceGeometricFactorsHex3D(meshN1);
-
-    meshParallelConnectNodes(meshN1); // data
-
-    setupAide optionsN1 = elliptic->options; // check this
-    // optionsN1.setArgs(string("PRECONDITIONER"),    string("MULTIGRID"));
-    // optionsN1.setArgs(string("PRECONDITIONER"),    string("FULLALLMOND"));
-    optionsN1.setArgs(string("KRYLOV SOLVER"),        string("PCG+NONBLOCKING+FLEXIBLE"));
-    optionsN1.setArgs(string("PRECONDITIONER"),       string("MULTIGRID"));
-    optionsN1.setArgs(string("MAXIMUM ITERATIONS"),   string("2"));
-    optionsN1.setArgs(string("FIXED ITERATION COUNT"),string("TRUE"));
-    optionsN1.setArgs(string("PARALMOND CYCLE"),      string("KCYCLE"));
-    optionsN1.setArgs(string("POLYNOMIAL DEGREE"),    string("1"));
-
-    occa::properties kernelInfoN1;
-
-    kernelInfoN1["defines"].asObject();
-    kernelInfoN1["includes"].asArray();
-    kernelInfoN1["header"].asArray();
-    kernelInfoN1["flags"].asObject();
-
-#if 0
-    optionsN1.setArgs(string("THREAD MODEL"),    string("SERIAL"));
-    meshOccaSetup3D(meshN1, optionsN1, kernelInfoN1);
-#else
-    meshN1->device = mesh->device; // check this
-    meshN1->defaultStream = mesh->defaultStream;
-    meshN1->dataStream = mesh->dataStream;
-    meshN1->computeStream = mesh->computeStream;
-    meshN1->device.setStream(mesh->defaultStream);
-    meshOccaPopulateDevice3D(meshN1, optionsN1, kernelInfoN1);
-#endif
-
-    //    std::cout << "KINFO LOOK AT THIS: " << kernelInfoN1 << std::endl;
-    //    std::cout << "OPTIO LOOK AT THIS: " << optionsN1 << std::endl;
-
-    // set up
-    ellipticOasCoarse = ellipticSetup(meshN1, lambda, kernelInfoN1, optionsN1);
-  }else{
-    ellipticOasCoarse = elliptic;
-  }
-
-  dfloat* P    = (dfloat*) calloc(NqFine * NqCoarse,sizeof(dfloat));
-  dfloat* R    = (dfloat*) calloc(NqFine * NqCoarse,sizeof(dfloat));
-
-  // hard wire for linears
-  for(int n = 0; n < NqFine; ++n) {
-    P[n * NqCoarse + 0] = 0.5 * (1 - mesh->gllz[n]);
-    P[n * NqCoarse + 1] = 0.5 * (1 + mesh->gllz[n]);
-    R[0 * NqFine + n] = 0.5 * (1 - mesh->gllz[n]);
-    R[1 * NqFine + n] = 0.5 * (1 + mesh->gllz[n]);
-  }
-
-  occa::memory o_R = elliptic->mesh->device.malloc(NqFine * NqCoarse * sizeof(dfloat), R);
-  occa::memory o_P = elliptic->mesh->device.malloc(NqFine * NqCoarse * sizeof(dfloat), P);
-
-  free(P);
-  free(R);
-
-  elliptic->precon->ellipticOasCoarse = ellipticOasCoarse;
-  elliptic->precon->o_oasRestrictionMatrix = o_R;
-  elliptic->precon->o_oasProlongationMatrix = o_P;
-
-  // build degree 1 coarsening and prolongation matrices and kernels
-
-  kernelInfo["defines/" "p_NqFine"] = Nf + 1;
-  kernelInfo["defines/" "p_NqCoarse"] = Nc + 1;
-
-  kernelInfo["defines/" "p_NpFine"] = NpFine;
-  kernelInfo["defines/" "p_NpCoarse"] = NpCoarse;
-
-  kernelInfo["defines/" "p_NblockVFine"] = NblockVFine;
-  kernelInfo["defines/" "p_NblockVCoarse"] = NblockVCoarse;
-
-  char* suffix;
-
-  if(elliptic->elementType == HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
-
-  for (int r = 0; r < 2; r++) {
-    if ((r == 0 && mesh->rank == 0) || (r == 1 && mesh->rank > 0)) {
-      sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix);
-      sprintf(kernelName, "ellipticPreconCoarsen%s", suffix);
-      elliptic->precon->oasRestrictionKernel =
-        mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-
-      sprintf(fileName, DELLIPTIC "/okl/ellipticPreconProlongate%s.okl", suffix);
-      sprintf(kernelName, "ellipticPreconProlongate%s", suffix);
-      elliptic->precon->oasProlongationKernel = mesh->device.buildKernel(fileName,
-                                                                         kernelName,
-                                                                         kernelInfo);
-    }
-    MPI_Barrier(mesh->comm);
-  }
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticOasSolve.c b/src/libP/solvers/elliptic/src/ellipticOasSolve.c
deleted file mode 100644
index c99f8822a..000000000
--- a/src/libP/solvers/elliptic/src/ellipticOasSolve.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-void ellipticOasSolve(elliptic_t* elliptic, dfloat lambda,
-                      occa::memory &o_r, occa::memory &o_z)
-{
-  precon_t* precon = elliptic->precon;
-  mesh_t* mesh = elliptic->mesh;
-
-  elliptic_t* elliptic1 = (elliptic_t*) precon->ellipticOneRing; // should rename
-  mesh_t* mesh1 = elliptic1->mesh;
-
-  elliptic_t* ellipticOasCoarse = (elliptic_t*) (precon->ellipticOasCoarse);
-  mesh_t* meshCoarse   = ellipticOasCoarse->mesh;
-
-  // TW: possibility these device have difference queues
-  mesh1->device.finish();
-  mesh->device.finish();
-
-  // 1. collect patch residual
-  ellipticOneRingExchange(elliptic, elliptic1, mesh1->Np * sizeof(dfloat), o_r, elliptic1->o_r);
-
-  // mask local overlapping patch residual
-  if (elliptic1->Nmasked)
-    mesh1->maskKernel(elliptic1->Nmasked, elliptic1->o_maskIds, elliptic1->o_r);
-
-  // TW: possibility these device have difference queues
-  mesh1->device.finish();
-  mesh->device.finish();
-
-  ellipticPreconditioner(elliptic1, lambda, elliptic1->o_r, elliptic1->o_x); // may need to zero o_x
-
-  // gathering over all patches - so have to remove local multiplicity
-  elliptic1->dotMultiplyKernel(mesh1->Nelements * mesh1->Np,
-                               elliptic1->ogs->o_invDegree,
-                               elliptic1->o_x,
-                               elliptic1->o_z);
-
-  // sum up overlapping patches
-  ogsGatherScatter(elliptic1->o_z, ogsDfloat, ogsAdd, elliptic->precon->oasOgs);
-
-  o_z.copyFrom(elliptic1->o_z, mesh->Nelements * mesh->Np * sizeof(dfloat), 0);
-
-  // 2. solve coarse problem
-  //   a. call solver
-  //   b. prolongate (watch out for +=)
-
-  mesh1->device.finish();
-  mesh->device.finish();
-  meshCoarse->device.finish();
-
-  precon->oasRestrictionKernel(meshCoarse->Nelements,
-                               precon->o_oasRestrictionMatrix,
-                               o_r, ellipticOasCoarse->o_r);
-
-  mesh1->device.finish();
-  mesh->device.finish();
-  meshCoarse->device.finish();
-
-  // why do I Have to do (1/deg)*S*G*o_rCoarse here ? ---------->
-  ogsGatherScatter(ellipticOasCoarse->o_r, ogsDfloat, ogsAdd, ellipticOasCoarse->ogs);
-
-  ellipticOasCoarse->dotMultiplyKernel(meshCoarse->Nelements * meshCoarse->Np,
-                                       meshCoarse->ogs->o_invDegree,
-                                       ellipticOasCoarse->o_r,
-                                       ellipticOasCoarse->o_r);
-
-  if (ellipticOasCoarse->Nmasked)
-    meshCoarse->maskKernel(ellipticOasCoarse->Nmasked,
-                           ellipticOasCoarse->o_maskIds,
-                           ellipticOasCoarse->o_r);
-
-  // <----------
-
-  ellipticPreconditioner(ellipticOasCoarse, lambda, ellipticOasCoarse->o_r, ellipticOasCoarse->o_x);
-
-  mesh1->device.finish();
-  mesh->device.finish();
-  meshCoarse->device.finish();
-
-  // prolongate to QN (note kernel expects restriction matrix)
-  // do we need to weight the sum against patches?
-  precon->oasProlongationKernel(mesh->Nelements, precon->o_oasRestrictionMatrix,
-                                ellipticOasCoarse->o_x, o_z);
-
-  mesh1->device.finish();
-  mesh->device.finish();
-  meshCoarse->device.finish();
-
-  if (elliptic->Nmasked)
-    mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_z);
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticOneRingDiagnostics.c b/src/libP/solvers/elliptic/src/ellipticOneRingDiagnostics.c
deleted file mode 100644
index 6081d2cfa..000000000
--- a/src/libP/solvers/elliptic/src/ellipticOneRingDiagnostics.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-void ellipticOneRingDiagnostics(elliptic_t* elliptic, elliptic_t* elliptic1, dfloat lambda)
-{
-  mesh_t* mesh  = elliptic->mesh;
-  mesh_t* mesh1 = elliptic1->mesh;
-
-  char fname[BUFSIZ];
-  sprintf(fname, "diagnostics%04d.dat", mesh->rank);
-  FILE* fp = fopen(fname, "w");
-  fprintf(fp, "EToV=[\n");
-  for(int e = 0; e < mesh1->Nelements; ++e) {
-    for(int v = 0; v < mesh1->Nverts; ++v)
-      fprintf(fp, "%d ", mesh1->EToV[e * mesh1->Nverts + v]);
-    if(e < mesh->Nelements) fprintf(fp, "%% original \n");
-    else fprintf(fp, "%% overlap \n");
-  }
-  fprintf(fp, "];\n");
-
-  fprintf(fp, "EToE=[\n");
-  for(int e = 0; e < mesh1->Nelements; ++e) {
-    for(int f = 0; f < mesh1->Nfaces; ++f)
-      fprintf(fp, "%d ", mesh1->EToE[e * mesh1->Nfaces + f]);
-    if(e < mesh->Nelements) fprintf(fp, "%% original \n");
-    else fprintf(fp, "%% overlap \n");
-  }
-  fprintf(fp, "];\n");
-
-  fprintf(fp, "EToB=[\n");
-  for(int e = 0; e < mesh1->Nelements; ++e) {
-    for(int f = 0; f < mesh1->Nfaces; ++f)
-      fprintf(fp, "%d ", mesh1->EToB[e * mesh1->Nfaces + f]);
-    if(e < mesh->Nelements) fprintf(fp, "%% original \n");
-    else fprintf(fp, "%% overlap \n");
-  }
-  fprintf(fp, "];\n");
-
-  fclose(fp);
-
-  // TEST FOR ONE RING
-  dfloat tol = 1e-8;
-
-  int it = ellipticSolve(elliptic1, lambda, tol, elliptic1->o_r, elliptic1->o_x);
-
-  if(elliptic1->options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
-    dfloat zero = 0.;
-    elliptic1->addBCKernel(mesh1->Nelements,
-                           zero,
-                           mesh1->o_x,
-                           mesh1->o_y,
-                           mesh1->o_z,
-                           elliptic1->o_mapB,
-                           elliptic1->o_x);
-  }
-
-  // copy solution from DEVICE to HOST
-  elliptic1->o_x.copyTo(mesh1->q);
-
-  dfloat maxError = 0;
-  for(dlong e = 0; e < mesh1->Nelements; ++e)
-    for(int n = 0; n < mesh1->Np; ++n) {
-      dlong id = e * mesh1->Np + n;
-      dfloat xn = mesh1->x[id];
-      dfloat yn = mesh1->y[id];
-      dfloat zn = mesh1->z[id];
-
-      dfloat exact;
-      int mode = 1;
-      exact = cos(mode * M_PI * xn) * cos(mode * M_PI * yn) * cos(mode * M_PI * zn);
-
-      dfloat error = fabs(exact - mesh1->q[id]);
-
-      mesh1->q[id] -= exact;
-      //      mesh1->q[id] = exact;
-
-      // store error
-      // mesh->q[id] = fabs(mesh->q[id] - exact);
-      maxError = mymax(maxError, error);
-    }
-
-  dfloat globalMaxError = 0;
-  MPI_Allreduce(&maxError, &globalMaxError, 1, MPI_DFLOAT, MPI_MAX, mesh1->comm);
-  if(mesh1->rank == 0)
-    printf("globalMaxError = %g\n", globalMaxError);
-
-  string outName;
-  elliptic1->options.getArgs("OUTPUT FILE NAME", outName);
-  sprintf(fname, "%s_oneRing_%04d",(char*)outName.c_str(), mesh->rank);
-  ellipticPlotVTUHex3D(mesh1, fname, 0);
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticPlotVTUHex3D.c b/src/libP/solvers/elliptic/src/ellipticPlotVTUHex3D.c
deleted file mode 100644
index b5887f94f..000000000
--- a/src/libP/solvers/elliptic/src/ellipticPlotVTUHex3D.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "mesh3D.h"
-
-// interpolate data to plot nodes and save to file (one per process
-extern "C"
-{
-void ellipticPlotVTUHex3D(mesh3D* mesh, char* fileNameBase, int fld);
-}
-
-void ellipticPlotVTUHex3D(mesh3D* mesh, char* fileNameBase, int fld)
-{
-  int rank;
-  rank = mesh->rank;
-
-  FILE* fp;
-  char fileName[BUFSIZ];
-  sprintf(fileName, "%s_%04d.vtu", fileNameBase, rank);
-  // strcpy(fileName,fileNameBase);
-
-  fp = fopen(fileName, "w");
-
-  fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-  fprintf(fp, "  <UnstructuredGrid>\n");
-
-  int Eloc = (mesh->Nq - 1) * (mesh->Nq - 1) * (mesh->Nq - 1);
-  //  printf("N = %d, Eloc = %d, Nel = %d\n",
-  //	 mesh->Nq-1, Eloc, mesh->Nelements);
-
-  fprintf(fp, "    <Piece NumberOfPoints=\"" dlongFormat "\" NumberOfCells=\"" dlongFormat "\">\n",
-          mesh->Nelements * mesh->Np,
-          mesh->Nelements * Eloc);
-
-  // write out nodes
-  fprintf(fp, "      <Points>\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-  // compute plot node coordinates on the fly
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Np; ++n) {
-      dlong id = n + e * mesh->Np;
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n",
-              mesh->x[id],
-              mesh->y[id],
-              mesh->z[id]);
-    }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Points>\n");
-
-  //  printf("Nelements = %d, Np = %d\n", mesh->Nelements, mesh->Np);
-
-  // write out pressure
-  fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" Name=\"pressure\" Format=\"ascii\">\n");
-
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Np; ++n) {
-      dfloat qn = mesh->q[n + fld * mesh->Np + e * mesh->Nfields * mesh->Np];
-      fprintf(fp, "       ");
-      fprintf(fp, "%17.15lf\n", qn);
-    }
-
-  fprintf(fp, "       </DataArray>\n");
-  fprintf(fp, "     </PointData>\n");
-
-  fprintf(fp, "    <Cells>\n");
-  fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int k = 0; k < mesh->Nq - 1; ++k)
-      for(int j = 0; j < mesh->Nq - 1; ++j)
-        for(int i = 0; i < mesh->Nq - 1; ++i) {
-          int b = e * mesh->Np + i + j * mesh->Nq + k * mesh->Nq * mesh->Nq;
-          fprintf(fp,
-                  dlongFormat " "
-                  dlongFormat " "
-                  dlongFormat " "
-                  dlongFormat " "
-                  dlongFormat " "
-                  dlongFormat " "
-                  dlongFormat " "
-                  dlongFormat "\n ",
-                  b,
-                  b + 1,
-                  b + mesh->Nq + 1,
-                  b + mesh->Nq,
-                  b + mesh->Nq * mesh->Nq,
-                  b + 1 + mesh->Nq * mesh->Nq,
-                  b + mesh->Nq + 1 + mesh->Nq * mesh->Nq,
-                  b + mesh->Nq + mesh->Nq * mesh->Nq);
-        }
-
-  fprintf(fp, "        </DataArray>\n");
-
-  fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-  dlong cnt = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < Eloc; ++n) {
-      cnt += 8;
-      fprintf(fp, "       ");
-      fprintf(fp, dlongFormat "\n", cnt);
-    }
-  fprintf(fp, "       </DataArray>\n");
-
-  fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < Eloc; ++n) {
-      fprintf(fp, "12\n"); // HEX code ?
-    }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Cells>\n");
-  fprintf(fp, "    </Piece>\n");
-  fprintf(fp, "  </UnstructuredGrid>\n");
-  fprintf(fp, "</VTKFile>\n");
-  fclose(fp);
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticPreconditionerSetup.c b/src/libP/solvers/elliptic/src/ellipticPreconditionerSetup.c
deleted file mode 100644
index 627d10bc2..000000000
--- a/src/libP/solvers/elliptic/src/ellipticPreconditionerSetup.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-void ellipticPreconditionerSetup(elliptic_t* elliptic, ogs_t* ogs, occa::properties &kernelInfo)
-{
-  mesh2D* mesh = elliptic->mesh;
-  precon_t* precon = elliptic->precon;
-  setupAide options = elliptic->options;
-
-  if(options.compareArgs("PRECONDITIONER", "FULLALMOND")) { //build full A matrix and pass to Almond
-    dlong nnz;
-    nonZero_t* A;
-
-    hlong* globalStarts = (hlong*) calloc(mesh->size + 1, sizeof(hlong));
-
-    int basisNp = mesh->Np;
-    dfloat* basis = NULL;
-
-    // if (options.compareArgs("BASIS", "BERN")) basis = mesh->VB;
-
-    // if (options.compareArgs("DISCRETIZATION", "IPDG")) {
-    //   ellipticBuildIpdg(elliptic, basisNp, basis, lambda, &A, &nnz, globalStarts);
-    // } else if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {
-    ellipticBuildContinuous(elliptic, &A,&nnz, &(precon->ogs), globalStarts);
-    // }
-
-    hlong* Rows = (hlong*) calloc(nnz, sizeof(hlong));
-    hlong* Cols = (hlong*) calloc(nnz, sizeof(hlong));
-    dfloat* Vals = (dfloat*) calloc(nnz,sizeof(dfloat));
-
-    for (dlong n = 0; n < nnz; n++) {
-      Rows[n] = A[n].row;
-      Cols[n] = A[n].col;
-      Vals[n] = A[n].val;
-    }
-    free(A);
-
-    precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options);
-    parAlmond::AMGSetup(precon->parAlmond,
-                        globalStarts,
-                        nnz,
-                        Rows,
-                        Cols,
-                        Vals,
-                        elliptic->allNeumann,
-                        elliptic->allNeumannPenalty);
-    free(Rows);
-    free(Cols);
-    free(Vals);
-
-    if (options.compareArgs("VERBOSE", "TRUE"))
-      parAlmond::Report(precon->parAlmond);
-
-    if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {//tell parAlmond to gather this level
-      parAlmond::multigridLevel* baseLevel = precon->parAlmond->levels[0];
-
-      precon->rhsG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
-      precon->xG   = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
-      precon->o_rhsG = mesh->device.malloc(baseLevel->Ncols * sizeof(dfloat));
-      precon->o_xG   = mesh->device.malloc(baseLevel->Ncols * sizeof(dfloat));
-    }
-  } else if(options.compareArgs("PRECONDITIONER", "MULTIGRID")) {
-    ellipticMultiGridSetup(elliptic,precon);
-  } else if(options.compareArgs("PRECONDITIONER", "SEMFEM")) {
-    //ellipticSEMFEMSetup(elliptic,precon);
-    printf("ERROR: SEMFEM does not work right now.\n");
-    exit(-1);
-  } else if(options.compareArgs("PRECONDITIONER", "JACOBI")) {
-    dfloat* invDiagA;
-    ellipticBuildJacobi(elliptic,&invDiagA);
-    const dlong Nlocal =  mesh->Np * mesh->Nelements;
-    int Ntotal = elliptic->blockSolver ? elliptic->Ntotal * elliptic->Nfields: Nlocal;
-    precon->o_invDiagA = mesh->device.malloc(Ntotal * sizeof(dfloat), invDiagA);
-    free(invDiagA);
-  } else if(options.compareArgs("PRECONDITIONER", "OAS")) {
-    //ellipticThinOasSetup(elliptic);
-    printf("ERROR:  OAS does not work right now.\n");
-    exit(-1);
-
-    //if(mesh->N>1)
-    //  ellipticOasSetup(elliptic, lambda, kernelInfo);
-    //else{
-    //  dfloat *invDiagA;
-    //  ellipticBuildJacobi(elliptic,lambda,&invDiagA);
-    //  precon->o_invDiagA = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat), invDiagA);
-    //  free(invDiagA);
-    //}
-  }
-}
diff --git a/src/libP/solvers/elliptic/src/ellipticSEMFEMSetup.c b/src/libP/solvers/elliptic/src/ellipticSEMFEMSetup.c
deleted file mode 100644
index 22ec4d80a..000000000
--- a/src/libP/solvers/elliptic/src/ellipticSEMFEMSetup.c
+++ /dev/null
@@ -1,1038 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "elliptic.h"
-
-typedef struct
-{
-  dfloat VX;
-  dfloat VY;
-
-  dlong localId;
-  hlong globalId;
-}FEMverts_t;
-
-typedef struct
-{
-  dlong localId;
-  hlong globalId;
-  int ownerRank;
-}parallelNode_t;
-
-// compare on global owners
-int parallelCompareOwnersAndGlobalId(const void* a, const void* b);
-
-// compare on global indices
-int parallelCompareGlobalId(const void* a, const void* b);
-
-// compare xy coordinates
-int parallelCompareFEMvertsLocation(const void* a, const void* b)
-{
-  dfloat NODETOL = 1e-6;
-
-  FEMverts_t* fa = (FEMverts_t*) a;
-  FEMverts_t* fb = (FEMverts_t*) b;
-
-  if(fa->VX < fb->VX - NODETOL) return -1;
-  if(fa->VX > fb->VX + NODETOL) return +1;
-
-  if(fa->VY < fb->VY - NODETOL) return -1;
-  if(fa->VY > fb->VY + NODETOL) return +1;
-
-  return 0;
-}
-
-// compare local id
-int parallelCompareFEMvertsLocalId(const void* a, const void* b)
-{
-  FEMverts_t* fa = (FEMverts_t*) a;
-  FEMverts_t* fb = (FEMverts_t*) b;
-
-  if(fa->localId < fb->localId) return -1;
-  if(fa->localId > fb->localId) return +1;
-
-  return 0;
-}
-
-int parallelCompareRowColumn(const void* a, const void* b);
-
-void BuildFEMMatrixTri2D (mesh_t* femMesh,
-                          mesh_t* pmesh,
-                          dfloat lambda,
-                          dlong* localIds,
-                          hlong* globalNumbering,
-                          int* globalOwners,
-                          dlong* cnt,
-                          nonZero_t* A);
-void BuildFEMMatrixQuad2D(mesh_t* femMesh,
-                          mesh_t* pmesh,
-                          dfloat lambda,
-                          dlong* localIds,
-                          hlong* globalNumbering,
-                          int* globalOwners,
-                          dlong* cnt,
-                          nonZero_t* A);
-void BuildFEMMatrixTet3D (mesh_t* femMesh,
-                          mesh_t* pmesh,
-                          dfloat lambda,
-                          dlong* localIds,
-                          hlong* globalNumbering,
-                          int* globalOwners,
-                          dlong* cnt,
-                          nonZero_t* A);
-void BuildFEMMatrixHex3D (mesh_t* femMesh,
-                          mesh_t* pmesh,
-                          dfloat lambda,
-                          dlong* localIds,
-                          hlong* globalNumbering,
-                          int* globalOwners,
-                          dlong* cnt,
-                          nonZero_t* A);
-
-void ellipticSEMFEMSetup(elliptic_t* elliptic, precon_t* precon)
-{
-  setupAide options = elliptic->options;
-
-  // currently constant coefficient
-  const dfloat lambda = elliptic->lambda[0];
-
-  if (!(options.compareArgs("DISCRETIZATION", "CONTINUOUS"))) {
-    printf("SEMFEM is supported for CONTINUOUS only\n");
-    MPI_Barrier(elliptic->mesh->comm);
-    MPI_Finalize();
-    exit(0);
-  }
-
-  mesh_t* mesh = elliptic->mesh; //original mesh
-
-  //  mesh_t* pmesh = (mesh_t*) calloc (1,sizeof(mesh_t)); //partially assembled fem mesh (result of projecting sem element to larger space)
-  mesh_t* pmesh = new mesh_t[1];
-
-  //  precon->femMesh = (mesh_t*) calloc (1,sizeof(mesh_t)); //full fem mesh
-  precon->femMesh = new mesh_t[1];
-
-  mesh_t* femMesh = precon->femMesh;
-
-  memcpy(pmesh,mesh,sizeof(mesh_t));
-  memcpy(femMesh,mesh,sizeof(mesh_t));
-
-  if (elliptic->elementType == TRIANGLES) {
-    //set semfem nodes as the grid points
-    pmesh->Np = mesh->NpFEM;
-    pmesh->r  = mesh->rFEM;
-    pmesh->s  = mesh->sFEM;
-
-    //count number of face nodes in the semfem element
-    dfloat NODETOL = 1e-6;
-    pmesh->Nfp = 0;
-    for (int n = 0; n < pmesh->Np; n++)
-      if (fabs(pmesh->s[n] + 1) < NODETOL) pmesh->Nfp++;
-
-    //remake the faceNodes array
-    pmesh->faceNodes = (int*) calloc(pmesh->Nfaces * pmesh->Nfp,sizeof(int));
-    int f0 = 0, f1 = 0, f2 = 0;
-    for (int n = 0; n < pmesh->Np; n++) {
-      if (fabs(pmesh->s[n] + 1) < NODETOL) pmesh->faceNodes[0 * pmesh->Nfp + f0++] = n;
-      if (fabs(pmesh->r[n] + pmesh->s[n]) < NODETOL) pmesh->faceNodes[1 * pmesh->Nfp + f1++] = n;
-      if (fabs(pmesh->r[n] + 1) < NODETOL) pmesh->faceNodes[2 * pmesh->Nfp + f2++] = n;
-    }
-
-    //remake vertexNodes array
-    pmesh->vertexNodes = (int*) calloc(pmesh->Nverts, sizeof(int));
-    for(int n = 0; n < pmesh->Np; ++n) {
-      if( (pmesh->r[n] + 1) * (pmesh->r[n] + 1) + (pmesh->s[n] + 1) * (pmesh->s[n] + 1) < NODETOL)
-        pmesh->vertexNodes[0] = n;
-      if( (pmesh->r[n] - 1) * (pmesh->r[n] - 1) + (pmesh->s[n] + 1) * (pmesh->s[n] + 1) < NODETOL)
-        pmesh->vertexNodes[1] = n;
-      if( (pmesh->r[n] + 1) * (pmesh->r[n] + 1) + (pmesh->s[n] - 1) * (pmesh->s[n] - 1) < NODETOL)
-        pmesh->vertexNodes[2] = n;
-    }
-
-    // connect elements using parallel sort
-    meshParallelConnect(pmesh);
-
-    // compute physical (x,y) locations of the element nodes
-    meshPhysicalNodesTri2D(pmesh);
-
-    // free(sendBuffer);
-    meshHaloSetup(pmesh);
-
-    // connect face nodes (find trace indices)
-    meshConnectFaceNodes2D(pmesh);
-
-    // global nodes
-    meshParallelConnectNodes(pmesh);
-    //pmesh->globalIds is now populated
-  } else if (elliptic->elementType == TETRAHEDRA) {
-    //set semfem nodes as the grid points
-    pmesh->Np = mesh->NpFEM;
-    pmesh->r  = mesh->rFEM;
-    pmesh->s  = mesh->sFEM;
-    pmesh->t  = mesh->tFEM;
-
-    //count number of face nodes in the semfem element
-    dfloat NODETOL = 1e-6;
-    pmesh->Nfp = 0;
-    for (int n = 0; n < pmesh->Np; n++)
-      if (fabs(pmesh->t[n] + 1) < NODETOL) pmesh->Nfp++;
-
-    //remake the faceNodes array
-    pmesh->faceNodes = (int*) calloc(pmesh->Nfaces * pmesh->Nfp,sizeof(int));
-    int f0 = 0, f1 = 0, f2 = 0, f3 = 0;
-    for (int n = 0; n < pmesh->Np; n++) {
-      if (fabs(pmesh->t[n] + 1) < NODETOL) pmesh->faceNodes[0 * pmesh->Nfp + f0++] = n;
-      if (fabs(pmesh->s[n] + 1) < NODETOL) pmesh->faceNodes[1 * pmesh->Nfp + f1++] = n;
-      if (fabs(pmesh->r[n] + pmesh->s[n] +
-               pmesh->t[n] + 1.0) < NODETOL) pmesh->faceNodes[2 * pmesh->Nfp + f2++] = n;
-      if (fabs(pmesh->r[n] + 1) < NODETOL) pmesh->faceNodes[3 * pmesh->Nfp + f3++] = n;
-    }
-
-    //remake vertexNodes array
-    pmesh->vertexNodes = (int*) calloc(pmesh->Nverts, sizeof(int));
-    for(int n = 0; n < pmesh->Np; ++n) {
-      if( (pmesh->r[n] + 1) * (pmesh->r[n] + 1) + (pmesh->s[n] + 1) * (pmesh->s[n] + 1) +
-          (pmesh->t[n] + 1) * (pmesh->t[n] + 1) < NODETOL)
-        pmesh->vertexNodes[0] = n;
-      if( (pmesh->r[n] - 1) * (pmesh->r[n] - 1) + (pmesh->s[n] + 1) * (pmesh->s[n] + 1) +
-          (pmesh->t[n] + 1) * (pmesh->t[n] + 1) < NODETOL)
-        pmesh->vertexNodes[1] = n;
-      if( (pmesh->r[n] + 1) * (pmesh->r[n] + 1) + (pmesh->s[n] - 1) * (pmesh->s[n] - 1) +
-          (pmesh->t[n] + 1) * (pmesh->t[n] + 1) < NODETOL)
-        pmesh->vertexNodes[2] = n;
-      if( (pmesh->r[n] + 1) * (pmesh->r[n] + 1) + (pmesh->s[n] + 1) * (pmesh->s[n] + 1) +
-          (pmesh->t[n] - 1) * (pmesh->t[n] - 1) < NODETOL)
-        pmesh->vertexNodes[3] = n;
-    }
-
-    // connect elements using parallel sort
-    meshParallelConnect(pmesh);
-
-    // compute physical (x,y) locations of the element nodes
-    meshPhysicalNodesTet3D(pmesh);
-
-    // free(sendBuffer);
-    meshHaloSetup(pmesh);
-
-    // connect face nodes (find trace indices)
-    meshConnectFaceNodes3D(pmesh);
-
-    // global nodes
-    meshParallelConnectNodes(pmesh);
-    //pmesh->globalIds is now populated
-  }
-
-  //now build the full degree 1 fem mesh
-  int femN = 1; //degree of fem approximation
-
-  /* allocate space for node coordinates */
-  femMesh->Nelements = mesh->NelFEM * mesh->Nelements;
-  femMesh->EToV = (hlong*) calloc(femMesh->Nelements * femMesh->Nverts, sizeof(hlong));
-  femMesh->EX = (dfloat*) calloc(femMesh->Nverts * femMesh->Nelements, sizeof(dfloat));
-  femMesh->EY = (dfloat*) calloc(femMesh->Nverts * femMesh->Nelements, sizeof(dfloat));
-  if (elliptic->dim == 3)
-    femMesh->EZ = (dfloat*) calloc(femMesh->Nverts * femMesh->Nelements, sizeof(dfloat));
-
-  dlong* localIds = (dlong*) calloc(femMesh->Nverts * femMesh->Nelements,sizeof(dlong));
-
-  // dlong NFEMverts = mesh->Nelements*mesh->NpFEM;
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for (int n = 0; n < mesh->NelFEM; n++) {
-      dlong id[femMesh->Nverts];
-
-      dlong femId = e * mesh->NelFEM * mesh->Nverts + n * mesh->Nverts;
-
-      for (int i = 0; i < femMesh->Nverts; i++) {
-        //local ids in the subelement fem grid
-        id[i] = e * mesh->NpFEM + mesh->FEMEToV[n * mesh->Nverts + i];
-
-        /* read vertex triplet for triangle */
-        femMesh->EToV[femId + i] = pmesh->globalIds[id[i]];
-
-        femMesh->EX[femId + i] = pmesh->x[id[i]];
-        femMesh->EY[femId + i] = pmesh->y[id[i]];
-        if (elliptic->dim == 3)
-          femMesh->EZ[femId + i] = pmesh->z[id[i]];
-      }
-
-      switch(elliptic->elementType) {
-      case TRIANGLES:
-        localIds[femId + 0] = id[0];
-        localIds[femId + 1] = id[1];
-        localIds[femId + 2] = id[2];
-        break;
-      case QUADRILATERALS:
-        localIds[femId + 0] = id[0];
-        localIds[femId + 1] = id[1];
-        localIds[femId + 2] = id[3];  //need to swap this as the Np nodes are ordered [0,1,3,2] in a degree 1 element
-        localIds[femId + 3] = id[2];
-        break;
-      case TETRAHEDRA:
-        localIds[femId + 0] = id[0];
-        localIds[femId + 1] = id[1];
-        localIds[femId + 2] = id[2];
-        localIds[femId + 3] = id[3];
-        break;
-      case HEXAHEDRA:
-        localIds[femId + 0] = id[0];
-        localIds[femId + 1] = id[1];
-        localIds[femId + 2] = id[3];  //need to swap this as the Np nodes are ordered [0,1,3,2,4,5,7,6] in a degree 1 element
-        localIds[femId + 3] = id[2];
-        localIds[femId + 4] = id[4];
-        localIds[femId + 5] = id[5];
-        localIds[femId + 6] = id[7];
-        localIds[femId + 7] = id[6];
-        break;
-      }
-    }
-
-  // connect elements using parallel sort
-  meshParallelConnect(femMesh);
-
-  switch(elliptic->elementType) {
-  case TRIANGLES:
-    meshLoadReferenceNodesTri2D(femMesh, femN);
-    break;
-  case QUADRILATERALS:
-    meshLoadReferenceNodesQuad2D(femMesh, femN);
-    break;
-  case TETRAHEDRA:
-    meshLoadReferenceNodesTet3D(femMesh, femN);
-    break;
-  case HEXAHEDRA:
-    meshLoadReferenceNodesHex3D(femMesh, femN);
-    break;
-  }
-
-  int* faceFlag = (int*) calloc(pmesh->Np * pmesh->Nfaces,sizeof(int));
-  for (int f = 0; f < pmesh->Nfaces; f++)
-    for (int n = 0; n < pmesh->Nfp; n++) {
-      int id = pmesh->faceNodes[f * pmesh->Nfp + n];
-      faceFlag[f * pmesh->Np + id] = 1; //flag the nodes on this face
-    }
-
-  //map from faces of fem sub-elements to the macro element face number
-  int* femFaceMap = (int*) calloc(mesh->NelFEM * femMesh->Nfaces,sizeof(int));
-  for (int n = 0; n < mesh->NelFEM * femMesh->Nfaces; n++) femFaceMap[n] = -1;
-
-  for (int n = 0; n < mesh->NelFEM; n++)
-    for (int f = 0; f < femMesh->Nfaces; f++)
-
-      for (int face = 0; face < pmesh->Nfaces; face++) {
-        //count the nodes on this face which are on a macro face
-        int NvertsOnFace = 0;
-        for (int i = 0; i < femMesh->Nfp; i++) {
-          int id = femMesh->faceNodes[f * femMesh->Nfp + i];
-          int v  = mesh->FEMEToV[n * pmesh->Nverts + id];
-          NvertsOnFace += faceFlag[face * pmesh->Np + v];
-        }
-        if (NvertsOnFace == femMesh->Nfp)
-          femFaceMap[n * femMesh->Nfaces + f] = face; //on macro face
-      }
-
-  //fill the boundary flag array
-  femMesh->EToB = (int*) calloc(femMesh->Nelements * femMesh->Nfaces, sizeof(int));
-  for (dlong e = 0; e < mesh->Nelements; e++)
-    for (int n = 0; n < mesh->NelFEM; n++)
-      for (int f = 0; f < femMesh->Nfaces; f++) {
-        int face = femFaceMap[n * femMesh->Nfaces + f];
-        if (face > -1)
-          femMesh->EToB[(e * mesh->NelFEM + n) * femMesh->Nfaces +
-                        f] = mesh->EToB[e * mesh->Nfaces + face];
-      }
-  free(faceFlag);
-  free(femFaceMap);
-
-  switch(elliptic->elementType) {
-  case TRIANGLES:
-    meshPhysicalNodesTri2D(femMesh);
-    meshGeometricFactorsTri2D(femMesh);
-    meshHaloSetup(femMesh);
-    meshConnectFaceNodes2D(femMesh);
-    meshSurfaceGeometricFactorsTri2D(femMesh);
-    break;
-  case QUADRILATERALS:
-    meshPhysicalNodesQuad2D(femMesh);
-    meshGeometricFactorsQuad2D(femMesh);
-    meshHaloSetup(femMesh);
-    meshConnectFaceNodes2D(femMesh);
-    meshSurfaceGeometricFactorsQuad2D(femMesh);
-    break;
-  case TETRAHEDRA:
-    meshPhysicalNodesTet3D(femMesh);
-    meshGeometricFactorsTet3D(femMesh);
-    meshHaloSetup(femMesh);
-    meshConnectFaceNodes3D(femMesh);
-    meshSurfaceGeometricFactorsTet3D(femMesh);
-    break;
-  case HEXAHEDRA:
-    meshPhysicalNodesHex3D(femMesh);
-    meshGeometricFactorsHex3D(femMesh);
-    meshHaloSetup(femMesh);
-    meshConnectFaceNodes3D(femMesh);
-    meshSurfaceGeometricFactorsHex3D(femMesh);
-    break;
-  }
-
-  // global nodes
-  meshParallelConnectNodes(femMesh);
-
-  dlong Ntotal = pmesh->Np * pmesh->Nelements;
-  int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0;
-
-  pmesh->maskedGlobalIds = (hlong*) calloc(Ntotal,sizeof(hlong));
-  memcpy(pmesh->maskedGlobalIds, pmesh->globalIds, Ntotal * sizeof(hlong));
-  if (elliptic->elementType == TRIANGLES || elliptic->elementType == TETRAHEDRA) {
-    //build a new mask for NpFEM>Np node sets
-
-    // gather-scatter
-    pmesh->ogs = ogsSetup(Ntotal, pmesh->globalIds, mesh->comm, verbose, mesh->device);
-
-    //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann)
-    int* mapB = (int*) calloc(Ntotal,sizeof(int));
-    for (dlong e = 0; e < pmesh->Nelements; e++) {
-      for (int n = 0; n < pmesh->Np; n++) mapB[n + e * pmesh->Np] = 1E9;
-      for (int f = 0; f < pmesh->Nfaces; f++) {
-        int bc = pmesh->EToB[f + e * pmesh->Nfaces];
-        if (bc > 0) {
-          for (int n = 0; n < pmesh->Nfp; n++) {
-            int BCFlag = elliptic->BCType[bc];
-            int fid = pmesh->faceNodes[n + f * pmesh->Nfp];
-            mapB[fid + e * pmesh->Np] = mymin(BCFlag,mapB[fid + e * pmesh->Np]);
-          }
-        }
-      }
-    }
-    ogsGatherScatter(mapB, ogsInt, ogsMin, pmesh->ogs);
-
-    //use the bc flags to find masked ids
-    for (dlong n = 0; n < pmesh->Nelements * pmesh->Np; n++)
-      if (mapB[n] == 1)   //Dirichlet boundary
-        pmesh->maskedGlobalIds[n] = 0;
-    free(mapB);
-  } else {
-    //mask using the original mask
-    for (dlong n = 0; n < elliptic->Nmasked; n++)
-      pmesh->maskedGlobalIds[elliptic->maskIds[n]] = 0;
-  }
-
-  //build masked gs handle
-  precon->FEMogs = ogsSetup(Ntotal, pmesh->maskedGlobalIds, mesh->comm, verbose, mesh->device);
-
-  // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = precon->FEMogs->Ngather;
-
-  // create a global numbering system
-  hlong* globalIds = (hlong*) calloc(Ngather,sizeof(hlong));
-  int* owner     = (int*) calloc(Ngather,sizeof(int));
-
-  // every gathered degree of freedom has its own global id
-  hlong* globalStarts = (hlong*) calloc(mesh->size + 1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts + 1, 1, MPI_HLONG, mesh->comm);
-  for(int r = 0; r < mesh->size; ++r)
-    globalStarts[r + 1] = globalStarts[r] + globalStarts[r + 1];
-
-  //use the offsets to set a consecutive global numbering
-  for (dlong n = 0; n < precon->FEMogs->Ngather; n++) {
-    globalIds[n] = n + globalStarts[mesh->rank];
-    owner[n] = mesh->rank;
-  }
-
-  //scatter this numbering to the original nodes
-  hlong* globalNumbering = (hlong*) calloc(Ntotal,sizeof(hlong));
-  int* globalOwners = (int*) calloc(Ntotal,sizeof(int));
-  for (dlong n = 0; n < Ntotal; n++) globalNumbering[n] = -1;
-  ogsScatter(globalNumbering, globalIds, ogsHlong, ogsAdd, precon->FEMogs);
-  ogsScatter(globalOwners, owner, ogsInt, ogsAdd, precon->FEMogs);
-
-  free(globalIds);
-  free(owner);
-
-  if (elliptic->elementType == TRIANGLES || elliptic->elementType == TETRAHEDRA) {
-    //dont need these anymore
-    free(pmesh->vmapM);
-    free(pmesh->vmapP);
-    free(pmesh->mapP);
-    //maybe more cleanup can go here
-  }
-
-  if (elliptic->elementType == TRIANGLES) {
-    //build stiffness matrices
-    femMesh->Srr = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Srs = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Ssr = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Sss = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    for (int n = 0; n < femMesh->Np; n++)
-      for (int m = 0; m < femMesh->Np; m++)
-        for (int k = 0; k < femMesh->Np; k++)
-          for (int l = 0; l < femMesh->Np; l++) {
-            femMesh->Srr[m + n * femMesh->Np] += femMesh->Dr[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dr[m + k * femMesh->Np];
-            femMesh->Srs[m + n * femMesh->Np] += femMesh->Dr[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Ds[m + k * femMesh->Np];
-            femMesh->Ssr[m + n * femMesh->Np] += femMesh->Ds[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dr[m + k * femMesh->Np];
-            femMesh->Sss[m + n * femMesh->Np] += femMesh->Ds[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Ds[m + k * femMesh->Np];
-          }
-  } else if (elliptic->elementType == TETRAHEDRA) {
-    //build stiffness matrices
-    femMesh->Srr = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Srs = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Srt = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Ssr = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Sss = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Sst = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Str = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Sts = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    femMesh->Stt = (dfloat*) calloc(femMesh->Np * femMesh->Np,sizeof(dfloat));
-    for (int n = 0; n < femMesh->Np; n++)
-      for (int m = 0; m < femMesh->Np; m++)
-        for (int k = 0; k < femMesh->Np; k++)
-          for (int l = 0; l < femMesh->Np; l++) {
-            femMesh->Srr[m + n * femMesh->Np] += femMesh->Dr[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dr[m + k * femMesh->Np];
-            femMesh->Srs[m + n * femMesh->Np] += femMesh->Dr[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Ds[m + k * femMesh->Np];
-            femMesh->Srt[m + n * femMesh->Np] += femMesh->Dr[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dt[m + k * femMesh->Np];
-            femMesh->Ssr[m + n * femMesh->Np] += femMesh->Ds[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dr[m + k * femMesh->Np];
-            femMesh->Sss[m + n * femMesh->Np] += femMesh->Ds[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Ds[m + k * femMesh->Np];
-            femMesh->Sst[m + n * femMesh->Np] += femMesh->Ds[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dt[m + k * femMesh->Np];
-            femMesh->Str[m + n * femMesh->Np] += femMesh->Dt[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dr[m + k * femMesh->Np];
-            femMesh->Sts[m + n * femMesh->Np] += femMesh->Dt[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Ds[m + k * femMesh->Np];
-            femMesh->Stt[m + n * femMesh->Np] += femMesh->Dt[n + l * femMesh->Np] *
-                                                 femMesh->MM[k + l * femMesh->Np] *
-                                                 femMesh->Dt[m + k * femMesh->Np];
-          }
-  }
-
-  if (mesh->rank == 0) printf("Building full SEMFEM matrix...");
-  fflush(stdout);
-
-  // Build non-zeros of stiffness matrix (unassembled)
-  dlong nnzLocal = femMesh->Np * femMesh->Np * femMesh->Nelements;
-
-  dlong cnt = 0;
-  nonZero_t* sendNonZeros = (nonZero_t*) calloc(nnzLocal, sizeof(nonZero_t));
-  int* AsendCounts  = (int*) calloc(mesh->size, sizeof(int));
-  int* ArecvCounts  = (int*) calloc(mesh->size, sizeof(int));
-  int* AsendOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
-  int* ArecvOffsets = (int*) calloc(mesh->size + 1, sizeof(int));
-
-  //Build unassembed non-zeros
-  switch(elliptic->elementType) {
-  case TRIANGLES:
-    BuildFEMMatrixTri2D(femMesh,
-                        pmesh,
-                        lambda,
-                        localIds,
-                        globalNumbering,
-                        globalOwners,
-                        &cnt,
-                        sendNonZeros);
-    break;
-  case QUADRILATERALS:
-    BuildFEMMatrixQuad2D(femMesh,
-                         pmesh,
-                         lambda,
-                         localIds,
-                         globalNumbering,
-                         globalOwners,
-                         &cnt,
-                         sendNonZeros);
-    break;
-  case TETRAHEDRA:
-    BuildFEMMatrixTet3D(femMesh,
-                        pmesh,
-                        lambda,
-                        localIds,
-                        globalNumbering,
-                        globalOwners,
-                        &cnt,
-                        sendNonZeros);
-    break;
-  case HEXAHEDRA:
-    BuildFEMMatrixHex3D(femMesh,
-                        pmesh,
-                        lambda,
-                        localIds,
-                        globalNumbering,
-                        globalOwners,
-                        &cnt,
-                        sendNonZeros);
-    break;
-  }
-
-  // Make the MPI_NONZERO_T data type
-  MPI_Datatype MPI_NONZERO_T;
-  MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
-  int blength[4] = {1, 1, 1, 1};
-  MPI_Aint addr[4], displ[4];
-  MPI_Get_address ( &(sendNonZeros[0]          ), addr + 0);
-  MPI_Get_address ( &(sendNonZeros[0].col      ), addr + 1);
-  MPI_Get_address ( &(sendNonZeros[0].ownerRank), addr + 2);
-  MPI_Get_address ( &(sendNonZeros[0].val      ), addr + 3);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  MPI_Type_create_struct (4, blength, displ, dtype, &MPI_NONZERO_T);
-  MPI_Type_commit (&MPI_NONZERO_T);
-
-  // count how many non-zeros to send to each process
-  for(dlong n = 0; n < cnt; ++n)
-    AsendCounts[sendNonZeros[n].ownerRank]++;
-
-  // sort by row ordering
-  qsort(sendNonZeros, cnt, sizeof(nonZero_t), parallelCompareRowColumn);
-
-  // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh->comm);
-
-  // find send and recv offsets for gather
-  dlong nnz = 0;
-  for(int r = 0; r < mesh->size; ++r) {
-    AsendOffsets[r + 1] = AsendOffsets[r] + AsendCounts[r];
-    ArecvOffsets[r + 1] = ArecvOffsets[r] + ArecvCounts[r];
-    nnz += ArecvCounts[r];
-  }
-
-  nonZero_t* A = (nonZero_t*) calloc(nnz, sizeof(nonZero_t));
-
-  // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, MPI_NONZERO_T,
-                A, ArecvCounts, ArecvOffsets, MPI_NONZERO_T,
-                mesh->comm);
-
-  // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  qsort(A, nnz, sizeof(nonZero_t), parallelCompareRowColumn);
-
-  // compress duplicates
-  cnt = 0;
-  for(dlong n = 1; n < nnz; ++n) {
-    if(A[n].row == A[cnt].row && A[n].col == A[cnt].col) {
-      A[cnt].val += A[n].val;
-    } else{
-      ++cnt;
-      A[cnt] = A[n];
-    }
-  }
-  if (nnz) cnt++;
-  nnz = cnt;
-
-  if(mesh->rank == 0) printf("done.\n");
-
-  MPI_Barrier(mesh->comm);
-  MPI_Type_free(&MPI_NONZERO_T);
-
-  hlong* Rows = (hlong*) calloc(nnz, sizeof(hlong));
-  hlong* Cols = (hlong*) calloc(nnz, sizeof(hlong));
-  dfloat* Vals = (dfloat*) calloc(nnz,sizeof(dfloat));
-
-  for (dlong n = 0; n < nnz; n++) {
-    Rows[n] = A[n].row;
-    Cols[n] = A[n].col;
-    Vals[n] = A[n].val;
-  }
-  free(A);
-
-  precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options);
-  parAlmond::AMGSetup(precon->parAlmond,
-                      globalStarts,
-                      nnz,
-                      Rows,
-                      Cols,
-                      Vals,
-                      elliptic->allNeumann,
-                      elliptic->allNeumannPenalty);
-  free(Rows);
-  free(Cols);
-  free(Vals);
-
-  if (options.compareArgs("VERBOSE", "TRUE"))
-    parAlmond::Report(precon->parAlmond);
-
-  if (elliptic->elementType == TRIANGLES || elliptic->elementType == TETRAHEDRA) {
-    // //tell parAlmond not to gather this level (its done manually)
-    // agmgLevel *baseLevel = precon->parAlmond->levels[0];
-    // baseLevel->gatherLevel = false;
-    // baseLevel->weightedInnerProds = false;
-
-    // build interp and anterp
-    dfloat* SEMFEMAnterp = (dfloat*) calloc(mesh->NpFEM * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->NpFEM; ++n)
-      for(int m = 0; m < mesh->Np; ++m)
-        SEMFEMAnterp[n + m * mesh->NpFEM] = mesh->SEMFEMInterp[n * mesh->Np + m];
-
-    mesh->o_SEMFEMInterp = mesh->device.malloc(mesh->NpFEM * mesh->Np * sizeof(dfloat),
-                                               mesh->SEMFEMInterp);
-    mesh->o_SEMFEMAnterp =
-      mesh->device.malloc(mesh->NpFEM * mesh->Np * sizeof(dfloat),SEMFEMAnterp);
-
-    free(SEMFEMAnterp);
-
-    precon->o_rFEM = mesh->device.malloc(mesh->Nelements * mesh->NpFEM * sizeof(dfloat));
-    precon->o_zFEM = mesh->device.malloc(mesh->Nelements * mesh->NpFEM * sizeof(dfloat));
-
-    precon->o_GrFEM = mesh->device.malloc(precon->FEMogs->Ngather * sizeof(dfloat));
-    precon->o_GzFEM = mesh->device.malloc(precon->FEMogs->Ngather * sizeof(dfloat));
-  } else {
-    // //tell parAlmond to gather this level
-    // agmgLevel *baseLevel = precon->parAlmond->levels[0];
-
-    // baseLevel->gatherLevel = true;
-    parAlmond::multigridLevel* baseLevel = precon->parAlmond->levels[0];
-    precon->rhsG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
-    precon->xG   = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
-    precon->o_rhsG = mesh->device.malloc(baseLevel->Ncols * sizeof(dfloat));
-    precon->o_xG   = mesh->device.malloc(baseLevel->Ncols * sizeof(dfloat));
-
-    // baseLevel->Srhs = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
-    // baseLevel->Sx   = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
-    // baseLevel->o_Srhs = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
-    // baseLevel->o_Sx   = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
-
-    // baseLevel->weightedInnerProds = false;
-
-    // baseLevel->gatherArgs = (void **) calloc(3,sizeof(void*));
-    // baseLevel->gatherArgs[0] = (void *) elliptic;
-    // baseLevel->gatherArgs[1] = (void *) precon->FEMogs;  //use the gs made from the partial gathered femgrid
-    // baseLevel->gatherArgs[2] = (void *) &(baseLevel->o_Sx);
-    // baseLevel->scatterArgs = baseLevel->gatherArgs;
-
-    // baseLevel->device_gather  = ellipticGather;
-    // baseLevel->device_scatter = ellipticScatter;
-  }
-}
-
-void BuildFEMMatrixTri2D(mesh_t* femMesh, mesh_t* pmesh, dfloat lambda,
-                         dlong* localIds, hlong* globalNumbering, int* globalOwners,
-                         dlong* cnt, nonZero_t* A)
-{
-#pragma omp parallel for
-  for (dlong e = 0; e < femMesh->Nelements; e++) {
-    for (int n = 0; n < femMesh->Np; n++) {
-      dlong idn = localIds[e * femMesh->Np + n];
-      if (globalNumbering[idn] < 0) continue; //skip masked nodes
-      for (int m = 0; m < femMesh->Np; m++) {
-        dlong idm = localIds[e * femMesh->Np + m];
-        if (globalNumbering[idm] < 0) continue; //skip masked nodes
-
-        dfloat val = 0.;
-
-        dfloat Grr = femMesh->ggeo[e * femMesh->Nggeo + G00ID];
-        dfloat Grs = femMesh->ggeo[e * femMesh->Nggeo + G01ID];
-        dfloat Gss = femMesh->ggeo[e * femMesh->Nggeo + G11ID];
-        dfloat J   = femMesh->ggeo[e * femMesh->Nggeo + GWJID];
-
-        val += Grr * femMesh->Srr[m + n * femMesh->Np];
-        val += Grs * femMesh->Srs[m + n * femMesh->Np];
-        val += Grs * femMesh->Ssr[m + n * femMesh->Np];
-        val += Gss * femMesh->Sss[m + n * femMesh->Np];
-        val += J * lambda * femMesh->MM[m + n * femMesh->Np];
-
-        dfloat nonZeroThreshold = 1e-7;
-        if (fabs(val) > nonZeroThreshold) {
-#pragma omp critical
-          {
-            // pack non-zero
-            A[*cnt].val = val;
-            A[*cnt].row = globalNumbering[idn];
-            A[*cnt].col = globalNumbering[idm];
-            A[*cnt].ownerRank = globalOwners[idn];
-            (*cnt)++;
-          }
-        }
-      }
-    }
-  }
-}
-
-void BuildFEMMatrixQuad2D(mesh_t* femMesh, mesh_t* pmesh, dfloat lambda,
-                          dlong* localIds, hlong* globalNumbering, int* globalOwners,
-                          dlong* cnt, nonZero_t* A)
-{
-#pragma omp parallel for
-  for (dlong e = 0; e < femMesh->Nelements; e++) {
-    for (int ny = 0; ny < femMesh->Nq; ny++) {
-      for (int nx = 0; nx < femMesh->Nq; nx++) {
-        dlong idn = localIds[e * femMesh->Np + nx + ny * femMesh->Nq];
-        if (globalNumbering[idn] < 0) continue; //skip masked nodes
-
-        for (int my = 0; my < femMesh->Nq; my++) {
-          for (int mx = 0; mx < femMesh->Nq; mx++) {
-            dlong idm = localIds[e * femMesh->Np + mx + my * femMesh->Nq];
-            if (globalNumbering[idm] < 0) continue; //skip masked nodes
-
-            int id;
-            dfloat val = 0.;
-
-            if (ny == my) {
-              for (int k = 0; k < femMesh->Nq; k++) {
-                id = k + ny * femMesh->Nq;
-                dfloat Grr =
-                  femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G00ID * femMesh->Np];
-
-                val += Grr * femMesh->D[nx + k * femMesh->Nq] * femMesh->D[mx + k * femMesh->Nq];
-              }
-            }
-
-            id = mx + ny * femMesh->Nq;
-            dfloat Grs = femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G01ID * femMesh->Np];
-            val += Grs * femMesh->D[nx + mx * femMesh->Nq] * femMesh->D[my + ny * femMesh->Nq];
-
-            id = nx + my * femMesh->Nq;
-            dfloat Gsr = femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G01ID * femMesh->Np];
-            val += Gsr * femMesh->D[mx + nx * femMesh->Nq] * femMesh->D[ny + my * femMesh->Nq];
-
-            if (nx == mx) {
-              for (int k = 0; k < femMesh->Nq; k++) {
-                id = nx + k * femMesh->Nq;
-                dfloat Gss =
-                  femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G11ID * femMesh->Np];
-
-                val += Gss * femMesh->D[ny + k * femMesh->Nq] * femMesh->D[my + k * femMesh->Nq];
-              }
-            }
-
-            if ((nx == mx) && (ny == my)) {
-              id = nx + ny * femMesh->Nq;
-              dfloat JW =
-                femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + GWJID * femMesh->Np];
-              val += JW * lambda;
-            }
-
-            dfloat nonZeroThreshold = 1e-7;
-            if (fabs(val) > nonZeroThreshold) {
-#pragma omp critical
-              {
-                // pack non-zero
-                A[*cnt].val = val;
-                A[*cnt].row = globalNumbering[idn];
-                A[*cnt].col = globalNumbering[idm];
-                A[*cnt].ownerRank = globalOwners[idn];
-                (*cnt)++;
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void BuildFEMMatrixTet3D(mesh_t* femMesh, mesh_t* pmesh, dfloat lambda,
-                         dlong* localIds, hlong* globalNumbering, int* globalOwners,
-                         dlong* cnt, nonZero_t* A)
-{
-#pragma omp parallel for
-  for (dlong e = 0; e < femMesh->Nelements; e++) {
-    dfloat Grr = femMesh->ggeo[e * femMesh->Nggeo + G00ID];
-    dfloat Grs = femMesh->ggeo[e * femMesh->Nggeo + G01ID];
-    dfloat Grt = femMesh->ggeo[e * femMesh->Nggeo + G02ID];
-    dfloat Gss = femMesh->ggeo[e * femMesh->Nggeo + G11ID];
-    dfloat Gst = femMesh->ggeo[e * femMesh->Nggeo + G12ID];
-    dfloat Gtt = femMesh->ggeo[e * femMesh->Nggeo + G22ID];
-    dfloat J   = femMesh->ggeo[e * femMesh->Nggeo + GWJID];
-
-    for (int n = 0; n < femMesh->Np; n++) {
-      dlong idn = localIds[e * femMesh->Np + n];
-      if (globalNumbering[idn] < 0) continue; //skip masked nodes
-      for (int m = 0; m < femMesh->Np; m++) {
-        dlong idm = localIds[e * femMesh->Np + m];
-        if (globalNumbering[idm] < 0) continue; //skip masked nodes
-
-        dfloat val = 0.;
-        val += Grr * femMesh->Srr[m + n * femMesh->Np];
-        val += Grs * femMesh->Srs[m + n * femMesh->Np];
-        val += Grt * femMesh->Srt[m + n * femMesh->Np];
-        val += Grs * femMesh->Ssr[m + n * femMesh->Np];
-        val += Gss * femMesh->Sss[m + n * femMesh->Np];
-        val += Gst * femMesh->Sst[m + n * femMesh->Np];
-        val += Grt * femMesh->Str[m + n * femMesh->Np];
-        val += Gst * femMesh->Sts[m + n * femMesh->Np];
-        val += Gtt * femMesh->Stt[m + n * femMesh->Np];
-        val += J * lambda * femMesh->MM[m + n * femMesh->Np];
-
-        dfloat nonZeroThreshold = 1e-7;
-        if (fabs(val) > nonZeroThreshold) {
-#pragma omp critical
-          {
-            // pack non-zero
-            A[*cnt].val = val;
-            A[*cnt].row = globalNumbering[idn];
-            A[*cnt].col = globalNumbering[idm];
-            A[*cnt].ownerRank = globalOwners[idn];
-            (*cnt)++;
-          }
-        }
-      }
-    }
-  }
-}
-
-void BuildFEMMatrixHex3D(mesh_t* femMesh, mesh_t* pmesh, dfloat lambda,
-                         dlong* localIds, hlong* globalNumbering, int* globalOwners,
-                         dlong* cnt, nonZero_t* A)
-{
-#pragma omp parallel for
-  for (dlong e = 0; e < femMesh->Nelements; e++) {
-    for (int nz = 0; nz < femMesh->Nq; nz++) {
-      for (int ny = 0; ny < femMesh->Nq; ny++) {
-        for (int nx = 0; nx < femMesh->Nq; nx++) {
-          dlong nn = nx + ny * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-          dlong idn = localIds[e * femMesh->Np + nn];
-          if (globalNumbering[idn] < 0) continue; //skip masked nodes
-
-          for (int mz = 0; mz < femMesh->Nq; mz++) {
-            for (int my = 0; my < femMesh->Nq; my++) {
-              for (int mx = 0; mx < femMesh->Nq; mx++) {
-                dlong mm = mx + my * femMesh->Nq + mz * femMesh->Nq * femMesh->Nq;
-                dlong idm = localIds[e * femMesh->Np + mm];
-                if (globalNumbering[idm] < 0) continue; //skip masked nodes
-
-                int id;
-                dfloat val = 0.;
-
-                if ((ny == my) && (nz == mz)) {
-                  for (int k = 0; k < femMesh->Nq; k++) {
-                    id = k + ny * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-                    dfloat Grr =
-                      femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G00ID * femMesh->Np];
-
-                    val += Grr * femMesh->D[nx + k * femMesh->Nq] *
-                           femMesh->D[mx + k * femMesh->Nq];
-                  }
-                }
-
-                if (nz == mz) {
-                  id = mx + ny * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-                  dfloat Grs =
-                    femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G01ID * femMesh->Np];
-                  val += Grs * femMesh->D[nx + mx * femMesh->Nq] *
-                         femMesh->D[my + ny * femMesh->Nq];
-
-                  id = nx + my * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-                  dfloat Gsr =
-                    femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G01ID * femMesh->Np];
-                  val += Gsr * femMesh->D[mx + nx * femMesh->Nq] *
-                         femMesh->D[ny + my * femMesh->Nq];
-                }
-
-                if (ny == my) {
-                  id = mx + ny * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-                  dfloat Grt =
-                    femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G02ID * femMesh->Np];
-                  val += Grt * femMesh->D[nx + mx * femMesh->Nq] *
-                         femMesh->D[mz + nz * femMesh->Nq];
-
-                  id = nx + ny * femMesh->Nq + mz * femMesh->Nq * femMesh->Nq;
-                  dfloat Gst =
-                    femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G02ID * femMesh->Np];
-                  val += Gst * femMesh->D[mx + nx * femMesh->Nq] *
-                         femMesh->D[nz + mz * femMesh->Nq];
-                }
-
-                if ((nx == mx) && (nz == mz)) {
-                  for (int k = 0; k < femMesh->Nq; k++) {
-                    id = nx + k * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-                    dfloat Gss =
-                      femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G11ID * femMesh->Np];
-
-                    val += Gss * femMesh->D[ny + k * femMesh->Nq] *
-                           femMesh->D[my + k * femMesh->Nq];
-                  }
-                }
-
-                if (nx == mx) {
-                  id = nx + my * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-                  dfloat Gst =
-                    femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G12ID * femMesh->Np];
-                  val += Gst * femMesh->D[ny + my * femMesh->Nq] *
-                         femMesh->D[mz + nz * femMesh->Nq];
-
-                  id = nx + ny * femMesh->Nq + mz * femMesh->Nq * femMesh->Nq;
-                  dfloat Gts =
-                    femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G12ID * femMesh->Np];
-                  val += Gts * femMesh->D[my + ny * femMesh->Nq] *
-                         femMesh->D[nz + mz * femMesh->Nq];
-                }
-
-                if ((nx == mx) && (ny == my)) {
-                  for (int k = 0; k < femMesh->Nq; k++) {
-                    id = nx + ny * femMesh->Nq + k * femMesh->Nq * femMesh->Nq;
-                    dfloat Gtt =
-                      femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + G22ID * femMesh->Np];
-
-                    val += Gtt * femMesh->D[nz + k * femMesh->Nq] *
-                           femMesh->D[mz + k * femMesh->Nq];
-                  }
-                }
-
-                if ((nx == mx) && (ny == my) && (nz == mz)) {
-                  id = nx + ny * femMesh->Nq + nz * femMesh->Nq * femMesh->Nq;
-                  dfloat JW =
-                    femMesh->ggeo[e * femMesh->Np * femMesh->Nggeo + id + GWJID * femMesh->Np];
-                  val += JW * lambda;
-                }
-
-                // pack non-zero
-                dfloat nonZeroThreshold = 1e-7;
-                if (fabs(val) >= nonZeroThreshold) {
-#pragma omp critical
-                  {
-                    A[*cnt].val = val;
-                    A[*cnt].row = globalNumbering[idn];
-                    A[*cnt].col = globalNumbering[idm];
-                    A[*cnt].ownerRank = globalOwners[idn];
-                    (*cnt)++;
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/libP/solvers/elliptic/src/setup.rc b/src/libP/solvers/elliptic/src/setup.rc
deleted file mode 100644
index 258637cd7..000000000
--- a/src/libP/solvers/elliptic/src/setup.rc
+++ /dev/null
@@ -1,44 +0,0 @@
-[FORMAT]
-1.0
-[DATA FILE]
-data / ellipticHomogeneous2D.h
-[MESH FILE]
-../../ meshes / cavityH05.msh
-[MESH DIMENSION]
-2
-[ELEMENT TYPE]
-3
-[POLYNOMIAL DEGREE]
-6
-[THREAD MODEL]
-CUDA
-[PLATFORM NUMBER]
-0
-[DEVICE NUMBER]
-0
-[LAMBDA]
-10
-[KRYLOV SOLVER]
-PCG + FLEXIBLE
-[DISCRETIZATION]
-IPDG
-[BASIS]
-NODAL
-[PRECONDITIONER]
-JACOBI
-[MULTIGRID COARSENING]
-HALFDOFS
-[MULTIGRID SMOOTHER]
-CHEBYSHEV
-[PARALMOND CYCLE]
-VCYCLE
-[PARALMOND SMOOTHER]
-CHEBYSHEV
-[PARALMOND PARTITION]
-DISTRIBUTED
-[PARALMOND AGGREGATION STRATEGY]
-DEFAULT
-[PARALMOND LPSCN ORDERING]
-MAX
-[VERBOSE]
-FALSE
diff --git a/src/libP/src/hash.c b/src/libP/src/hash.c
deleted file mode 100644
index e2f218ebc..000000000
--- a/src/libP/src/hash.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <math.h>
-
-unsigned int hash(const unsigned int value)
-{
-  const int p[8] = {
-    102679, 102701, 102761, 102763,
-    102769, 102793, 102797, 102811
-  };
-  int h[8] = {
-    101527, 101531, 101533, 101537,
-    101561, 101573, 101581, 101599
-  };
-
-  const char* ptr = (char*) &value;
-
-  for (int i = 0; i < sizeof(unsigned int); ++i)
-    for (int j = 0; j < 8; ++j)
-      h[j] = (h[j] * p[j]) ^ ptr[i];
-
-  unsigned int ret = h[0];
-  for (int i = 1; i < 8; ++i)
-    ret ^= h[i];
-
-  return h[0];
-}
diff --git a/src/libP/src/matrixEig.cpp b/src/libP/src/matrixEig.cpp
deleted file mode 100644
index e1a90ebe1..000000000
--- a/src/libP/src/matrixEig.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include <unistd.h>
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "types.h"
-#include <mesh.h>
-
-extern "C" {
-  void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI,
-              double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO );
-}
-
-// compute right eigenvectors
-void matrixEig(int N, dfloat *A, dfloat *VR, dfloat *WR, dfloat *WI){
-
-  char JOBVL = 'N';
-  char JOBVR = 'V';
-  int LDA = N;
-  int LDVL = N;
-  int LDVR = N;
-  int LWORK = 8*N;
-
-  double *tmpA  = (double*) calloc(N*N,sizeof(double));
-  double *tmpWR = (double*) calloc(N,sizeof(double));
-  double *tmpWI = (double*) calloc(N,sizeof(double));
-  double *tmpVR = (double*) calloc(N*N,sizeof(double));
-  double *tmpVL = NULL;
-  double *WORK  = (double*) calloc(LWORK,sizeof(double));
-
-  int info;
-
-  for(int n=0;n<N;++n){
-    for(int m=0;m<N;++m){
-      tmpA[n+m*N] = A[n*N+m];
-    }
-  }
-
-  dgeev_ (&JOBVL, &JOBVR, &N, tmpA, &LDA, tmpWR, tmpWI, tmpVL, &LDVL, tmpVR, &LDVR, WORK, &LWORK, &info);
-
-  for(int n=0;n<N;++n){
-    WR[n] = tmpWR[n];
-    WI[n] = tmpWI[n];
-    for(int m=0;m<N;++m){
-      VR[n+m*N] = tmpVR[n*N+m];
-    }
-  }
-}
\ No newline at end of file
diff --git a/src/libP/src/matrixRightSolve.cpp b/src/libP/src/matrixRightSolve.cpp
deleted file mode 100644
index 4fb5f814d..000000000
--- a/src/libP/src/matrixRightSolve.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include <unistd.h>
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "types.h"
-#include <mesh.h>
-
-extern "C" {
-  void dgesv_ ( int     *N, int     *NRHS, double  *A,
-                int     *LDA,
-                int     *IPIV,
-                double  *B,
-                int     *LDB,
-                int     *INFO );
-}
-
-// C = A/B  = trans(trans(B)\trans(A))
-// assume row major
-void matrixRightSolve(int NrowsA, int NcolsA, dfloat *A, int NrowsB, int NcolsB, dfloat *B, dfloat *C){
-
-  int info;
-
-  int NrowsX = NcolsB;
-  int NcolsX = NrowsB;
-
-  int NrowsY = NcolsA;
-  int NcolsY = NrowsA;
-
-  int lwork = NrowsX*NcolsX;
-
-  // compute inverse mass matrix
-  double *tmpX = (double*) calloc(NrowsX*NcolsX, sizeof(double));
-  double *tmpY = (double*) calloc(NrowsY*NcolsY, sizeof(double));
-
-  int    *ipiv = (int*) calloc(NrowsX, sizeof(int));
-  double *work = (double*) calloc(lwork, sizeof(double));
-
-  for(int n=0;n<NrowsX*NcolsX;++n){
-    tmpX[n] = B[n];
-  }
-
-  for(int n=0;n<NrowsY*NcolsY;++n){
-    tmpY[n] =A[n];
-  }
-
-  dgesv_(&NrowsX, &NcolsY, tmpX, &NrowsX, ipiv, tmpY, &NrowsY, &info); // ?
-
-  for(int n=0;n<NrowsY*NcolsY;++n){
-    C[n] = tmpY[n];
-  }
-
-  if(info)
-    printf("matrixRightSolve: dgesv reports info = %d when inverting matrix\n", info);
-
-  free(work);
-  free(ipiv);
-  free(tmpX);
-  free(tmpY);
-}
\ No newline at end of file
diff --git a/src/libP/src/matrixTranspose.cpp b/src/libP/src/matrixTranspose.cpp
deleted file mode 100644
index e96950f42..000000000
--- a/src/libP/src/matrixTranspose.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include <unistd.h>
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <mesh.h>
-
-void matrixTranspose(const int M, const int N,
-                     const dfloat  *A, const int LDA,
-                           dfloat *AT, const int LDAT) {
-  //A & A^T - Row major ordering
-  //M = number of rows of A, columns of A^T
-  //N = number of columns of A, rows of A^T
-  //LDA  - leading dimension of A (>=M)
-  //LDAT - leading dimension of A^T (>=N)
-
-  //quick return
-  if (N<1 || M<1) return;
-
-  //check for weird input
-  if (LDA<N || LDAT<M) {
-    printf("Bad input to matrixTranspose\n");
-    return;
-  }
-
-  for (int n=0;n<N;n++) { //for all cols of A^T
-    for (int m=0;m<M;m++) { //for all rows of A^T
-      AT[n*LDAT+m] = A[m*LDA+n];
-    }
-  }
-}
\ No newline at end of file
diff --git a/src/libP/src/meshBasis1D.cpp b/src/libP/src/meshBasis1D.cpp
deleted file mode 100644
index b2b807375..000000000
--- a/src/libP/src/meshBasis1D.cpp
+++ /dev/null
@@ -1,526 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.h"
-
-// ------------------------------------------------------------------------
-// 1D NODES
-// ------------------------------------------------------------------------
-void Nodes1D(int _N, dfloat *_r){
-  JacobiGLL(_N, _r); //Gauss-Legendre-Lobatto nodes
-}
-
-void EquispacedNodes1D(int _N, dfloat *_r){
-  int _Nq = _N+1;
-
-  dfloat dr = 2.0/_N;
-  for (int i=0;i<_Nq;i++) _r[i] = -1.0 + i*dr;
-}
-
-// ------------------------------------------------------------------------
-// ORTHONORMAL BASIS POLYNOMIALS
-// ------------------------------------------------------------------------
-void OrthonormalBasis1D(dfloat a, int i, dfloat *P){
-  *P = JacobiP(a,0,0,i); //Legendre Polynomials
-}
-
-void GradOrthonormalBasis1D(dfloat a, int i, dfloat *Pr){
-  *Pr = GradJacobiP(a,0,0,i);
-}
-
-// ------------------------------------------------------------------------
-// 1D VANDERMONDE MATRICES
-// ------------------------------------------------------------------------
-void Vandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *V){
-
-  int _Np = (_N+1);
-
-  for(int n=0; n<Npoints; n++){
-    for(int i=0; i<_Np; i++){
-      int id = n*_Np+i;
-      OrthonormalBasis1D(_r[n], i, V+id);
-    }
-  }
-}
-
-void GradVandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *Vr){
-
-  int _Np = (_N+1);
-
-  for(int n=0; n<Npoints; n++){
-    for(int i=0; i<_Np; i++){
-      int id = n*_Np+i;
-      GradOrthonormalBasis1D(_r[n], i, Vr+id);
-    }
-  }
-}
-
-// ------------------------------------------------------------------------
-// 1D OPERATOR MATRICES
-// ------------------------------------------------------------------------
-void MassMatrix1D(int _Np, dfloat *V, dfloat *_MM){
-
-  // masMatrix = inv(V')*inv(V) = inv(V*V')
-  for(int n=0;n<_Np;++n){
-    for(int m=0;m<_Np;++m){
-      dfloat res = 0;
-      for(int i=0;i<_Np;++i){
-        res += V[n*_Np+i]*V[m*_Np+i];
-      }
-      _MM[n*_Np + m] = res;
-    }
-  }
-  matrixInverse(_Np, _MM);
-}
-
-void Dmatrix1D(int _N, int NpointsIn, dfloat *_rIn,
-                               int NpointsOut, dfloat *_rOut, dfloat *_Dr){
-
-  // need NpointsIn = (_N+1)
-  if (NpointsIn != _N+1){
-    std::cout << "Invalid Differentiation operator requested.\n";
-    exit(-1);
-  }
-
-  int _Np = _N+1;
-
-  dfloat *V  = (dfloat *) calloc(NpointsIn*_Np, sizeof(dfloat));
-  dfloat *Vr = (dfloat *) calloc(NpointsOut*_Np, sizeof(dfloat));
-
-  Vandermonde1D(_N, NpointsIn, _rIn, V);
-  GradVandermonde1D(_N, NpointsOut, _rOut, Vr);
-
-  //D = Vr/V
-  matrixRightSolve(NpointsOut, _Np, Vr, _Np, _Np, V, _Dr);
-
-  free(V);
-  free(Vr);
-}
-
-void DWmatrix1D(int _N, dfloat *_D, dfloat *_DT){
-
-  int _Nq = _N+1;
-
-  for(int n=0;n<_Nq;++n){
-    for(int m=0;m<_Nq;++m){
-      _DT[n*_Nq+m] = 0.0;
-      for(int k=0;k<_Nq;++k) _DT[n*_Nq+m] += _D[m*_Nq+k];
-    }
-  }
-
-/*
-  dfloat *r1D  = (dfloat *) calloc(_Nq, sizeof(dfloat));
-  dfloat *w1D  = (dfloat *) calloc(_Nq, sizeof(dfloat));
-  JacobiGLL(_N, r1D, w1D); // i.e. 1D gll points and correspondin weights from mass lumping
-
-  dfloat *V1D  = (dfloat *) calloc(_Nq*_Nq, sizeof(dfloat));
-  dfloat *V1Dr = (dfloat *) calloc(_Nq*_Nq, sizeof(dfloat));
-  Vandermonde1D(_N, _Nq, r1D, V1D);  
-  GradVandermonde1D(_N, _Nq, r1D, V1Dr);
-
-  // DW1D = V*Vr'*diag(w)
-  for(int n=0;n<_Nq;++n){
-    for(int m=0;m<_Nq;++m){
-      dfloat dw = 0;
-      for(int i=0; i<_Nq; i++) dw += V1D[n*_Nq + i]*V1Dr[m*_Nq + i];
-      _DT[n*_Nq+m] = dw; //*w1D[m]; // scale by w
-    }
-  }
-
-  free(r1D);
-  free(w1D);
-  free(V1D);
-  free(V1Dr);
-*/
-}
-
-void InterpolationMatrix1D(int _N,
-                               int NpointsIn, dfloat *rIn,
-                               int NpointsOut, dfloat *rOut,
-                               dfloat *I){
-
-  // need NpointsIn = (_N+1)
-  if (NpointsIn != _N+1){
-    std::cout << "Invalid Interplation operator requested.\n";
-    exit(-1);
-  }
-
-  dfloat *VIn = (dfloat*) malloc(NpointsIn*(_N+1)*sizeof(dfloat));
-  dfloat *VOut= (dfloat*) malloc(NpointsOut*(_N+1)*sizeof(dfloat));
-
-  Vandermonde1D(_N, NpointsIn,   rIn, VIn);
-  Vandermonde1D(_N, NpointsOut, rOut, VOut);
-
-  matrixRightSolve(NpointsOut, _N+1, VOut, NpointsIn, _N+1, VIn, I);
-
-  free(VIn); free(VOut);
-}
-
-void DegreeRaiseMatrix1D(int Nc, int Nf, dfloat *P){
-
-  int Nqc = Nc+1;
-  int Nqf = Nf+1;
-
-  dfloat *rc = (dfloat *) malloc(Nqc*sizeof(dfloat));
-  dfloat *rf = (dfloat *) malloc(Nqf*sizeof(dfloat));
-
-  Nodes1D(Nc, rc);
-  Nodes1D(Nf, rf);
-
-  InterpolationMatrix1D(Nc, Nqc, rc, Nqf, rf, P);
-
-  free(rc); free(rf);
-}
-
-void CubatureWeakDmatrix1D(int _Nq, int _cubNq,
-                                     dfloat *_cubProject, dfloat *_cubD, dfloat *_cubPDT){
-
-  // cubPDT = cubProject*cubD';
-  for(int n=0;n<_Nq;++n){
-    for(int m=0;m<_cubNq;++m){
-      _cubPDT[n*_cubNq+m] = 0.0;
-      for(int k=0;k<_cubNq;++k){
-        _cubPDT[n*_cubNq+m] += _cubProject[n*_cubNq+k]*_cubD[m*_cubNq+k];
-      }
-    }
-  }
-}
-
-// ------------------------------------------------------------------------
-// 1D JACOBI POLYNOMIALS
-// ------------------------------------------------------------------------
-static dfloat mygamma(dfloat x){
-  dfloat lgam = lgamma(x);
-  dfloat gam  = signgam*exp(lgam);
-  return gam;
-}
-
-dfloat JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){
-
-  dfloat ax = a;
-
-  dfloat *P = (dfloat *) calloc((_N+1), sizeof(dfloat));
-
-  // Zero order
-  dfloat gamma0 = pow(2,(alpha+beta+1))/(alpha+beta+1)*mygamma(1+alpha)*mygamma(1+beta)/mygamma(1+alpha+beta);
-  dfloat p0     = 1.0/sqrt(gamma0);
-
-  if (_N==0){ free(P); return p0;}
-  P[0] = p0;
-
-  // first order
-  dfloat gamma1 = (alpha+1)*(beta+1)/(alpha+beta+3)*gamma0;
-  dfloat p1     = ((alpha+beta+2)*ax/2 + (alpha-beta)/2)/sqrt(gamma1);
-  if (_N==1){free(P); return p1;}
-
-  P[1] = p1;
-
-  /// Repeat value in recurrence.
-  dfloat aold = 2/(2+alpha+beta)*sqrt((alpha+1.)*(beta+1.)/(alpha+beta+3.));
-  /// Forward recurrence using the symmetry of the recurrence.
-  for(int i=1;i<=_N-1;++i){
-    dfloat h1 = 2.*i+alpha+beta;
-    dfloat anew = 2./(h1+2.)*sqrt( (i+1.)*(i+1.+alpha+beta)*(i+1+alpha)*(i+1+beta)/(h1+1)/(h1+3));
-    dfloat bnew = -(alpha*alpha-beta*beta)/h1/(h1+2);
-    P[i+1] = 1./anew*( -aold*P[i-1] + (ax-bnew)*P[i]);
-    aold =anew;
-  }
-
-  dfloat pN = P[_N];
-  free(P);
-  return pN;
-}
-
-dfloat GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){
-
-  dfloat PNr = 0;
-
-  if(_N>0)
-    PNr = sqrt(_N*(_N+alpha+beta+1.))*JacobiP(a, alpha+1.0, beta+1.0, _N-1);
-
-  return PNr;
-}
-
-// ------------------------------------------------------------------------
-// 1D GAUSS-LEGENDRE-LOBATTO QUADRATURE
-// ------------------------------------------------------------------------
-void JacobiGLL(int _N, dfloat *_x, dfloat *_w){
-
-  _x[0] = -1.;
-  _x[_N] =  1.;
-
-  if(_N>1){
-    dfloat *wtmp = (dfloat*) calloc(_N-1, sizeof(dfloat));
-    JacobiGQ(1,1, _N-2, _x+1, wtmp);
-    free(wtmp);
-  }
-
-  if (_w!=NULL) {
-    int _Np = _N+1;
-    dfloat *_MM = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-    dfloat  *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-
-    Vandermonde1D(_N, _N+1, _x, V);
-    MassMatrix1D(_N+1, V, _MM);
-
-    // use weights from mass lumping
-    for(int n=0;n<=_N;++n){
-      dfloat res = 0;
-      for(int m=0;m<=_N;++m){
-        res += _MM[n*(_N+1)+m];
-      }
-      _w[n] = res;
-    }
-  }
-}
-
-// ------------------------------------------------------------------------
-// 1D GAUSS QUADRATURE
-// ------------------------------------------------------------------------
-void JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w){
-
-  // function NGQ = JacobiGQ(alpha,beta,_N, _x, _w)
-  // Purpose: Compute the _N'th order Gauss quadrature points, _x,
-  //          and weights, _w, associated with the Jacobi
-  //          polynomial, of type (alpha,beta) > -1 ( <> -0.5).
-  if (_N==0){
-    _x[0] = (alpha-beta)/(alpha+beta+2);
-    _w[0] = 2;
-  }
-
-  // Form symmetric matrix from recurrence.
-  dfloat *J = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
-  dfloat *h1 = (dfloat*) calloc(_N+1, sizeof(dfloat));
-
-  for(int n=0;n<=_N;++n){
-    h1[n] = 2*n+alpha+beta;
-  }
-
-  // J = J + J';
-  for(int n=0;n<=_N;++n){
-    // J = diag(-1/2*(alpha^2-beta^2)./(h1+2)./h1) + ...
-    J[n*(_N+1)+n]+= -0.5*(alpha*alpha-beta*beta)/((h1[n]+2)*h1[n])*2; // *2 for symm
-
-    //    diag(2./(h1(1:_N)+2).*sqrt((1:_N).*((1:_N)+alpha+beta).*((1:_N)+alpha).*((1:_N)+beta)./(h1(1:_N)+1)./(h1(1:_N)+3)),1);
-    if(n<_N){
-      J[n*(_N+1)+n+1]   += (2./(h1[n]+2.))*sqrt((n+1)*(n+1+alpha+beta)*(n+1+alpha)*(n+1+beta)/((h1[n]+1)*(h1[n]+3)));
-      J[(n+1)*(_N+1)+n] += (2./(h1[n]+2.))*sqrt((n+1)*(n+1+alpha+beta)*(n+1+alpha)*(n+1+beta)/((h1[n]+1)*(h1[n]+3)));
-    }
-  }
-
-  dfloat eps = 1;
-  while(1+eps>1){
-    eps = eps/2.;
-  }
-  // printf("MACHINE PRECISION %e\n", eps);
-
-  if (alpha+beta<10*eps) J[0] = 0;
-
-  // Compute quadrature by eigenvalue solve
-
-  //  [V,D] = eig(J);
-  dfloat *WR = (dfloat*) calloc(_N+1, sizeof(dfloat));
-  dfloat *WI = (dfloat*) calloc(_N+1, sizeof(dfloat));
-  dfloat *VR = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
-
-  // _x = diag(D);
-  matrixEig(_N+1, J, VR, _x, WI);
-
-  //_w = (V(1,:)').^2*2^(alpha+beta+1)/(alpha+beta+1)*gamma(alpha+1)*.gamma(beta+1)/gamma(alpha+beta+1);
-  for(int n=0;n<=_N;++n){
-    _w[n] = pow(VR[0*(_N+1)+n],2)*(pow(2,alpha+beta+1)/(alpha+beta+1))*mygamma(alpha+1)*mygamma(beta+1)/mygamma(alpha+beta+1);
-  }
-
-  // sloppy sort
-  for(int n=0;n<=_N;++n){
-    for(int m=n+1;m<=_N;++m){
-      if(_x[n]>_x[m]){
-        dfloat tmpx = _x[m];
-        dfloat tmpw = _w[m];
-        _x[m] = _x[n];
-        _w[m] = _w[n];
-        _x[n] = tmpx;
-        _w[n] = tmpw;
-      }
-    }
-  }
-
-#if 0
-  for(int n=0;n<=_N;++n){
-    printf("zgl[%d] = % e, wgl[%d] = % e\n", n, _x[0][n], n, _w[0][n]);
-  }
-#endif
-
-  free(WR);
-  free(WI);
-  free(VR);
-}
-
-/*
-// C0 basis
-int meshContinuousVandermonde1D(int _N, int Npoints, dfloat *_r, dfloat **V, dfloat **Vr){
-
-  int _Np = (_N+1);
-
-  *V  = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-
-  for(int n=0; n<Npoints; n++){
-
-    int sk = 0;
-    for(int i=0; i<=_N; i++){
-      int id = n*_Np+sk;
-      if(i==0){
-        V[0][id] = 0.5*(1-_r[n]);
-        Vr[0][id] = -0.5;
-      }
-      else  if(i==1){
-        V[0][id] = 0.5*(1+_r[n]);
-        Vr[0][id] = +0.5;
-      }
-      else{
-        // 0.25*(1+_r)*(1-_r)*P^{0,0}_{i-2}(_r)
-        dfloat P =  meshJacobiP(_r[n], 0, 0, i-2);
-        dfloat Pr = meshGradJacobiP(_r[n], 0, 0, i-2);
-        V[0][id]  = 0.25*(1+_r[n])*(1-_r[n])*P;
-        Vr[0][id] = 0.25*( (-2*_r[n])*P + (1+_r[n])*(1-_r[n])*Pr);
-      }
-
-      sk++;
-    }
-  }
-
-  return _Np;
-}
-*/
-
-/*
-void meshContinuousFilterMatrix1D(int _N, int Nlow, dfloat *_r, dfloat **F){
-
-  dfloat *VC0, *VrC0;
-  dfloat *L = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
-  dfloat *LinvF = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
-
-  int _Np = meshContinuousVandermonde1D(_N, _N+1, _r, &VC0, &VrC0);
-  //  int _Np = meshVandermonde1D(_N, _N+1, _r, &VC0, &VrC0); use
-  printf("CONTINUOUS VANDERMONDE MATRIX: [\n");
-  for(int n=0;n<_Np;++n){
-    for(int m=0;m<_Np;++m){
-      printf("% e ", VC0[n*_Np+m]);
-    }
-    printf("\n");
-  }
-  printf("\n");
-
-  *F = (dfloat *) calloc(_Np*_Np, sizeof(dfloat));
-
-  for(int n=0;n<=Nlow;++n){
-    L[n*(_N+1)+n] = 1;
-  }
-
-  matrixRightSolve(_Np, _Np, L, _Np, _Np, VC0, LinvF);
-
-  for(int n=0;n<_Np;++n){
-    for(int m=0;m<_Np;++m){
-      dfloat res = 0;
-      printf("% e ", LinvF[n*_Np+m]);
-    }
-    printf("\n");
-  }
-  printf("\n");
-
-  printf("FILTER MATRIX: [\n");
-  for(int n=0;n<_Np;++n){
-    for(int m=0;m<_Np;++m){
-      dfloat res = 0;
-      for(int i=0;i<_Np;++i){
-        res += VC0[n*_Np+i]*LinvF[i*_Np+m];
-      }
-      F[0][n*_Np+m] = res;
-      printf("% e ", res);
-    }
-    printf("\n");
-  }
-  printf("\n");
-
-  free(VC0);
-  free(VrC0);
-  free(L);
-  free(LinvF);
-}
-*/
-
-// ------------------------------------------------------------------------
-// 1D INTERPOLATION MATRICES
-// ------------------------------------------------------------------------
-
-/*
-
-*/
-
-/*
-
-void meshCubatureWeakDmatrices1D(int _N, int _Np, dfloat *V,
-                                 int cubNp, dfloat *cubr, dfloat *cubw,
-                                 dfloat **cubDrT, dfloat **cubProject){
-
-  dfloat *cubV, *cubVr;
-
-  meshVandermonde1D(_N, cubNp, cubr, &cubV, &cubVr);
-
-  // cubDrT = V*transpose(cVr)*diag(cubw);
-  // cubProject = V*cV'*diag(cubw); %% relies on (transpose(cV)*diag(cubw)*cV being the identity)
-
-  for(int n=0;n<cubNp;++n){
-    for(int m=0;m<_Np;++m){
-      // scale by cubw
-      cubVr[n*_Np+m] *= cubw[n];
-      cubV[n*_Np+m]  *= cubw[n];
-    }
-  }
-
-  *cubDrT = (dfloat*) calloc(cubNp*_Np, sizeof(dfloat));
-  *cubProject = (dfloat*) calloc(cubNp*_Np, sizeof(dfloat));
-
-  for(int n=0;n<_Np;++n){
-    for(int m=0;m<cubNp;++m){
-      dfloat resP = 0, resDrT = 0;
-
-      for(int i=0;i<_Np;++i){
-        dfloat Vni = V[n*_Np+i];
-        resDrT += Vni*cubVr[m*_Np+i];
-        resP   += Vni*cubV[m*_Np+i];
-      }
-
-      cubDrT[0][n*cubNp+m] = resDrT;
-      cubProject[0][n*cubNp+m] = resP;
-    }
-  }
-
-  free(cubV);
-  free(cubVr);
-}
-*/
diff --git a/src/libP/src/meshBasisHex3D.cpp b/src/libP/src/meshBasisHex3D.cpp
deleted file mode 100644
index 603a34333..000000000
--- a/src/libP/src/meshBasisHex3D.cpp
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.h"
-#include "mesh3D.h"
-
-// ------------------------------------------------------------------------
-// HEX 3D NODES
-// ------------------------------------------------------------------------
-void NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
-  int _Nq = _N+1;
-
-  dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat));
-  JacobiGLL(_N, r1D); //Gauss-Legendre-Lobatto nodes
-
-  //Tensor product
-  for (int k=0;k<_Nq;k++) {
-    for (int j=0;j<_Nq;j++) {
-      for (int i=0;i<_Nq;i++) {
-        _r[i+j*_Nq+k*_Nq*_Nq] = r1D[i];
-        _s[i+j*_Nq+k*_Nq*_Nq] = r1D[j];
-        _t[i+j*_Nq+k*_Nq*_Nq] = r1D[k];
-      }
-    }
-  }
-
-  free(r1D);
-}
-
-
-void FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes){
-  int _Nq = _N+1;
-  int _Nfp = _Nq*_Nq;
-  int _Np = _Nq*_Nq*_Nq;
-
-  int cnt[6];
-  for (int i=0;i<6;i++) cnt[i]=0;
-
-  dfloat deps = 1.;
-  while((1.+deps)>1.)
-    deps *= 0.5;
-
-  const dfloat NODETOL = 1000.*deps;
-
-  for (int n=0;n<_Np;n++) {
-    if(fabs(_t[n]+1)<NODETOL)
-      _faceNodes[0*_Nfp+(cnt[0]++)] = n;
-    if(fabs(_s[n]+1)<NODETOL)
-      _faceNodes[1*_Nfp+(cnt[1]++)] = n;
-    if(fabs(_r[n]-1)<NODETOL)
-      _faceNodes[2*_Nfp+(cnt[2]++)] = n;
-    if(fabs(_s[n]-1)<NODETOL)
-      _faceNodes[3*_Nfp+(cnt[3]++)] = n;
-    if(fabs(_r[n]+1)<NODETOL)
-      _faceNodes[4*_Nfp+(cnt[4]++)] = n;
-    if(fabs(_t[n]-1)<NODETOL)
-      _faceNodes[5*_Nfp+(cnt[5]++)] = n;
-  }
-}
-#if 0
-
-void mesh_t::VertexNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_vertexNodes){
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
-  dfloat deps = 1.;
-  while((1.+deps)>1.)
-    deps *= 0.5;
-
-  const dfloat NODETOL = 1000.*deps;
-
-  for(int n=0;n<_Np;++n){
-    if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)+(_t[n]+1)*(_t[n]+1)<NODETOL)
-      _vertexNodes[0] = n;
-    if( (_r[n]-1)*(_r[n]-1)+(_s[n]+1)*(_s[n]+1)+(_t[n]+1)*(_t[n]+1)<NODETOL)
-      _vertexNodes[1] = n;
-    if( (_r[n]-1)*(_r[n]-1)+(_s[n]-1)*(_s[n]-1)+(_t[n]+1)*(_t[n]+1)<NODETOL)
-      _vertexNodes[2] = n;
-    if( (_r[n]+1)*(_r[n]+1)+(_s[n]-1)*(_s[n]-1)+(_t[n]+1)*(_t[n]+1)<NODETOL)
-      _vertexNodes[3] = n;
-    if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)+(_t[n]-1)*(_t[n]-1)<NODETOL)
-      _vertexNodes[4] = n;
-    if( (_r[n]-1)*(_r[n]-1)+(_s[n]+1)*(_s[n]+1)+(_t[n]-1)*(_t[n]-1)<NODETOL)
-      _vertexNodes[5] = n;
-    if( (_r[n]-1)*(_r[n]-1)+(_s[n]-1)*(_s[n]-1)+(_t[n]-1)*(_t[n]-1)<NODETOL)
-      _vertexNodes[6] = n;
-    if( (_r[n]+1)*(_r[n]+1)+(_s[n]-1)*(_s[n]-1)+(_t[n]-1)*(_t[n]-1)<NODETOL)
-      _vertexNodes[7] = n;
-  }
-}
-
-void mesh_t::EquispacedNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
-  int _Nq = _N+1;
-
-  //Equispaced 1D nodes
-  dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat));
-  dfloat dr = 2.0/_N;
-  for (int i=0;i<_Nq;i++) r1D[i] = -1.0 + i*dr;
-
-  //Tensor product
-  for (int k=0;k<_Nq;k++) {
-    for (int j=0;j<_Nq;j++) {
-      for (int i=0;i<_Nq;i++) {
-        _r[i+j*_Nq+k*_Nq*_Nq] = r1D[i];
-        _s[i+j*_Nq+k*_Nq*_Nq] = r1D[j];
-        _t[i+j*_Nq+k*_Nq*_Nq] = r1D[k];
-      }
-    }
-  }
-
-  free(r1D);
-}
-
-void mesh_t::EquispacedEToVHex3D(int _N, int *_EToV){
-  int _Nq = _N+1;
-  int _Nverts = 4;
-
-  //Tensor product
-  int cnt=0;
-  for (int k=0;k<_N;k++) {
-    for (int j=0;j<_N;j++) {
-      for (int i=0;i<_N;i++) {
-        //tet 1 (0,3,2,7)
-        _EToV[cnt*_Nverts+0] = i  +(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+1] = i+1+(j+1)*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+2] = i  +(j+1)*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+3] = i+1+(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        cnt++;
-
-        //tet 2 (0,1,3,7)
-        _EToV[cnt*_Nverts+0] = i  +(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+1] = i+1+(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+2] = i+1+(j+1)*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+3] = i+1+(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        cnt++;
-
-        //tet 3 (0,2,6,7)
-        _EToV[cnt*_Nverts+0] = i  +(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+1] = i  +(j+1)*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+2] = i  +(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+3] = i+1+(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        cnt++;
-
-        //tet 4 (0,6,4,7)
-        _EToV[cnt*_Nverts+0] = i  +(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+1] = i  +(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+2] = i  +(j  )*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+3] = i+1+(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        cnt++;
-
-        //tet 5 (0,5,1,7)
-        _EToV[cnt*_Nverts+0] = i  +(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+1] = i+1+(j  )*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+2] = i+1+(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+3] = i+1+(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        cnt++;
-
-        //tet 6 (0,4,5,7)
-        _EToV[cnt*_Nverts+0] = i  +(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+1] = i  +(j  )*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+2] = i+1+(j  )*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+3] = i+1+(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        cnt++;
-      }
-    }
-  }
-}
-
-void mesh_t::SEMFEMEToVHex3D(int _N, int *_EToV){
-  int _Nq = _N+1;
-  int _Nverts = 8;
-
-  //Tensor product
-  int cnt=0;
-  for (int k=0;k<_N;k++) {
-    for (int j=0;j<_N;j++) {
-      for (int i=0;i<_N;i++) {
-        _EToV[cnt*_Nverts+0] = i  +(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+1] = i+1+(j  )*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+2] = i+1+(j+1)*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+3] = i  +(j+1)*_Nq+(k  )*_Nq*_Nq;
-        _EToV[cnt*_Nverts+4] = i  +(j  )*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+5] = i+1+(j  )*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+6] = i+1+(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        _EToV[cnt*_Nverts+7] = i  +(j+1)*_Nq+(k+1)*_Nq*_Nq;
-        cnt++;
-      }
-    }
-  }
-}
-
-// ------------------------------------------------------------------------
-// ORTHONORMAL BASIS POLYNOMIALS
-// ------------------------------------------------------------------------
-void mesh_t::OrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *P){
-  *P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k);
-}
-
-void mesh_t::GradOrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt){
-  *Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k);
-  *Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j)*JacobiP(c,0,0,k);
-  *Pt = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*GradJacobiP(c,0,0,k);
-}
-
-// ------------------------------------------------------------------------
-// 2D VANDERMONDE MATRICES
-// ------------------------------------------------------------------------
-
-void mesh_t::VandermondeHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat *V){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
-  for(int n=0; n<Npoints; n++){
-    for(int k=0; k<_Nq; k++){
-      for(int j=0; j<_Nq; j++){
-        for(int i=0; i<_Nq; i++){
-          int id = n*_Np+i+j*_Nq+k*_Nq*_Nq;
-          OrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, V+id);
-        }
-      }
-    }
-  }
-}
-
-void mesh_t::GradVandermondeHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat *Vr, dfloat *Vs, dfloat *Vt){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
-  for(int n=0; n<Npoints; n++){
-    for(int k=0; k<_Nq; k++){
-      for(int j=0; j<_Nq; j++){
-        for(int i=0; i<_Nq; i++){
-          int id = n*_Np+i+j*_Nq+k*_Nq*_Nq;
-          GradOrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, Vr+id, Vs+id, Vt+id);
-        }
-      }
-    }
-  }
-}
-
-// ------------------------------------------------------------------------
-// 2D OPERATOR MATRICES
-// ------------------------------------------------------------------------
-void mesh_t::MassMatrixHex3D(int _Np, dfloat *V, dfloat *_MM){
-
-  // masMatrix = inv(V')*inv(V) = inv(V*V')
-  for(int n=0;n<_Np;++n){
-    for(int m=0;m<_Np;++m){
-      dfloat res = 0;
-      for(int i=0;i<_Np;++i){
-        res += V[n*_Np+i]*V[m*_Np+i];
-      }
-      _MM[n*_Np + m] = res;
-    }
-  }
-  matrixInverse(_Np, _MM);
-}
-
-void mesh_t::LumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_MM){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
-  // LumpedMassMatrix = gllw \ctimes gllw \ctimes gllw
-  for(int k=0;k<_Nq;++k){
-    for(int n=0;n<_Nq;++n){
-      for(int m=0;m<_Nq;++m){
-        int id = n+m*_Nq+k*_Nq*_Nq;
-        _MM[id+id*_Np] = _gllw[n]*_gllw[m]*_gllw[k];
-      }
-    }
-  }
-}
-
-void mesh_t::invLumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_invMM){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
-  // invLumpedMassMatrix = invgllw \ctimes invgllw
-  for(int k=0;k<_Nq;++k){
-    for(int n=0;n<_Nq;++n){
-      for(int m=0;m<_Nq;++m){
-        int id = n+m*_Nq+k*_Nq*_Nq;
-        _invMM[id+id*_Np] = 1.0/(_gllw[n]*_gllw[m]*_gllw[k]);
-      }
-    }
-  }
-}
-
-void mesh_t::DmatrixHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t,
-                                                dfloat *_Dr, dfloat *_Ds, dfloat *_Dt){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
-  dfloat *V  = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vt = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-
-  VandermondeHex3D(_N, Npoints, _r, _s, _t, V);
-  GradVandermondeHex3D(_N, Npoints, _r, _s, _t, Vr, Vs, Vt);
-
-  //Dr = Vr/V, Ds = Vs/V, Dt = Vt/V
-  matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
-  matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
-  matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt);
-
-  free(V); free(Vr); free(Vs); free(Vt);
-}
-
-void mesh_t::InterpolationMatrixHex3D(int _N,
-                               int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn,
-                               int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut,
-                               dfloat *I){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
-  // need NpointsIn = _Np
-  if (NpointsIn != _Np)
-    LIBP_ABORT(string("Invalid Interplation operator requested."))
-
-  dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat));
-  dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat));
-
-  VandermondeHex3D(_N, NpointsIn,   rIn, sIn, tIn, VIn);
-  VandermondeHex3D(_N, NpointsOut, rOut, sOut, tOut, VOut);
-
-  matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I);
-
-  free(VIn); free(VOut);
-}
-#endif
\ No newline at end of file
diff --git a/src/libP/src/meshBuildMRABClusters2D.c b/src/libP/src/meshBuildMRABClusters2D.c
deleted file mode 100644
index 68d2e2331..000000000
--- a/src/libP/src/meshBuildMRABClusters2D.c
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh2D.h"
-
-typedef struct
-{
-  int id;
-  int level;
-  dfloat weight;
-
-  // 4 for maximum number of vertices per element in 2D
-  int v[4];
-  dfloat EX[4], EY[4];
-
-  int cRank;
-  int cId;
-  int type;
-} cElement_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-} cluster_t;
-
-int compareCluster2D(const void* a, const void* b)
-{
-  cElement_t* na = (cElement_t*) a;
-  cElement_t* nb = (cElement_t*) b;
-
-  if (na->cRank < nb->cRank) return -1;
-  if (nb->cRank < na->cRank) return +1;
-
-  if (na->cId < nb->cId) return -1;
-  if (nb->cId < na->cId) return +1;
-
-  return 0;
-}
-
-void meshBuildMRABClusters2D(mesh2D* mesh,
-                             int lev,
-                             dfloat* weights,
-                             int* levels,
-                             int* Nclusters,
-                             cluster_t** clusters,
-                             int* Nelements,
-                             cElement_t** elements)
-{
-  int rank, size;
-
-  rank = mesh->rank;
-  size = mesh->size;
-
-  // minimum {vertex id % size}
-  int* Nsend = (int*) calloc(size, sizeof(int));
-  int* Nrecv = (int*) calloc(size, sizeof(int));
-  int* Ncount = (int*) calloc(size, sizeof(int));
-  int* sendOffsets = (int*) calloc(size, sizeof(int));
-  int* recvOffsets = (int*) calloc(size, sizeof(int));
-  int* sendCounts = (int*) calloc(size, sizeof(int));
-
-  //build element struct
-  *elements = (cElement_t*) calloc(mesh->Nelements + mesh->totalHaloPairs,sizeof(cElement_t));
-  for (int e = 0; e < mesh->Nelements; e++) {
-    (*elements)[e].id = e;
-    (*elements)[e].level = 0.;
-    if (levels) (*elements)[e].level = levels[e];
-
-    (*elements)[e].weight = 1.;
-    if (weights) (*elements)[e].weight = weights[e];
-
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      (*elements)[e].v[n] = mesh->EToV[e * mesh->Nverts + n];
-      (*elements)[e].EX[n] = mesh->EX[e * mesh->Nverts + n];
-      (*elements)[e].EY[n] = mesh->EY[e * mesh->Nverts + n];
-    }
-    (*elements)[e].type = mesh->elementInfo[e];
-
-    //initialize the clustering numbering
-    (*elements)[e].cId = e;
-    (*elements)[e].cRank = rank;
-  }
-
-  cElement_t* sendBuffer = (cElement_t*) calloc(mesh->totalHaloPairs,sizeof(cElement_t));
-
-  //propagate clusters
-  int allDone = 0;
-  int rankDone, done;
-  while(!allDone) {
-    meshHaloExchange(mesh, sizeof(cElement_t), *elements, sendBuffer, *elements + mesh->Nelements);
-
-    rankDone = 1;
-    //local clustering
-    done = 0;
-    while(!done) {
-      done = 1;
-      for (int e = 0; e < mesh->Nelements; e++)
-        for (int f = 0; f < mesh->Nfaces; f++) {
-          int eP = mesh->EToE[e * mesh->Nfaces + f];
-          if (eP > -1) {
-            if (((*elements)[eP].level < lev + 1) || ((*elements)[e].level < lev + 1)) {
-              if (compareCluster2D(*elements + eP,*elements + e) < 0) {
-                (*elements)[e].cRank = (*elements)[eP].cRank;
-                (*elements)[e].cId   = (*elements)[eP].cId;
-                done = 0;
-                rankDone = 0;
-              }
-            }
-          }
-        }
-    }
-
-    MPI_Allreduce(&rankDone, &allDone, 1, MPI_INT, MPI_SUM, mesh->comm);
-    allDone /= size;
-  }
-
-  //clusters have been built
-  //transfer them to their owning rank
-
-  qsort((*elements), mesh->Nelements, sizeof(cElement_t), compareCluster2D);
-
-  //set up exchange along MPI interfaces
-  for (int r = 0; r < size; r++)
-    Nsend[r] = 0;
-
-  for(int e = 0; e < mesh->Nelements; ++e)
-    ++Nsend[(*elements)[e].cRank];
-
-  // find send offsets
-  sendOffsets[0] = 0;
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT,
-               Nrecv, 1, MPI_INT,
-               mesh->comm);
-
-  // count incoming faces
-  int allNrecv = 0;
-  for(int r = 0; r < size; ++r) {
-    allNrecv += Nrecv[r];
-    Nrecv[r] *= sizeof(cElement_t);
-    Nsend[r] *= sizeof(cElement_t);
-    sendOffsets[r] *= sizeof(cElement_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  // buffer for recvied elements
-  cElement_t* recvElements = (cElement_t*) calloc(allNrecv, sizeof(cElement_t));
-
-  // exchange parallel faces
-  MPI_Alltoallv(*elements, Nsend, sendOffsets, MPI_CHAR,
-                recvElements, Nrecv, recvOffsets, MPI_CHAR,
-                mesh->comm);
-
-  free(*elements);
-  *elements = recvElements;
-  *Nelements = allNrecv;
-
-  qsort((*elements), *Nelements, sizeof(cElement_t), compareCluster2D);
-
-  //build cluster lists
-  // the lists are already sorted by cluster, so we just scan for different indices
-  *Nclusters = 0;
-  if (*Nelements) {
-    (*Nclusters)++;
-    for (int e = 1; e < *Nelements; e++)
-      if ((*elements)[e].cId != (*elements)[e - 1].cId) (*Nclusters)++;
-
-    *clusters = (cluster_t*) calloc(*Nclusters,sizeof(cluster_t));
-
-    int cnt  = 0;
-    int ecnt = 1;
-    (*clusters)[0].Nelements = 1;
-    (*clusters)[0].offSet = 0;
-    for (int e = 1; e < *Nelements; e++) {
-      if ((*elements)[e].cId != (*elements)[e - 1].cId) {
-        cnt++;
-        (*clusters)[cnt].offSet = e;
-        (*clusters)[cnt].Nelements = 1;
-      } else {
-        (*clusters)[cnt].Nelements++;
-      }
-    }
-  }
-}
diff --git a/src/libP/src/meshBuildMRABClusters3D.c b/src/libP/src/meshBuildMRABClusters3D.c
deleted file mode 100644
index 2c44f2ed3..000000000
--- a/src/libP/src/meshBuildMRABClusters3D.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh3D.h"
-
-typedef struct
-{
-  int id;
-  int level;
-  dfloat weight;
-
-  // 8 for maximum number of vertices per element in 3D
-  int v[8];
-  dfloat EX[8], EY[8], EZ[8];
-
-  int cRank;
-  int cId;
-  int type;
-} cElement_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-} cluster_t;
-
-int compareCluster3D(const void* a, const void* b)
-{
-  cElement_t* na = (cElement_t*) a;
-  cElement_t* nb = (cElement_t*) b;
-
-  if (na->cRank < nb->cRank) return -1;
-  if (nb->cRank < na->cRank) return +1;
-
-  if (na->cId < nb->cId) return -1;
-  if (nb->cId < na->cId) return +1;
-
-  return 0;
-}
-
-void meshBuildMRABClusters3D(mesh3D* mesh,
-                             int lev,
-                             dfloat* weights,
-                             int* levels,
-                             int* Nclusters,
-                             cluster_t** clusters,
-                             int* Nelements,
-                             cElement_t** elements)
-{
-  int rank, size;
-
-  rank = mesh->rank;
-  size = mesh->size;
-
-  // minimum {vertex id % size}
-  int* Nsend = (int*) calloc(size, sizeof(int));
-  int* Nrecv = (int*) calloc(size, sizeof(int));
-  int* Ncount = (int*) calloc(size, sizeof(int));
-  int* sendOffsets = (int*) calloc(size, sizeof(int));
-  int* recvOffsets = (int*) calloc(size, sizeof(int));
-  int* sendCounts = (int*) calloc(size, sizeof(int));
-
-  //build element struct
-  *elements = (cElement_t*) calloc(mesh->Nelements + mesh->totalHaloPairs,sizeof(cElement_t));
-  for (int e = 0; e < mesh->Nelements; e++) {
-    (*elements)[e].id = e;
-    (*elements)[e].level = 0.;
-    if (levels) (*elements)[e].level = levels[e];
-
-    (*elements)[e].weight = 1.;
-    if (weights) (*elements)[e].weight = weights[e];
-
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      (*elements)[e].v[n] = mesh->EToV[e * mesh->Nverts + n];
-      (*elements)[e].EX[n] = mesh->EX[e * mesh->Nverts + n];
-      (*elements)[e].EY[n] = mesh->EY[e * mesh->Nverts + n];
-      (*elements)[e].EZ[n] = mesh->EZ[e * mesh->Nverts + n];
-    }
-    (*elements)[e].type = mesh->elementInfo[e];
-
-    //initialize the clustering numbering
-    (*elements)[e].cId = e;
-    (*elements)[e].cRank = rank;
-  }
-
-  cElement_t* sendBuffer = (cElement_t*) calloc(mesh->totalHaloPairs,sizeof(cElement_t));
-
-  //propagate clusters
-  int allDone = 0;
-  int rankDone, done;
-  while(!allDone) {
-    meshHaloExchange(mesh, sizeof(cElement_t), *elements, sendBuffer, *elements + mesh->Nelements);
-
-    rankDone = 1;
-    //local clustering
-    done = 0;
-    while(!done) {
-      done = 1;
-      for (int e = 0; e < mesh->Nelements; e++)
-        for (int f = 0; f < mesh->Nfaces; f++) {
-          int eP = mesh->EToE[e * mesh->Nfaces + f];
-          if (eP > -1) {
-            if (((*elements)[eP].level < lev + 1) || ((*elements)[e].level < lev + 1)) {
-              if (compareCluster3D(*elements + eP,*elements + e) < 0) {
-                (*elements)[e].cRank = (*elements)[eP].cRank;
-                (*elements)[e].cId   = (*elements)[eP].cId;
-                done = 0;
-                rankDone = 0;
-              }
-            }
-          }
-        }
-    }
-
-    MPI_Allreduce(&rankDone, &allDone, 1, MPI_INT, MPI_SUM, mesh->comm);
-    allDone /= size;
-  }
-
-  //clusters have been built
-  //transfer them to their owning rank
-
-  qsort((*elements), mesh->Nelements, sizeof(cElement_t), compareCluster3D);
-
-  //set up exchange along MPI interfaces
-  for (int r = 0; r < size; r++)
-    Nsend[r] = 0;
-
-  for(int e = 0; e < mesh->Nelements; ++e)
-    ++Nsend[(*elements)[e].cRank];
-
-  // find send offsets
-  sendOffsets[0] = 0;
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT,
-               Nrecv, 1, MPI_INT,
-               mesh->comm);
-
-  // count incoming faces
-  int allNrecv = 0;
-  for(int r = 0; r < size; ++r) {
-    allNrecv += Nrecv[r];
-    Nrecv[r] *= sizeof(cElement_t);
-    Nsend[r] *= sizeof(cElement_t);
-    sendOffsets[r] *= sizeof(cElement_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  // buffer for recvied elements
-  cElement_t* recvElements = (cElement_t*) calloc(allNrecv, sizeof(cElement_t));
-
-  // exchange parallel faces
-  MPI_Alltoallv(*elements, Nsend, sendOffsets, MPI_CHAR,
-                recvElements, Nrecv, recvOffsets, MPI_CHAR,
-                mesh->comm);
-
-  free(*elements);
-  *elements = recvElements;
-  *Nelements = allNrecv;
-
-  qsort((*elements), *Nelements, sizeof(cElement_t), compareCluster3D);
-
-  //build cluster lists
-  // the lists are already sorted by cluster, so we just scan for different indices
-  *Nclusters = 0;
-  if (*Nelements) {
-    (*Nclusters)++;
-    for (int e = 1; e < *Nelements; e++)
-      if ((*elements)[e].cId != (*elements)[e - 1].cId) (*Nclusters)++;
-
-    *clusters = (cluster_t*) calloc(*Nclusters,sizeof(cluster_t));
-
-    int cnt  = 0;
-    int ecnt = 1;
-    (*clusters)[0].Nelements = 1;
-    (*clusters)[0].offSet = 0;
-    for (int e = 1; e < *Nelements; e++) {
-      if ((*elements)[e].cId != (*elements)[e - 1].cId) {
-        cnt++;
-        (*clusters)[cnt].offSet = e;
-        (*clusters)[cnt].Nelements = 1;
-      } else {
-        (*clusters)[cnt].Nelements++;
-      }
-    }
-  }
-}
diff --git a/src/libP/src/meshClusteredGeometricPartition2D.c b/src/libP/src/meshClusteredGeometricPartition2D.c
deleted file mode 100644
index 75538a107..000000000
--- a/src/libP/src/meshClusteredGeometricPartition2D.c
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh2D.h"
-
-#define bitRange 10
-
-typedef struct
-{
-  int id;
-  int level;
-  dfloat weight;
-
-  // 4 for maximum number of vertices per element in 2D
-  int v[4];
-  dfloat EX[4], EY[4];
-
-  int cRank;
-  int cId;
-  int type;
-} cElement_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-} cluster_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-  int rank;
-
-  int destId;
-  int destOffset;
-  int destRank;
-
-  dfloat weight;
-  unsigned int index; //hilbert index
-} parallelCluster_t;
-
-//This is linked form meshGeometricPartition2D.c
-unsigned int hilbert2D(unsigned int n, unsigned int index1, unsigned int index2);
-void bogusMatch(void* a, void* b);
-
-dfloat improveClusteredPartition2D(int rank, int size, MPI_Comm comm,
-                                   int* Nclusters, parallelCluster_t** parallelClusters);
-
-// compare the Morton indices for two clusters
-int compareIndex2D(const void* a, const void* b)
-{
-  parallelCluster_t* ca = (parallelCluster_t*) a;
-  parallelCluster_t* cb = (parallelCluster_t*) b;
-
-  if(ca->index < cb->index) return -1;
-  if(ca->index > cb->index) return 1;
-
-  return 0;
-}
-
-// compare the Morton indices for two clusters
-int compareRank2D(const void* a, const void* b)
-{
-  parallelCluster_t* ca = (parallelCluster_t*) a;
-  parallelCluster_t* cb = (parallelCluster_t*) b;
-
-  if(ca->rank < cb->rank) return -1;
-  if(ca->rank > cb->rank) return 1;
-
-  if(ca->offSet < cb->offSet) return -1;
-  if(ca->offSet > cb->offSet) return 1;
-
-  return 0;
-}
-
-// geometric partition of clusters of elements in 2D mesh using Morton ordering + parallelSort
-dfloat meshClusteredGeometricPartition2D(mesh2D* mesh, int Nclusters, cluster_t* clusters,
-                                         int* Nelements, cElement_t** elements)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  int maxNclusters;
-  MPI_Allreduce(&Nclusters, &maxNclusters, 1, MPI_INT, MPI_MAX,mesh->comm);
-  maxNclusters = 2 * ((maxNclusters + 1) / 2);
-
-  // fix maxNclusters
-  parallelCluster_t* parallelClusters
-    = (parallelCluster_t*) calloc(maxNclusters, sizeof(parallelCluster_t));
-
-  // local bounding box of element centers
-  dfloat mincx = 1e9, maxcx = -1e9;
-  dfloat mincy = 1e9, maxcy = -1e9;
-
-  // compute cluster centers on this process
-  for(int cnt = 0; cnt < Nclusters; ++cnt) {
-    int id = clusters[cnt].offSet;
-    dfloat cx = 0, cy = 0;
-    for (int e = 0; e < clusters[cnt].Nelements; e++)
-      for(int n = 0; n < mesh->Nverts; ++n) {
-        cx += (*elements)[id + e].EX[n];
-        cy += (*elements)[id + e].EY[n];
-      }
-    cx /= (mesh->Nverts * clusters[cnt].Nelements);
-    cy /= (mesh->Nverts * clusters[cnt].Nelements);
-
-    mincx = mymin(mincx, cx);
-    maxcx = mymax(maxcx, cx);
-    mincy = mymin(mincy, cy);
-    maxcy = mymax(maxcy, cy);
-  }
-
-  dfloat delta = 1e-4;
-  mincx -= delta;
-  mincy -= delta;
-  maxcx += delta;
-  maxcy += delta;
-
-  // find global bounding box of cluster centers
-  dfloat gmincx, gmincy, gmaxcx, gmaxcy;
-  MPI_Allreduce(&mincx, &gmincx, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&mincy, &gmincy, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&maxcx, &gmaxcx, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-  MPI_Allreduce(&maxcy, &gmaxcy, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-
-  // choose sub-range of Morton lattice coordinates to embed cluster centers in
-  unsigned int Nboxes = (((unsigned int)1) << (bitRange - 1));
-
-  // compute Morton index for each cluster
-  for(int cnt = 0; cnt < Nclusters; ++cnt) {
-    // cluster center coordinates
-    dfloat cx = 0, cy = 0;
-    parallelClusters[cnt].weight = 0.;
-    int id = clusters[cnt].offSet;
-    for (int e = 0; e < clusters[cnt].Nelements; e++) {
-      for(int n = 0; n < mesh->Nverts; ++n) {
-        cx += (*elements)[id + e].EX[n];
-        cy += (*elements)[id + e].EY[n];
-      }
-      parallelClusters[cnt].weight += (*elements)[id + e].weight;
-    }
-    cx /= (mesh->Nverts * clusters[cnt].Nelements);
-    cy /= (mesh->Nverts * clusters[cnt].Nelements);
-
-    unsigned int ix = (cx - gmincx) * Nboxes / (gmaxcx - gmincx);
-    unsigned int iy = (cy - gmincy) * Nboxes / (gmaxcy - gmincy);
-
-    //fill the parallel cluster struct
-    parallelClusters[cnt].index =  hilbert2D(Nboxes, ix, iy);
-    parallelClusters[cnt].Nelements = clusters[cnt].Nelements;
-    parallelClusters[cnt].offSet = clusters[cnt].offSet;
-    parallelClusters[cnt].rank = rank;
-  }
-
-  // pad cluster array with dummy clusters
-  for(int n = Nclusters; n < maxNclusters; ++n) {
-    parallelClusters[n].Nelements = -1;
-    parallelClusters[n].index = hilbert2D(Nboxes, Nboxes - 1, Nboxes - 1);
-  }
-
-  // odd-even parallel sort of cluster capsules based on their Morton index
-  parallelSort(mesh->size, mesh->rank, mesh->comm,
-               maxNclusters, parallelClusters, sizeof(parallelCluster_t),
-               compareIndex2D, bogusMatch);
-
-  int newNclusters = 0;
-  for (int n = 0; n < maxNclusters; n++)
-    newNclusters += (parallelClusters[n].Nelements != -1);
-
-  //Do an initial partitioning
-  dfloat localTotalWeight = 0.;
-  for (int n = 0; n < newNclusters; n++)
-    localTotalWeight += parallelClusters[n].weight;
-
-  dfloat* totalWeights = (dfloat*) calloc(size,sizeof(dfloat));
-  dfloat* weightOffsets = (dfloat*) calloc(size + 1,sizeof(dfloat));
-
-  MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, mesh->comm);
-
-  for (int r = 0; r < size; r++)
-    weightOffsets[r + 1] = weightOffsets[r] + totalWeights[r];
-
-  dfloat globalTotalWeight = weightOffsets[size];
-  dfloat chunkSize = globalTotalWeight / ((dfloat)size);
-
-  int* Nsend = (int*) calloc(size, sizeof(int));
-  int* Nrecv = (int*) calloc(size, sizeof(int));
-  int* Ncount = (int*) calloc(size, sizeof(int));
-  int* sendOffsets = (int*) calloc(size, sizeof(int));
-  int* recvOffsets = (int*) calloc(size, sizeof(int));
-
-  //determine the destination rank based on which chunk the cluster is in
-  localTotalWeight = weightOffsets[rank];
-  for (int n = 0; n < newNclusters; n++) {
-    int destRank = (int) (localTotalWeight / chunkSize);
-    Nsend[destRank]++;
-    localTotalWeight += parallelClusters[n].weight;
-  }
-
-  // find send offsets
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  // count incoming clusters
-  newNclusters = 0;
-  for(int r = 0; r < size; ++r) {
-    newNclusters += Nrecv[r];
-    Nrecv[r] *= sizeof(parallelCluster_t);
-    Nsend[r] *= sizeof(parallelCluster_t);
-    sendOffsets[r] *= sizeof(parallelCluster_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  parallelCluster_t* tmpParallelClusters =
-    (parallelCluster_t*) calloc(newNclusters, sizeof(parallelCluster_t));
-
-  // exchange parallel clusters
-  MPI_Alltoallv(parallelClusters, Nsend, sendOffsets, MPI_CHAR,
-                tmpParallelClusters, Nrecv, recvOffsets, MPI_CHAR, mesh->comm);
-
-  if (parallelClusters) free(parallelClusters);
-  parallelClusters = tmpParallelClusters;
-
-  //improve the partitioning by exchanging elements between neighboring prcesses
-  dfloat partQuality = improveClusteredPartition2D(mesh->rank, mesh->size, mesh->comm,
-                                                   &newNclusters, &parallelClusters);
-
-  //now that we're partitioned and (hopefully) balance2Dd, send the elements
-
-  // count number of elements that should end up on this process
-  int newNelements = 0;
-  for(int n = 0; n < newNclusters; n++)
-    newNelements += parallelClusters[n].Nelements;
-
-  //record the destination info
-  if (newNclusters) {
-    parallelClusters[0].destId = 0;
-    parallelClusters[0].destOffset = 0;
-    parallelClusters[0].destRank = rank;
-  }
-  for (int n = 1; n < newNclusters; n++) {
-    parallelClusters[n].destId = n;
-    parallelClusters[n].destOffset = parallelClusters[n - 1].destOffset +
-                                     parallelClusters[n - 1].Nelements;
-    parallelClusters[n].destRank = rank;
-  }
-
-  //sort by original rank and offset
-  qsort(parallelClusters, newNclusters, sizeof(parallelCluster_t), compareRank2D);
-
-  //reset counters
-  for(int r = 0; r < size; ++r)
-    Nsend[r] = 0;
-
-  for (int n = 0; n < newNclusters; n++)
-    Nsend[parallelClusters[n].rank]++;
-
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  for(int r = 0; r < size; ++r) {
-    Nrecv[r] *= sizeof(parallelCluster_t);
-    Nsend[r] *= sizeof(parallelCluster_t);
-    sendOffsets[r] *= sizeof(parallelCluster_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  parallelCluster_t* recvParallelClusters;
-  if (Nclusters)
-    recvParallelClusters = (parallelCluster_t*) calloc(Nclusters, sizeof(parallelCluster_t));
-
-  MPI_Alltoallv(parallelClusters, Nsend, sendOffsets, MPI_CHAR,
-                recvParallelClusters, Nrecv, recvOffsets, MPI_CHAR, mesh->comm);
-
-  //build the array of elements to send
-  cElement_t* sendElements = (cElement_t*) calloc(1,sizeof(cElement_t));
-  cElement_t* recvElements = (cElement_t*) calloc(1,sizeof(cElement_t));
-
-  if (*Nelements) sendElements = (cElement_t*) calloc(*Nelements,sizeof(cElement_t));
-  if (newNelements) recvElements = (cElement_t*) calloc(newNelements,sizeof(cElement_t));
-
-  //reset send counts
-  for (int r = 0; r < size; r++)
-    Nsend[r] = 0;
-
-  for (int n = 0; n < Nclusters; n++)
-    Nsend[recvParallelClusters[n].destRank] += recvParallelClusters[n].Nelements;
-
-  // find send offsets
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  //build the array of elements to send
-  for (int n = 0; n < Nclusters; n++) {
-    int destRank = recvParallelClusters[n].destRank;
-    int cnt = recvParallelClusters[n].Nelements;
-
-    int sendId = sendOffsets[destRank] + Ncount[destRank];
-    int id = recvParallelClusters[n].offSet;
-    memcpy(sendElements + sendId, *elements + id, cnt * sizeof(cElement_t));
-    Ncount[destRank] += cnt;
-  }
-  free(recvParallelClusters);
-
-  // exchange element counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  for(int r = 0; r < size; ++r) {
-    Nrecv[r] *= sizeof(cElement_t);
-    Nsend[r] *= sizeof(cElement_t);
-    sendOffsets[r] *= sizeof(cElement_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  MPI_Alltoallv(sendElements, Nsend, sendOffsets, MPI_CHAR,
-                recvElements, Nrecv, recvOffsets, MPI_CHAR, mesh->comm);
-  free(sendElements);
-
-  //write the clusters in the proper order
-  cluster_t* newClusters = (cluster_t*) calloc(newNclusters,sizeof(cluster_t));
-  cElement_t* newElements = (cElement_t*) calloc(newNelements,sizeof(cElement_t));
-  int cnt = 0;
-  for (int n = 0; n < newNclusters; n++) {
-    int id = parallelClusters[n].destId;
-    newClusters[id].Nelements = parallelClusters[n].Nelements;
-    newClusters[id].offSet = parallelClusters[n].destOffset;
-    for (int e = 0; e < parallelClusters[n].Nelements; e++)
-      memcpy(newElements + newClusters[id].offSet + e, recvElements + cnt++, sizeof(cElement_t));
-  }
-  free(recvElements);
-  free(parallelClusters);
-
-  *Nelements = newNelements;
-
-  if (*elements) free(*elements);
-  *elements = newElements;
-
-  return partQuality;
-}
-
-//swap clusters between neighboring processes to try and improve the partitioning
-void balance2D(int rank, int size, MPI_Comm comm,
-               int rankL, int rankR, dfloat* weightL, dfloat* weightR,
-               int* Nclusters, parallelCluster_t** parallelClusters)
-{
-  int tag = 999;
-  MPI_Request recv, send;
-  MPI_Status status;
-
-  if (rank == rankL) {
-    if ( *weightL > *weightR) {
-      //count number of clusters to send to proc
-      int Nsend = 0;
-      for (int cnt = *Nclusters - 1; cnt > -1; cnt--) {
-        dfloat w = (*parallelClusters)[cnt].weight;
-        if ((*weightL - w) >= (*weightR + w)) {
-          //sending this cluster improves the balance2D
-          *weightL -= w;
-          *weightR += w;
-          Nsend++;
-        } else if((*weightL - w) > *weightR) {
-          //sending makes the neighbor have a higher weight, but it improves the balance2D
-          *weightL -= w;
-          *weightR += w;
-          Nsend++;
-          break;
-        } else {
-          break;
-        }
-      }
-
-      MPI_Isend(&Nsend, 1, MPI_INT,  rankR, tag, comm, &send);
-      MPI_Wait(&send, &status);
-
-      if (Nsend) {
-        *Nclusters -= Nsend;
-
-        MPI_Isend((*parallelClusters) + *Nclusters,
-                  Nsend * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankR,
-                  tag,
-                  comm,
-                  &send);
-        MPI_Wait(&send, &status);
-      }
-    } else if ( *weightL < *weightR) {
-      int Nrecv;
-      MPI_Irecv(&Nrecv, 1, MPI_INT,  rankR, tag, comm, &recv);
-      MPI_Wait(&recv, &status);
-
-      if (Nrecv) {
-        parallelCluster_t* newParallelClusters = (parallelCluster_t*) calloc(*Nclusters + Nrecv,
-                                                                             sizeof(
-                                                                               parallelCluster_t));
-        memcpy(newParallelClusters,*parallelClusters,*Nclusters * sizeof(parallelCluster_t));
-
-        MPI_Irecv(newParallelClusters + *Nclusters,
-                  Nrecv * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankR,
-                  tag,
-                  comm,
-                  &recv);
-        MPI_Wait(&recv, &status);
-
-        for (int n = *Nclusters; n < *Nclusters + Nrecv; n++) {
-          dfloat w = newParallelClusters[n].weight;
-          *weightL += w;
-          *weightR -= w;
-        }
-
-        *Nclusters += Nrecv;
-        free(*parallelClusters);
-        *parallelClusters = newParallelClusters;
-      }
-    }
-  } else if (rank == rankR) {
-    if (*weightL < *weightR) {
-      //count number of clusters to send to proc
-      int Nsend = 0;
-      for (int cnt = 0; cnt < *Nclusters; cnt++) {
-        dfloat w = (*parallelClusters)[cnt].weight;
-        if ((*weightR - w) >= (*weightL + w)) {
-          //sending this cluster improves the balance2D
-          *weightR -= w;
-          *weightL += w;
-          Nsend++;
-        } else if((*weightR - w) > *weightL) {
-          //sending makes the neighbor have a higher weight, but it improves the balance2D
-          *weightR -= w;
-          *weightL += w;
-          Nsend++;
-          break;
-        } else {
-          break;
-        }
-      }
-
-      MPI_Isend(&Nsend, 1, MPI_INT,  rankL, tag, comm, &send);
-      MPI_Wait(&send, &status);
-
-      if (Nsend) {
-        *Nclusters -= Nsend;
-        parallelCluster_t* newParallelClusters =
-          (parallelCluster_t*) calloc(*Nclusters,sizeof(parallelCluster_t));
-        memcpy(newParallelClusters,
-               (*parallelClusters) + Nsend,
-               *Nclusters * sizeof(parallelCluster_t));
-
-        MPI_Isend(*parallelClusters,
-                  Nsend * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankL,
-                  tag,
-                  comm,
-                  &send);
-        MPI_Wait(&send, &status);
-
-        free(*parallelClusters);
-        *parallelClusters = newParallelClusters;
-      }
-    } else if (*weightL > *weightR) {
-      int Nrecv;
-      MPI_Irecv(&Nrecv, 1, MPI_INT,  rankL, tag, comm, &recv);
-      MPI_Wait(&recv, &status);
-
-      if (Nrecv) {
-        parallelCluster_t* tmpParallelClusters =
-          (parallelCluster_t*) calloc(Nrecv,sizeof(parallelCluster_t));
-
-        MPI_Irecv(tmpParallelClusters,
-                  Nrecv * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankL,
-                  tag,
-                  comm,
-                  &recv);
-        MPI_Wait(&recv, &status);
-
-        for (int n = 0; n < Nrecv; n++) {
-          dfloat w = tmpParallelClusters[n].weight;
-          *weightR += w;
-          *weightL -= w;
-        }
-
-        *Nclusters += Nrecv;
-        parallelCluster_t* newParallelClusters =
-          (parallelCluster_t*) calloc(*Nclusters,sizeof(parallelCluster_t));
-        memcpy(newParallelClusters,tmpParallelClusters,Nrecv * sizeof(parallelCluster_t));
-        memcpy(newParallelClusters + Nrecv,*parallelClusters,
-               (*Nclusters - Nrecv) * sizeof(parallelCluster_t));
-
-        free(tmpParallelClusters);
-        free(*parallelClusters);
-        *parallelClusters = newParallelClusters;
-      }
-    }
-  }
-}
-
-dfloat improveClusteredPartition2D(int rank, int size, MPI_Comm comm,
-                                   int* Nclusters, parallelCluster_t** parallelClusters)
-{
-  int tag = 999;
-
-  MPI_Request recv, send;
-  MPI_Status status;
-
-  dfloat* totalWeights = (dfloat*) calloc(size,sizeof(dfloat));
-  dfloat quality;
-
-  while (true) {
-    dfloat localTotalWeight = 0.;
-    for (int n = 0; n < *Nclusters; n++)
-      localTotalWeight += (*parallelClusters)[n].weight;
-
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-
-    dfloat maxTotalWeight, minTotalWeight;
-    MPI_Allreduce(&localTotalWeight, &minTotalWeight, 1, MPI_DFLOAT, MPI_MIN, comm);
-    MPI_Allreduce(&localTotalWeight, &maxTotalWeight, 1, MPI_DFLOAT, MPI_MAX, comm);
-
-    quality = minTotalWeight / maxTotalWeight;
-
-    //ends
-    if ((rank == 0) || (rank == size - 1))
-      balance2D(rank,
-                size,
-                comm,
-                size - 1,
-                0,
-                totalWeights + size - 1,
-                totalWeights + 0,
-                Nclusters,
-                parallelClusters);
-
-    //resync
-    localTotalWeight = totalWeights[rank];
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-
-    //evens
-    if (( (rank % 2) == 0) && (rank + 1 < size))
-      balance2D(rank,
-                size,
-                comm,
-                rank,
-                rank + 1,
-                totalWeights + rank,
-                totalWeights + rank + 1,
-                Nclusters,
-                parallelClusters);
-    if (( (rank % 2) == 1) && (rank - 1 > -1))
-      balance2D(rank,
-                size,
-                comm,
-                rank - 1,
-                rank,
-                totalWeights + rank - 1,
-                totalWeights + rank,
-                Nclusters,
-                parallelClusters);
-
-    //resync
-    localTotalWeight = totalWeights[rank];
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-
-    //odds
-    if (((rank % 2) == 0) && (rank - 1 > -1))
-      balance2D(rank,
-                size,
-                comm,
-                rank - 1,
-                rank,
-                totalWeights + rank - 1,
-                totalWeights + rank,
-                Nclusters,
-                parallelClusters);
-    if (((rank % 2) == 1) && (rank + 1 < size))
-      balance2D(rank,
-                size,
-                comm,
-                rank,
-                rank + 1,
-                totalWeights + rank,
-                totalWeights + rank + 1,
-                Nclusters,
-                parallelClusters);
-
-    //resync
-    localTotalWeight = totalWeights[rank];
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-    MPI_Allreduce(&localTotalWeight, &minTotalWeight, 1, MPI_DFLOAT, MPI_MIN, comm);
-    MPI_Allreduce(&localTotalWeight, &maxTotalWeight, 1, MPI_DFLOAT, MPI_MAX, comm);
-
-    dfloat newQuality = minTotalWeight / maxTotalWeight;
-
-    if (newQuality == quality) break; //no change
-  }
-
-  return quality;
-}
diff --git a/src/libP/src/meshClusteredGeometricPartition3D.c b/src/libP/src/meshClusteredGeometricPartition3D.c
deleted file mode 100644
index 725e01650..000000000
--- a/src/libP/src/meshClusteredGeometricPartition3D.c
+++ /dev/null
@@ -1,650 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh3D.h"
-
-#define bitRange 10
-
-typedef struct
-{
-  int id;
-  int level;
-  dfloat weight;
-
-  // 8 for maximum number of vertices per element in 3D
-  int v[8];
-  dfloat EX[8], EY[8], EZ[8];
-
-  int cRank;
-  int cId;
-  int type;
-} cElement_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-} cluster_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-  int rank;
-
-  int destId;
-  int destOffset;
-  int destRank;
-
-  dfloat weight;
-  unsigned long long int index; //morton index
-} parallelCluster_t;
-
-//This is linked form meshGeometricPartition3D.c
-unsigned long long int mortonIndex3D(unsigned int ix, unsigned int iy, unsigned int iz);
-void bogusMatch(void* a, void* b);
-
-dfloat improveClusteredPartition3D(int rank, int size, MPI_Comm comm,
-                                   int* Nclusters, parallelCluster_t** parallelClusters);
-
-// compare the Morton indices for two clusters
-int compareIndex3D(const void* a, const void* b)
-{
-  parallelCluster_t* ca = (parallelCluster_t*) a;
-  parallelCluster_t* cb = (parallelCluster_t*) b;
-
-  if(ca->index < cb->index) return -1;
-  if(ca->index > cb->index) return 1;
-
-  return 0;
-}
-
-// compare the Morton indices for two clusters
-int compareRank3D(const void* a, const void* b)
-{
-  parallelCluster_t* ca = (parallelCluster_t*) a;
-  parallelCluster_t* cb = (parallelCluster_t*) b;
-
-  if(ca->rank < cb->rank) return -1;
-  if(ca->rank > cb->rank) return 1;
-
-  if(ca->offSet < cb->offSet) return -1;
-  if(ca->offSet > cb->offSet) return 1;
-
-  return 0;
-}
-
-// geometric partition of clusters of elements in 2D mesh using Morton ordering + parallelSort
-dfloat meshClusteredGeometricPartition3D(mesh3D* mesh, int Nclusters, cluster_t* clusters,
-                                         int* Nelements, cElement_t** elements)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  int maxNclusters;
-  MPI_Allreduce(&Nclusters, &maxNclusters, 1, MPI_INT, MPI_MAX,mesh->comm);
-  maxNclusters = 2 * ((maxNclusters + 1) / 2);
-
-  // fix maxNclusters
-  parallelCluster_t* parallelClusters
-    = (parallelCluster_t*) calloc(maxNclusters, sizeof(parallelCluster_t));
-
-  // local bounding box of element centers
-  dfloat mincx = 1e9, maxcx = -1e9;
-  dfloat mincy = 1e9, maxcy = -1e9;
-  dfloat mincz = 1e9, maxcz = -1e9;
-
-  // compute cluster centers on this process
-  for(int cnt = 0; cnt < Nclusters; ++cnt) {
-    int id = clusters[cnt].offSet;
-    dfloat cx = 0, cy = 0, cz = 0;
-    for (int e = 0; e < clusters[cnt].Nelements; e++)
-      for(int n = 0; n < mesh->Nverts; ++n) {
-        cx += (*elements)[id + e].EX[n];
-        cy += (*elements)[id + e].EY[n];
-        cz += (*elements)[id + e].EZ[n];
-      }
-    cx /= (mesh->Nverts * clusters[cnt].Nelements);
-    cy /= (mesh->Nverts * clusters[cnt].Nelements);
-    cz /= (mesh->Nverts * clusters[cnt].Nelements);
-
-    mincx = mymin(mincx, cx);
-    maxcx = mymax(maxcx, cx);
-    mincy = mymin(mincy, cy);
-    maxcy = mymax(maxcy, cy);
-    mincz = mymin(mincz, cz);
-    maxcz = mymax(maxcz, cz);
-  }
-
-  dfloat delta = 1e-4;
-  mincx -= delta;
-  mincy -= delta;
-  mincz -= delta;
-  maxcx += delta;
-  maxcy += delta;
-  maxcz += delta;
-
-  // find global bounding box of cluster centers
-  dfloat gmincx, gmincy, gmincz, gmaxcx, gmaxcy, gmaxcz;
-  MPI_Allreduce(&mincx, &gmincx, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&mincy, &gmincy, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&mincz, &gmincz, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&maxcx, &gmaxcx, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-  MPI_Allreduce(&maxcy, &gmaxcy, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-  MPI_Allreduce(&maxcz, &gmaxcz, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-
-  // choose sub-range of Morton lattice coordinates to embed cluster centers in
-  unsigned long long int Nboxes = (((unsigned long long int)1) << (bitRange - 1));
-
-  // compute Morton index for each cluster
-  for(int cnt = 0; cnt < Nclusters; ++cnt) {
-    // cluster center coordinates
-    dfloat cx = 0, cy = 0, cz = 0;
-    parallelClusters[cnt].weight = 0.;
-    int id = clusters[cnt].offSet;
-    for (int e = 0; e < clusters[cnt].Nelements; e++) {
-      for(int n = 0; n < mesh->Nverts; ++n) {
-        cx += (*elements)[id + e].EX[n];
-        cy += (*elements)[id + e].EY[n];
-        cz += (*elements)[id + e].EZ[n];
-      }
-      parallelClusters[cnt].weight += (*elements)[id + e].weight;
-    }
-    cx /= (mesh->Nverts * clusters[cnt].Nelements);
-    cy /= (mesh->Nverts * clusters[cnt].Nelements);
-    cz /= (mesh->Nverts * clusters[cnt].Nelements);
-
-    unsigned int ix = (cx - gmincx) * Nboxes / (gmaxcx - gmincx);
-    unsigned int iy = (cy - gmincy) * Nboxes / (gmaxcy - gmincy);
-    unsigned int iz = (cy - gmincy) * Nboxes / (gmaxcy - gmincy);
-
-    //fill the parallel cluster struct
-    parallelClusters[cnt].index =  mortonIndex3D(ix, iy,iz);
-    parallelClusters[cnt].Nelements = clusters[cnt].Nelements;
-    parallelClusters[cnt].offSet = clusters[cnt].offSet;
-    parallelClusters[cnt].rank = rank;
-  }
-
-  // pad cluster array with dummy clusters
-  for(int n = Nclusters; n < maxNclusters; ++n) {
-    parallelClusters[n].Nelements = -1;
-    parallelClusters[n].index = mortonIndex3D(Nboxes + 1, Nboxes + 1,Nboxes + 1);
-  }
-
-  // odd-even parallel sort of cluster capsules based on their Morton index
-  parallelSort(mesh->size, mesh->rank, mesh->comm,
-               maxNclusters, parallelClusters, sizeof(parallelCluster_t),
-               compareIndex3D, bogusMatch);
-
-  int newNclusters = 0;
-  for (int n = 0; n < maxNclusters; n++)
-    newNclusters += (parallelClusters[n].Nelements != -1);
-
-  //Do an initial partitioning
-  dfloat localTotalWeight = 0.;
-  for (int n = 0; n < newNclusters; n++)
-    localTotalWeight += parallelClusters[n].weight;
-
-  dfloat* totalWeights = (dfloat*) calloc(size,sizeof(dfloat));
-  dfloat* weightOffsets = (dfloat*) calloc(size + 1,sizeof(dfloat));
-
-  MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, mesh->comm);
-
-  for (int r = 0; r < size; r++)
-    weightOffsets[r + 1] = weightOffsets[r] + totalWeights[r];
-
-  dfloat globalTotalWeight = weightOffsets[size];
-  dfloat chunkSize = globalTotalWeight / ((dfloat)size);
-
-  int* Nsend = (int*) calloc(size, sizeof(int));
-  int* Nrecv = (int*) calloc(size, sizeof(int));
-  int* Ncount = (int*) calloc(size, sizeof(int));
-  int* sendOffsets = (int*) calloc(size, sizeof(int));
-  int* recvOffsets = (int*) calloc(size, sizeof(int));
-
-  //determine the destination rank based on which chunk the cluster is in
-  localTotalWeight = weightOffsets[rank];
-  for (int n = 0; n < newNclusters; n++) {
-    int destRank = (int) (localTotalWeight / chunkSize);
-    Nsend[destRank]++;
-    localTotalWeight += parallelClusters[n].weight;
-  }
-
-  // find send offsets
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  // count incoming clusters
-  newNclusters = 0;
-  for(int r = 0; r < size; ++r) {
-    newNclusters += Nrecv[r];
-    Nrecv[r] *= sizeof(parallelCluster_t);
-    Nsend[r] *= sizeof(parallelCluster_t);
-    sendOffsets[r] *= sizeof(parallelCluster_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  parallelCluster_t* tmpParallelClusters =
-    (parallelCluster_t*) calloc(newNclusters, sizeof(parallelCluster_t));
-
-  // exchange parallel clusters
-  MPI_Alltoallv(parallelClusters, Nsend, sendOffsets, MPI_CHAR,
-                tmpParallelClusters, Nrecv, recvOffsets, MPI_CHAR, mesh->comm);
-
-  if (parallelClusters) free(parallelClusters);
-  parallelClusters = tmpParallelClusters;
-
-  //improve the partitioning by exchanging elements between neighboring prcesses
-  dfloat partQuality = improveClusteredPartition3D(mesh->rank, mesh->size, mesh->comm,
-                                                   &newNclusters, &parallelClusters);
-
-  //now that we're partitioned and (hopefully) balance3Dd, send the elements
-
-  // count number of elements that should end up on this process
-  int newNelements = 0;
-  for(int n = 0; n < newNclusters; n++)
-    newNelements += parallelClusters[n].Nelements;
-
-  //record the destination info
-  if (newNclusters) {
-    parallelClusters[0].destId = 0;
-    parallelClusters[0].destOffset = 0;
-    parallelClusters[0].destRank = rank;
-  }
-  for (int n = 1; n < newNclusters; n++) {
-    parallelClusters[n].destId = n;
-    parallelClusters[n].destOffset = parallelClusters[n - 1].destOffset +
-                                     parallelClusters[n - 1].Nelements;
-    parallelClusters[n].destRank = rank;
-  }
-
-  //sort by original rank and offset
-  qsort(parallelClusters, newNclusters, sizeof(parallelCluster_t), compareRank3D);
-
-  //reset counters
-  for(int r = 0; r < size; ++r)
-    Nsend[r] = 0;
-
-  for (int n = 0; n < newNclusters; n++)
-    Nsend[parallelClusters[n].rank]++;
-
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  for(int r = 0; r < size; ++r) {
-    Nrecv[r] *= sizeof(parallelCluster_t);
-    Nsend[r] *= sizeof(parallelCluster_t);
-    sendOffsets[r] *= sizeof(parallelCluster_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  parallelCluster_t* recvParallelClusters;
-  if (Nclusters)
-    recvParallelClusters = (parallelCluster_t*) calloc(Nclusters, sizeof(parallelCluster_t));
-
-  MPI_Alltoallv(parallelClusters, Nsend, sendOffsets, MPI_CHAR,
-                recvParallelClusters, Nrecv, recvOffsets, MPI_CHAR, mesh->comm);
-
-  //build the array of elements to send
-  cElement_t* sendElements = (cElement_t*) calloc(1,sizeof(cElement_t));
-  cElement_t* recvElements = (cElement_t*) calloc(1,sizeof(cElement_t));
-
-  if (*Nelements) sendElements = (cElement_t*) calloc(*Nelements,sizeof(cElement_t));
-  if (newNelements) recvElements = (cElement_t*) calloc(newNelements,sizeof(cElement_t));
-
-  //reset send counts
-  for (int r = 0; r < size; r++)
-    Nsend[r] = 0;
-
-  for (int n = 0; n < Nclusters; n++)
-    Nsend[recvParallelClusters[n].destRank] += recvParallelClusters[n].Nelements;
-
-  // find send offsets
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  //build the array of elements to send
-  for (int n = 0; n < Nclusters; n++) {
-    int destRank = recvParallelClusters[n].destRank;
-    int cnt = recvParallelClusters[n].Nelements;
-
-    int sendId = sendOffsets[destRank] + Ncount[destRank];
-    int id = recvParallelClusters[n].offSet;
-    memcpy(sendElements + sendId, *elements + id, cnt * sizeof(cElement_t));
-    Ncount[destRank] += cnt;
-  }
-
-  // exchange element counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  for(int r = 0; r < size; ++r) {
-    Nrecv[r] *= sizeof(cElement_t);
-    Nsend[r] *= sizeof(cElement_t);
-    sendOffsets[r] *= sizeof(cElement_t);
-  }
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  MPI_Alltoallv(sendElements, Nsend, sendOffsets, MPI_CHAR,
-                recvElements, Nrecv, recvOffsets, MPI_CHAR, mesh->comm);
-  free(sendElements);
-
-  //write the clusters in the proper order
-  cluster_t* newClusters = (cluster_t*) calloc(newNclusters,sizeof(cluster_t));
-  cElement_t* newElements = (cElement_t*) calloc(newNelements,sizeof(cElement_t));
-  int cnt = 0;
-  for (int n = 0; n < newNclusters; n++) {
-    int id = parallelClusters[n].destId;
-    newClusters[id].Nelements = parallelClusters[n].Nelements;
-    newClusters[id].offSet = parallelClusters[n].destOffset;
-    for (int e = 0; e < parallelClusters[n].Nelements; e++)
-      memcpy(newElements + newClusters[id].offSet + e, recvElements + cnt++, sizeof(cElement_t));
-  }
-  free(recvElements);
-  free(parallelClusters);
-
-  *Nelements = newNelements;
-
-  if (*elements) free(*elements);
-  *elements = newElements;
-
-  return partQuality;
-}
-
-//swap clusters between neighboring processes to try and improve the partitioning
-void balance3D(int rank, int size, MPI_Comm comm,
-               int rankL, int rankR, dfloat* weightL, dfloat* weightR,
-               int* Nclusters, parallelCluster_t** parallelClusters)
-{
-  int tag = 999;
-  MPI_Request recv, send;
-  MPI_Status status;
-
-  if (rank == rankL) {
-    if ( *weightL > *weightR) {
-      //count number of clusters to send to proc
-      int Nsend = 0;
-      for (int cnt = *Nclusters - 1; cnt > -1; cnt--) {
-        dfloat w = (*parallelClusters)[cnt].weight;
-        if ((*weightL - w) >= (*weightR + w)) {
-          //sending this cluster improves the balance3D
-          *weightL -= w;
-          *weightR += w;
-          Nsend++;
-        } else if((*weightL - w) > *weightR) {
-          //sending makes the neighbor have a higher weight, but it improves the balance3D
-          *weightL -= w;
-          *weightR += w;
-          Nsend++;
-          break;
-        } else {
-          break;
-        }
-      }
-
-      MPI_Isend(&Nsend, 1, MPI_INT,  rankR, tag, comm, &send);
-      MPI_Wait(&send, &status);
-
-      if (Nsend) {
-        *Nclusters -= Nsend;
-
-        MPI_Isend((*parallelClusters) + *Nclusters,
-                  Nsend * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankR,
-                  tag,
-                  comm,
-                  &send);
-        MPI_Wait(&send, &status);
-      }
-    } else if ( *weightL < *weightR) {
-      int Nrecv;
-      MPI_Irecv(&Nrecv, 1, MPI_INT,  rankR, tag, comm, &recv);
-      MPI_Wait(&recv, &status);
-
-      if (Nrecv) {
-        parallelCluster_t* newParallelClusters = (parallelCluster_t*) calloc(*Nclusters + Nrecv,
-                                                                             sizeof(
-                                                                               parallelCluster_t));
-        memcpy(newParallelClusters,*parallelClusters,*Nclusters * sizeof(parallelCluster_t));
-
-        MPI_Irecv(newParallelClusters + *Nclusters,
-                  Nrecv * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankR,
-                  tag,
-                  comm,
-                  &recv);
-        MPI_Wait(&recv, &status);
-
-        for (int n = *Nclusters; n < *Nclusters + Nrecv; n++) {
-          dfloat w = newParallelClusters[n].weight;
-          *weightL += w;
-          *weightR -= w;
-        }
-
-        *Nclusters += Nrecv;
-        free(*parallelClusters);
-        *parallelClusters = newParallelClusters;
-      }
-    }
-  } else if (rank == rankR) {
-    if (*weightL < *weightR) {
-      //count number of clusters to send to proc
-      int Nsend = 0;
-      for (int cnt = 0; cnt < *Nclusters; cnt++) {
-        dfloat w = (*parallelClusters)[cnt].weight;
-        if ((*weightR - w) >= (*weightL + w)) {
-          //sending this cluster improves the balance3D
-          *weightR -= w;
-          *weightL += w;
-          Nsend++;
-        } else if((*weightR - w) > *weightL) {
-          //sending makes the neighbor have a higher weight, but it improves the balance3D
-          *weightR -= w;
-          *weightL += w;
-          Nsend++;
-          break;
-        } else {
-          break;
-        }
-      }
-
-      MPI_Isend(&Nsend, 1, MPI_INT,  rankL, tag, comm, &send);
-      MPI_Wait(&send, &status);
-
-      if (Nsend) {
-        *Nclusters -= Nsend;
-        parallelCluster_t* newParallelClusters =
-          (parallelCluster_t*) calloc(*Nclusters,sizeof(parallelCluster_t));
-        memcpy(newParallelClusters,
-               (*parallelClusters) + Nsend,
-               *Nclusters * sizeof(parallelCluster_t));
-
-        MPI_Isend(*parallelClusters,
-                  Nsend * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankL,
-                  tag,
-                  comm,
-                  &send);
-        MPI_Wait(&send, &status);
-
-        free(*parallelClusters);
-        *parallelClusters = newParallelClusters;
-      }
-    } else if (*weightL > *weightR) {
-      int Nrecv;
-      MPI_Irecv(&Nrecv, 1, MPI_INT,  rankL, tag, comm, &recv);
-      MPI_Wait(&recv, &status);
-
-      if (Nrecv) {
-        parallelCluster_t* tmpParallelClusters =
-          (parallelCluster_t*) calloc(Nrecv,sizeof(parallelCluster_t));
-
-        MPI_Irecv(tmpParallelClusters,
-                  Nrecv * sizeof(parallelCluster_t),
-                  MPI_CHAR,
-                  rankL,
-                  tag,
-                  comm,
-                  &recv);
-        MPI_Wait(&recv, &status);
-
-        for (int n = 0; n < Nrecv; n++) {
-          dfloat w = tmpParallelClusters[n].weight;
-          *weightR += w;
-          *weightL -= w;
-        }
-
-        *Nclusters += Nrecv;
-        parallelCluster_t* newParallelClusters =
-          (parallelCluster_t*) calloc(*Nclusters,sizeof(parallelCluster_t));
-        memcpy(newParallelClusters,tmpParallelClusters,Nrecv * sizeof(parallelCluster_t));
-        memcpy(newParallelClusters + Nrecv,*parallelClusters,
-               (*Nclusters - Nrecv) * sizeof(parallelCluster_t));
-
-        free(tmpParallelClusters);
-        free(*parallelClusters);
-        *parallelClusters = newParallelClusters;
-      }
-    }
-  }
-}
-
-dfloat improveClusteredPartition3D(int rank, int size, MPI_Comm comm,
-                                   int* Nclusters, parallelCluster_t** parallelClusters)
-{
-  int tag = 999;
-
-  MPI_Request recv, send;
-  MPI_Status status;
-
-  dfloat* totalWeights = (dfloat*) calloc(size,sizeof(dfloat));
-  dfloat quality;
-
-  while (true) {
-    dfloat localTotalWeight = 0.;
-    for (int n = 0; n < *Nclusters; n++)
-      localTotalWeight += (*parallelClusters)[n].weight;
-
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-
-    dfloat maxTotalWeight, minTotalWeight;
-    MPI_Allreduce(&localTotalWeight, &minTotalWeight, 1, MPI_DFLOAT, MPI_MIN, comm);
-    MPI_Allreduce(&localTotalWeight, &maxTotalWeight, 1, MPI_DFLOAT, MPI_MAX, comm);
-
-    quality = minTotalWeight / maxTotalWeight;
-
-    //ends
-    if ((rank == 0) || (rank == size - 1))
-      balance3D(rank,
-                size,
-                comm,
-                size - 1,
-                0,
-                totalWeights + size - 1,
-                totalWeights + 0,
-                Nclusters,
-                parallelClusters);
-
-    //resync
-    localTotalWeight = totalWeights[rank];
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-
-    //evens
-    if (( (rank % 2) == 0) && (rank + 1 < size))
-      balance3D(rank,
-                size,
-                comm,
-                rank,
-                rank + 1,
-                totalWeights + rank,
-                totalWeights + rank + 1,
-                Nclusters,
-                parallelClusters);
-    if (( (rank % 2) == 1) && (rank - 1 > -1))
-      balance3D(rank,
-                size,
-                comm,
-                rank - 1,
-                rank,
-                totalWeights + rank - 1,
-                totalWeights + rank,
-                Nclusters,
-                parallelClusters);
-
-    //resync
-    localTotalWeight = totalWeights[rank];
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-
-    //odds
-    if (((rank % 2) == 0) && (rank - 1 > -1))
-      balance3D(rank,
-                size,
-                comm,
-                rank - 1,
-                rank,
-                totalWeights + rank - 1,
-                totalWeights + rank,
-                Nclusters,
-                parallelClusters);
-    if (((rank % 2) == 1) && (rank + 1 < size))
-      balance3D(rank,
-                size,
-                comm,
-                rank,
-                rank + 1,
-                totalWeights + rank,
-                totalWeights + rank + 1,
-                Nclusters,
-                parallelClusters);
-
-    //resync
-    localTotalWeight = totalWeights[rank];
-    MPI_Allgather(&localTotalWeight, 1, MPI_DFLOAT, totalWeights, 1, MPI_DFLOAT, comm);
-    MPI_Allreduce(&localTotalWeight, &minTotalWeight, 1, MPI_DFLOAT, MPI_MIN, comm);
-    MPI_Allreduce(&localTotalWeight, &maxTotalWeight, 1, MPI_DFLOAT, MPI_MAX, comm);
-
-    dfloat newQuality = minTotalWeight / maxTotalWeight;
-
-    if (newQuality == quality) break; //no change
-  }
-
-  return quality;
-}
diff --git a/src/libP/src/meshConnectFaceModes2D.c b/src/libP/src/meshConnectFaceModes2D.c
deleted file mode 100644
index 8a1a326e6..000000000
--- a/src/libP/src/meshConnectFaceModes2D.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "mesh2D.h"
-
-// serial face-mode to face-mode connection
-void meshConnectFaceModes2D(mesh2D* mesh, int* faceModes, dfloat* V)
-{
-  /* volume indices of the interior and exterior face modes for each element */
-  mesh->mmapM = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-  mesh->mmapP = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-  mesh->mmapS = (int*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(int));
-
-  dfloat* VM = (dfloat*) calloc(mesh->Np,sizeof(dfloat));
-  dfloat* VP = (dfloat*) calloc(mesh->Np,sizeof(dfloat));
-
-  /* assume elements already connected */
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      dlong eP = mesh->EToE[e * mesh->Nfaces + f];
-      int fP = mesh->EToF[e * mesh->Nfaces + f];
-      if(eP < 0 || fP < 0) { // fake connections for unconnected faces
-        eP = e;
-        fP = f;
-      }
-
-      /* for each mode on this face find the neighbor mode */
-      for(int n = 0; n < mesh->Nfp; ++n) {
-        int m = faceModes[n + f * mesh->Nfp]; //get face mode number
-
-        for (int i = 0; i < mesh->Nfp; i++) {
-          int k = mesh->faceNodes[i + f * mesh->Nfp];
-          VM[i] = V[m + k * mesh->Np]; //evaluate mode at WB nodes on face
-        }
-
-        dfloat mindist = 1E9;
-        int s;
-        int mMatch;
-        for (int nP = 0; nP < mesh->Nfp; nP++) {
-          //test the modes on face fP
-          int mP = faceModes[nP + fP * mesh->Nfp];
-
-          for (int i = 0; i < mesh->Nfp; i++) {
-            //get neighbouring node
-            dlong id = i + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces;
-            int k = mesh->vmapP[id] % mesh->Np;
-
-            VP[i] = V[mP + k * mesh->Np]; //evaluate mode at WB nodes on face
-          }
-
-          dfloat dist1 = 0, dist2 = 0;
-          for (int i = 0; i < mesh->Nfp; i++) {
-            dist1 += pow(VM[i] - VP[i],2);
-            dist2 += pow(VM[i] + VP[i],2);
-          }
-          dist1 = sqrt(dist1);
-          dist2 = sqrt(dist2);
-
-          /* if next node is closer to target update match */
-          if(dist1 < mindist) {
-            mindist = dist1;
-            mMatch = mP;
-            s = 1;
-          }
-          if(dist2 < mindist) {
-            mindist = dist2;
-            mMatch = mP;
-            s = -1;
-          }
-        }
-        if(mindist > 1e-3) printf("arggh - bad match: e="dlongFormat ",f=%d, mode=%d\n", e,f, m);
-
-        dlong id  = mesh->Nfaces * mesh->Nfp * e + f * mesh->Nfp + n;
-        dlong idM = faceModes[f * mesh->Nfp + n] + e * mesh->Np;
-        dlong idP = mMatch + eP * mesh->Np;
-
-        mesh->mmapM[id] = idM;
-        mesh->mmapP[id] = idP;
-        mesh->mmapS[id] = s;
-      }
-    }
-}
\ No newline at end of file
diff --git a/src/libP/src/meshConnectFaceNodes2D.c b/src/libP/src/meshConnectFaceNodes2D.c
deleted file mode 100644
index 0d18a3cc2..000000000
--- a/src/libP/src/meshConnectFaceNodes2D.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "mesh2D.h"
-
-int findBestMatch(dfloat x1, dfloat y1,
-                  int Np2, int* nodeList, dfloat* x2, dfloat* y2, int* nP)
-{
-  int matchIndex = nodeList[0];
-  dfloat mindist2 = pow(x1 - x2[nodeList[0]],2) + pow(y1 - y2[nodeList[0]],2);
-
-  *nP = 0;
-  for(int n = 1; n < Np2; ++n) {
-    /* next node */
-    const int i2 = nodeList[n];
-
-    /* distance between target and next node */
-    const dfloat dist2 = pow(x1 - x2[i2],2) + pow(y1 - y2[i2],2);
-
-    /* if next node is closer to target update match */
-    if(dist2 < mindist2) {
-      mindist2 = dist2;
-      matchIndex = i2;
-      *nP = n;
-    }
-  }
-  // AK. Commneting out for NekRS testing !!!!!
-  // if(mindist2>1e-3) printf("arggh - bad match: x,y=%g,%g\n", x1,y1);
-  return matchIndex;
-}
-
-// serial face-node to face-node connection
-void meshConnectFaceNodes2D(mesh2D* mesh)
-{
-  /* volume indices of the interior and exterior face nodes for each element */
-  mesh->vmapM = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-  mesh->vmapP = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-  mesh->mapP  = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-
-  /* assume elements already connected */
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      dlong eP = mesh->EToE[e * mesh->Nfaces + f];
-      int fP = mesh->EToF[e * mesh->Nfaces + f];
-      if(eP < 0 || fP < 0) { // fake connections for unconnected faces
-        eP = e;
-        fP = f;
-      }
-      /* for each node on this face find the neighbor node */
-      for(int n = 0; n < mesh->Nfp; ++n) {
-        dlong idM = mesh->faceNodes[f * mesh->Nfp + n] + e * mesh->Np;
-        dfloat xM = mesh->x[idM];
-        dfloat yM = mesh->y[idM];
-        dlong id = mesh->Nfaces * mesh->Nfp * e + f * mesh->Nfp + n;
-        int nP;
-
-        int idP = findBestMatch(xM, yM,
-                                mesh->Nfp,
-                                mesh->faceNodes + fP * mesh->Nfp,
-                                mesh->x + eP * mesh->Np,
-                                mesh->y + eP * mesh->Np, &nP);
-
-        mesh->vmapM[id] = idM;
-        mesh->vmapP[id] = idP + eP * mesh->Np;
-        mesh->mapP[id] = eP * mesh->Nfaces * mesh->Nfp + fP * mesh->Nfp + nP;
-      }
-    }
-}
-
-//      printf("connecting (%d,%d) to (%d,%d) [ vmapM %d to vmapP %d ]\n",
-//             e,f,eP,fP, mesh->vmapM[id], mesh->vmapP[id]);
diff --git a/src/libP/src/meshConnectPeriodicFaceNodes2D.c b/src/libP/src/meshConnectPeriodicFaceNodes2D.c
deleted file mode 100644
index 2ee8428b8..000000000
--- a/src/libP/src/meshConnectPeriodicFaceNodes2D.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "mesh2D.h"
-
-int findBestPeriodicMatch(dfloat xper, dfloat yper, dfloat x1, dfloat y1,
-                          int Np2, int* nodeList, dfloat* x2, dfloat* y2, int* nP)
-{
-  int matchIndex;
-  dfloat mindist2 = 1e9;
-  int isFirst = 1;
-
-  for(int n = 0; n < Np2; ++n) {
-    /* next node */
-    const int i2 = nodeList[n];
-    //  for(int zp=0;zp<2;++zp){
-    for(int yp = 0; yp < 2; ++yp)
-      for(int xp = 0; xp < 2; ++xp) {
-        /* distance between target and next node */
-        const dfloat dist2 =
-          pow(fabs(x1 - x2[i2]) - xp * xper,2) +
-          pow(fabs(y1 - y2[i2]) - yp * yper,2);
-
-        /* if next node is closer to target update match */
-        if(isFirst == 1 || dist2 < mindist2) {
-          mindist2 = dist2;
-          matchIndex = i2;
-          *nP = n;
-          isFirst = 0;
-        }
-      }
-    //}
-  }
-  if(mindist2 > 1e-3) printf("arggh - bad match: x,y,z= %g,%g => %g,%g with mindist=%lg\n",
-                             x1,y1, x2[matchIndex], y2[matchIndex], mindist2);
-
-  return matchIndex;
-}
-
-// serial face-node to face-node connection
-void meshConnectPeriodicFaceNodes2D(mesh2D* mesh, dfloat xper, dfloat yper)
-{
-  /* volume indices of the interior and exterior face nodes for each element */
-  mesh->vmapM = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-  mesh->vmapP = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-  mesh->mapP  = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong));
-
-  /* assume elements already connected */
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      dlong eP = mesh->EToE[e * mesh->Nfaces + f];
-      int fP = mesh->EToF[e * mesh->Nfaces + f];
-      if(eP < 0 || fP < 0) { // fake connections for unconnected faces
-        eP = e;
-        fP = f;
-      }
-      /* for each node on this face find the neighbor node */
-      for(int n = 0; n < mesh->Nfp; ++n) {
-        dlong idM = mesh->faceNodes[f * mesh->Nfp + n] + e * mesh->Np;
-        dfloat xM = mesh->x[idM];
-        dfloat yM = mesh->y[idM];
-        int nP;
-
-        int idP = findBestPeriodicMatch(xper, yper,xM, yM,
-                                        mesh->Nfp,
-                                        mesh->faceNodes + fP * mesh->Nfp,
-                                        mesh->x + eP * mesh->Np,
-                                        mesh->y + eP * mesh->Np, &nP);
-
-        dlong id = mesh->Nfaces * mesh->Nfp * e + f * mesh->Nfp + n;
-        mesh->vmapM[id] = idM;
-        mesh->vmapP[id] = idP + eP * mesh->Np;
-        mesh->mapP[id] = eP * mesh->Nfaces * mesh->Nfp + fP * mesh->Nfp + nP;
-      }
-    }
-}
diff --git a/src/libP/src/meshGeometricFactorsQuad2D.c b/src/libP/src/meshGeometricFactorsQuad2D.c
deleted file mode 100644
index 8931efcb9..000000000
--- a/src/libP/src/meshGeometricFactorsQuad2D.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-void meshGeometricFactorsQuad2D(mesh2D* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nvgeo = 7;
-
-  /* note that we have volume geometric factors for each node */
-  mesh->vgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo * mesh->Np, sizeof(dfloat));
-
-  mesh->cubvgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp, sizeof(dfloat));
-
-  /* number of second order geometric factors */
-  mesh->Nggeo = 4;
-  mesh->ggeo = (dfloat*) calloc(mesh->Nelements * mesh->Nggeo * mesh->Np, sizeof(dfloat));
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    /* find vertex indices and physical coordinates */
-    dlong id = e * mesh->Nverts;
-
-    dfloat* xe = mesh->EX + id;
-    dfloat* ye = mesh->EY + id;
-
-    for(int n = 0; n < mesh->Np; ++n) {
-      /* local node coordinates */
-      dfloat rn = mesh->r[n];
-      dfloat sn = mesh->s[n];
-
-      /* Jacobian matrix */
-      dfloat xr = 0.25 * ( (1 - sn) * (xe[1] - xe[0]) + (1 + sn) * (xe[2] - xe[3]) );
-      dfloat xs = 0.25 * ( (1 - rn) * (xe[3] - xe[0]) + (1 + rn) * (xe[2] - xe[1]) );
-      dfloat yr = 0.25 * ( (1 - sn) * (ye[1] - ye[0]) + (1 + sn) * (ye[2] - ye[3]) );
-      dfloat ys = 0.25 * ( (1 - rn) * (ye[3] - ye[0]) + (1 + rn) * (ye[2] - ye[1]) );
-
-      /* compute geometric factors for affine coordinate transform*/
-      dfloat J = xr * ys - xs * yr;
-
-      if(J < 1e-8) {
-        printf("Negative or small Jacobian: %g\n", J);
-        exit(-1);
-      }
-      dfloat rx =  ys / J;
-      dfloat ry = -xs / J;
-      dfloat sx = -yr / J;
-      dfloat sy =  xr / J;
-
-      int i = n % mesh->Nq;
-      int j = n / mesh->Nq;
-      dfloat JW = J * mesh->gllw[i] * mesh->gllw[j];
-
-      /* store geometric factors */
-      mesh->vgeo[mesh->Nvgeo * mesh->Np * e + n + mesh->Np * RXID] = rx;
-      mesh->vgeo[mesh->Nvgeo * mesh->Np * e + n + mesh->Np * RYID] = ry;
-      mesh->vgeo[mesh->Nvgeo * mesh->Np * e + n + mesh->Np * SXID] = sx;
-      mesh->vgeo[mesh->Nvgeo * mesh->Np * e + n + mesh->Np * SYID] = sy;
-      mesh->vgeo[mesh->Nvgeo * mesh->Np * e + n + mesh->Np * JID]  = J;
-      mesh->vgeo[mesh->Nvgeo * mesh->Np * e + n + mesh->Np * JWID] = JW;
-      mesh->vgeo[mesh->Nvgeo * mesh->Np * e + n + mesh->Np * IJWID] = 1. / JW;
-
-      /* store second order geometric factors */
-      mesh->ggeo[mesh->Nggeo * mesh->Np * e + n + mesh->Np * G00ID] = JW * (rx * rx + ry * ry);
-      mesh->ggeo[mesh->Nggeo * mesh->Np * e + n + mesh->Np * G01ID] = JW * (rx * sx + ry * sy);
-      mesh->ggeo[mesh->Nggeo * mesh->Np * e + n + mesh->Np * G11ID] = JW * (sx * sx + sy * sy);
-      mesh->ggeo[mesh->Nggeo * mesh->Np * e + n + mesh->Np * GWJID] = JW;
-    }
-
-    //geometric data for quadrature
-    for(int j = 0; j < mesh->cubNq; ++j)
-      for(int i = 0; i < mesh->cubNq; ++i) {
-        dfloat rn = mesh->cubr[i];
-        dfloat sn = mesh->cubr[j];
-
-        /* Jacobian matrix */
-        dfloat xr = 0.25 * ( (1 - sn) * (xe[1] - xe[0]) + (1 + sn) * (xe[2] - xe[3]) );
-        dfloat xs = 0.25 * ( (1 - rn) * (xe[3] - xe[0]) + (1 + rn) * (xe[2] - xe[1]) );
-        dfloat yr = 0.25 * ( (1 - sn) * (ye[1] - ye[0]) + (1 + sn) * (ye[2] - ye[3]) );
-        dfloat ys = 0.25 * ( (1 - rn) * (ye[3] - ye[0]) + (1 + rn) * (ye[2] - ye[1]) );
-
-        /* compute geometric factors for affine coordinate transform*/
-        dfloat J = xr * ys - xs * yr;
-
-        if(J < 1e-8) {
-          printf("Negative or small Jacobian: %g\n", J);
-          exit(-1);
-        }
-        dfloat rx =  ys / J;
-        dfloat ry = -xs / J;
-        dfloat sx = -yr / J;
-        dfloat sy =  xr / J;
-
-        dfloat JW = J * mesh->cubw[i] * mesh->cubw[j];
-
-        /* store geometric factors */
-        dlong base = mesh->Nvgeo * mesh->cubNp * e + i + j * mesh->cubNq;
-        mesh->cubvgeo[base + mesh->cubNp * RXID] = rx;
-        mesh->cubvgeo[base + mesh->cubNp * RYID] = ry;
-        mesh->cubvgeo[base + mesh->cubNp * SXID] = sx;
-        mesh->cubvgeo[base + mesh->cubNp * SYID] = sy;
-        mesh->cubvgeo[base + mesh->cubNp * JID]  = J;
-        mesh->cubvgeo[base + mesh->cubNp * JWID] = JW;
-        mesh->cubvgeo[base + mesh->cubNp * IJWID] = 1. / JW;
-      }
-  }
-}
diff --git a/src/libP/src/meshGeometricFactorsQuad3D.c b/src/libP/src/meshGeometricFactorsQuad3D.c
deleted file mode 100644
index fd0f66341..000000000
--- a/src/libP/src/meshGeometricFactorsQuad3D.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-// custom geometric factors specialized for 3D quad on sphere
-
-void meshGeometricFactorsQuad3D(mesh_t* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nvgeo = 12; //
-
-  /* note that we have volume geometric factors for each node */
-  mesh->vgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo * mesh->Np, sizeof(dfloat));
-
-  mesh->cubvgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp, sizeof(dfloat));
-
-  // Can be computed on the fly
-  mesh->Nggeo = 7;
-  mesh->ggeo  = (dfloat*) calloc(mesh->Nelements * mesh->Np * mesh->Nggeo, sizeof(dfloat));
-
-  dfloat* cxr = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* cxs = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* cyr = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* cys = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* czr = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* czs = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* cx  = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* cy  = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-  dfloat* cz  = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-
-  for(int e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    for(int n = 0; n < mesh->cubNq * mesh->cubNq; ++n) {
-      cxr[n] = 0;
-      cyr[n] = 0;
-      czr[n] = 0;
-      cxs[n] = 0;
-      cys[n] = 0;
-      czs[n] = 0;
-      cx[n] = 0;
-      cy[n] = 0;
-      cz[n] = 0;
-    }
-
-    for(int j = 0; j < mesh->Nq; ++j)
-      for(int i = 0; i < mesh->Nq; ++i) {
-        dfloat xij = mesh->x[i + j * mesh->Nq + e * mesh->Np];
-        dfloat yij = mesh->y[i + j * mesh->Nq + e * mesh->Np];
-        dfloat zij = mesh->z[i + j * mesh->Nq + e * mesh->Np];
-
-        dfloat xr = 0, yr = 0, zr = 0;
-        dfloat xs = 0, ys = 0, zs = 0;
-
-        for(int n = 0; n < mesh->Nq; ++n) {
-          dfloat Din = mesh->D[i * mesh->Nq + n];
-          dfloat Djn = mesh->D[j * mesh->Nq + n];
-
-          xr += Din * mesh->x[n + j * mesh->Nq + e * mesh->Np];
-          yr += Din * mesh->y[n + j * mesh->Nq + e * mesh->Np];
-          zr += Din * mesh->z[n + j * mesh->Nq + e * mesh->Np];
-
-          xs += Djn * mesh->x[i + n * mesh->Nq + e * mesh->Np];
-          ys += Djn * mesh->y[i + n * mesh->Nq + e * mesh->Np];
-          zs += Djn * mesh->z[i + n * mesh->Nq + e * mesh->Np];
-        }
-
-        {
-          dfloat rx = ys * zij - zs * yij; // dXds x X
-          dfloat ry = zs * xij - xs * zij;
-          dfloat rz = xs * yij - ys * xij;
-
-          dfloat sx = zr * yij - yr * zij; // -dXdr x X
-          dfloat sy = xr * zij - zr * xij;
-          dfloat sz = yr * xij - xr * yij;
-
-          dfloat tx = yr * zs - zr * ys; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
-          dfloat ty = zr * xs - xr * zs;
-          dfloat tz = xr * ys - yr * xs;
-
-          dfloat Gx = tx, Gy = ty, Gz = tz;
-
-          dfloat J = xij * tx + yij * ty + zij * tz;
-
-          if(J < 1e-8) {
-            printf("Negative or small Jacobian: %g\n", J);
-            exit(-1);
-          }
-
-          rx /= J;
-          sx /= J;
-          tx /= J;
-          ry /= J;
-          sy /= J;
-          ty /= J;
-          rz /= J;
-          sz /= J;
-          tz /= J;
-
-          // use this for "volume" Jacobian
-          dfloat Jnew = sqrt(Gx * Gx + Gy * Gy + Gz * Gz); //(difference between actual Jacobian and sphere Jac)
-          J = Jnew;
-
-          if(J < 1e-8) {
-            printf("Negative or small Jacobian: %g\n", J);
-            exit(-1);
-          }
-          //    printf("before: grad r = %g,%g,%g\n", rx, ry, rz);
-        }
-
-        dfloat GG00 = xr * xr + yr * yr + zr * zr;
-        dfloat GG11 = xs * xs + ys * ys + zs * zs;
-        dfloat GG01 = xr * xs + yr * ys + zr * zs;
-        dfloat detGG = GG00 * GG11 - GG01 * GG01;
-
-        // are these tangential
-        dfloat rx = (xr * GG11 - xs * GG01) / detGG;
-        dfloat ry = (yr * GG11 - ys * GG01) / detGG;
-        dfloat rz = (zr * GG11 - zs * GG01) / detGG;
-
-        dfloat sx = (-xr * GG01 + xs * GG00) / detGG;
-        dfloat sy = (-yr * GG01 + ys * GG00) / detGG;
-        dfloat sz = (-zr * GG01 + zs * GG00) / detGG;
-
-        dfloat tx = yr * zs - zr * ys; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
-        dfloat ty = zr * xs - xr * zs;
-        dfloat tz = xr * ys - yr * xs;
-
-        // use this for "volume" Jacobian
-        dfloat J = sqrt(tx * tx + ty * ty + tz * tz); // (difference between actual Jacobian and sphere Jac)
-
-        //  printf("after: grad r = %g,%g,%g\n", rx, ry, rz);
-
-        dfloat JW = J * mesh->gllw[i] * mesh->gllw[j];
-
-        /* store geometric factors */
-        int base = mesh->Nvgeo * mesh->Np * e + j * mesh->Nq + i;
-
-        mesh->vgeo[base + mesh->Np * RXID] = rx;
-        mesh->vgeo[base + mesh->Np * RYID] = ry;
-        mesh->vgeo[base + mesh->Np * RZID] = rz;
-        mesh->vgeo[base + mesh->Np * SXID] = sx;
-        mesh->vgeo[base + mesh->Np * SYID] = sy;
-        mesh->vgeo[base + mesh->Np * SZID] = sz;
-        mesh->vgeo[base + mesh->Np * TXID] = tx;
-        mesh->vgeo[base + mesh->Np * TYID] = ty;
-        mesh->vgeo[base + mesh->Np * TZID] = tz;
-        mesh->vgeo[base + mesh->Np * JID]  = J;
-        mesh->vgeo[base + mesh->Np * JWID] = JW;
-        mesh->vgeo[base + mesh->Np * IJWID] = 1. / JW;
-
-        /* store second order geometric factors (can be computed on the fly, later!!!)*/
-        int gbase = mesh->Nggeo * mesh->Np * e + j * mesh->Nq + i;
-        mesh->ggeo[gbase + mesh->Np * G00ID] = JW * (rx * rx + ry * ry + rz * rz);
-        mesh->ggeo[gbase + mesh->Np * G01ID] = JW * (rx * sx + ry * sy + rz * sz);
-        mesh->ggeo[gbase + mesh->Np * G02ID] = JW * (rx * tx + ry * ty + rz * tz);
-
-        mesh->ggeo[gbase + mesh->Np * G11ID] = JW * (sx * sx + sy * sy + sz * sz);
-        mesh->ggeo[gbase + mesh->Np * G12ID] = JW * (sx * tx + sy * ty + sz * tz);
-
-        mesh->ggeo[gbase + mesh->Np * G22ID] = JW * (tx * tx + ty * ty + tz * tz);
-        mesh->ggeo[gbase + mesh->Np * GWJID] = JW;
-
-        // now do for cubvgeo
-        // 1. interpolate Jacobian matrix to cubature nodes
-        for(int m = 0; m < mesh->cubNq; ++m)
-          for(int n = 0; n < mesh->cubNq; ++n) {
-            dfloat cIni = mesh->cubInterp[n * mesh->Nq + i];
-            dfloat cImj = mesh->cubInterp[m * mesh->Nq + j];
-            cxr[n + m * mesh->cubNq] += cIni * cImj * xr;
-            cxs[n + m * mesh->cubNq] += cIni * cImj * xs;
-            cyr[n + m * mesh->cubNq] += cIni * cImj * yr;
-            cys[n + m * mesh->cubNq] += cIni * cImj * ys;
-            czr[n + m * mesh->cubNq] += cIni * cImj * zr;
-            czs[n + m * mesh->cubNq] += cIni * cImj * zs;
-            cx[n + m * mesh->cubNq] += cIni * cImj * xij;
-            cy[n + m * mesh->cubNq] += cIni * cImj * yij;
-            cz[n + m * mesh->cubNq] += cIni * cImj * zij;
-          }
-      }
-
-
-    for(int n = 0; n < mesh->cubNq * mesh->cubNq; ++n) {
-      dfloat rx = cys[n] * cz[n] - czs[n] * cy[n]; // dXds x X
-      dfloat ry = czs[n] * cx[n] - cxs[n] * cz[n];
-      dfloat rz = cxs[n] * cy[n] - cys[n] * cx[n];
-
-      dfloat sx = czr[n] * cy[n] - cyr[n] * cz[n]; // -dXdr x X
-      dfloat sy = cxr[n] * cz[n] - czr[n] * cx[n];
-      dfloat sz = cyr[n] * cx[n] - cxr[n] * cy[n];
-
-      dfloat tx = cyr[n] * czs[n] - czr[n] * cys[n]; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
-      dfloat ty = czr[n] * cxs[n] - cxr[n] * czs[n];
-      dfloat tz = cxr[n] * cys[n] - cyr[n] * cxs[n];
-
-      dfloat Gx = tx, Gy = ty, Gz = tz;
-
-      dfloat J = cx[n] * tx + cy[n] * ty + cz[n] * tz;
-
-      if(J < 1e-8) {
-        printf("Negative or small Jacobian: %g\n", J);
-        exit(-1);
-      }
-
-      rx /= J;
-      sx /= J;
-      tx /= J;
-      ry /= J;
-      sy /= J;
-      ty /= J;
-      rz /= J;
-      sz /= J;
-      tz /= J;
-
-      // use this for "volume" Jacobian
-      J = sqrt(Gx * Gx + Gy * Gy + Gz * Gz);
-
-      if(J < 1e-8) {
-        printf("Negative or small cubature Jacobian: %g (Gx,y,z=%g,%g,%g)\n",
-               J, Gx, Gy, Gz);
-        exit(-1);
-      }
-
-      dfloat JW = J * mesh->cubw[n % mesh->cubNq] * mesh->cubw[n / mesh->cubNq];
-
-      /* store geometric factors */
-      int base = mesh->Nvgeo * mesh->cubNp * e + n;
-
-      mesh->cubvgeo[base + mesh->cubNp * RXID] = rx;
-      mesh->cubvgeo[base + mesh->cubNp * RYID] = ry;
-      mesh->cubvgeo[base + mesh->cubNp * RZID] = rz;
-      mesh->cubvgeo[base + mesh->cubNp * SXID] = sx;
-      mesh->cubvgeo[base + mesh->cubNp * SYID] = sy;
-      mesh->cubvgeo[base + mesh->cubNp * SZID] = sz;
-      mesh->cubvgeo[base + mesh->cubNp * TXID] = tx;
-      mesh->cubvgeo[base + mesh->cubNp * TYID] = ty;
-      mesh->cubvgeo[base + mesh->cubNp * TZID] = tz;
-      mesh->cubvgeo[base + mesh->cubNp * JID]  = J;
-      mesh->cubvgeo[base + mesh->cubNp * JWID] = JW;
-      mesh->cubvgeo[base + mesh->cubNp * IJWID] = 1. / JW;
-    }
-  }
-}
diff --git a/src/libP/src/meshGeometricFactorsTet3D.c b/src/libP/src/meshGeometricFactorsTet3D.c
deleted file mode 100644
index 879a5dd6b..000000000
--- a/src/libP/src/meshGeometricFactorsTet3D.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-void meshGeometricFactorsTet3D(mesh3D* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nvgeo = 12;
-  mesh->vgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo,
-                                sizeof(dfloat));
-
-  /* number of second order geometric factors */
-  mesh->Nggeo = 7;
-  mesh->ggeo = (dfloat*) calloc(mesh->Nelements * mesh->Nggeo, sizeof(dfloat));
-
-  dfloat minJ = 1e9, maxJ = -1e9;
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    /* find vertex indices and physical coordinates */
-    dlong id = e * mesh->Nverts;
-
-    /* vertex coordinates */
-    dfloat xe1 = mesh->EX[id + 0], ye1 = mesh->EY[id + 0], ze1 = mesh->EZ[id + 0];
-    dfloat xe2 = mesh->EX[id + 1], ye2 = mesh->EY[id + 1], ze2 = mesh->EZ[id + 1];
-    dfloat xe3 = mesh->EX[id + 2], ye3 = mesh->EY[id + 2], ze3 = mesh->EZ[id + 2];
-    dfloat xe4 = mesh->EX[id + 3], ye4 = mesh->EY[id + 3], ze4 = mesh->EZ[id + 3];
-
-    /* Jacobian matrix */
-    dfloat xr = 0.5 * (xe2 - xe1), xs = 0.5 * (xe3 - xe1), xt = 0.5 * (xe4 - xe1);
-    dfloat yr = 0.5 * (ye2 - ye1), ys = 0.5 * (ye3 - ye1), yt = 0.5 * (ye4 - ye1);
-    dfloat zr = 0.5 * (ze2 - ze1), zs = 0.5 * (ze3 - ze1), zt = 0.5 * (ze4 - ze1);
-
-    /* compute geometric factors for affine coordinate transform*/
-    dfloat J = xr * (ys * zt - zs * yt) - yr * (xs * zt - zs * xt) + zr * (xs * yt - ys * xt);
-
-    dfloat rx =  (ys * zt - zs * yt) / J, ry = -(xs * zt - zs * xt) / J,
-           rz =  (xs * yt - ys * xt) / J;
-    dfloat sx = -(yr * zt - zr * yt) / J, sy =  (xr * zt - zr * xt) / J,
-           sz = -(xr * yt - yr * xt) / J;
-    dfloat tx =  (yr * zs - zr * ys) / J, ty = -(xr * zs - zr * xs) / J,
-           tz =  (xr * ys - yr * xs) / J;
-
-    if(J < 0) printf("bugger: got negative geofac\n");
-    minJ = mymin(minJ,J);
-    maxJ = mymax(maxJ,J);
-
-    /* store geometric factors */
-    mesh->vgeo[mesh->Nvgeo * e + RXID] = rx;
-    mesh->vgeo[mesh->Nvgeo * e + RYID] = ry;
-    mesh->vgeo[mesh->Nvgeo * e + RZID] = rz;
-    mesh->vgeo[mesh->Nvgeo * e + SXID] = sx;
-    mesh->vgeo[mesh->Nvgeo * e + SYID] = sy;
-    mesh->vgeo[mesh->Nvgeo * e + SZID] = sz;
-    mesh->vgeo[mesh->Nvgeo * e + TXID] = tx;
-    mesh->vgeo[mesh->Nvgeo * e + TYID] = ty;
-    mesh->vgeo[mesh->Nvgeo * e + TZID] = tz;
-    mesh->vgeo[mesh->Nvgeo * e +  JID] = J;
-    //    printf("geo: %g,%g,%g - %g,%g,%g - %g,%g,%g\n",
-    //     rx,ry,rz, sx,sy,sz, tx,ty,tz);
-
-    /* store second order geometric factors */
-    mesh->ggeo[mesh->Nggeo * e + G00ID] = J * (rx * rx + ry * ry + rz * rz);
-    mesh->ggeo[mesh->Nggeo * e + G01ID] = J * (rx * sx + ry * sy + rz * sz);
-    mesh->ggeo[mesh->Nggeo * e + G02ID] = J * (rx * tx + ry * ty + rz * tz);
-    mesh->ggeo[mesh->Nggeo * e + G11ID] = J * (sx * sx + sy * sy + sz * sz);
-    mesh->ggeo[mesh->Nggeo * e + G12ID] = J * (sx * tx + sy * ty + sz * tz);
-    mesh->ggeo[mesh->Nggeo * e + G22ID] = J * (tx * tx + ty * ty + tz * tz);
-    mesh->ggeo[mesh->Nggeo * e + GWJID] = J;
-  }
-
-  //printf("minJ = %g, maxJ = %g\n", minJ, maxJ);
-}
diff --git a/src/libP/src/meshGeometricFactorsTri2D.c b/src/libP/src/meshGeometricFactorsTri2D.c
deleted file mode 100644
index 485be63ed..000000000
--- a/src/libP/src/meshGeometricFactorsTri2D.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-void meshGeometricFactorsTri2D(mesh2D* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nvgeo = 5;
-  mesh->vgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo,
-                                sizeof(dfloat));
-
-  /* number of second order geometric factors */
-  mesh->Nggeo = 4;
-  mesh->ggeo = (dfloat*) calloc(mesh->Nelements * mesh->Nggeo, sizeof(dfloat));
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    /* find vertex indices and physical coordinates */
-    dlong id = e * mesh->Nverts + 0;
-
-    dfloat xe1 = mesh->EX[id + 0];
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-
-    dfloat ye1 = mesh->EY[id + 0];
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-
-    /* compute geometric factors for affine coordinate transform*/
-    dfloat J = 0.25 * ((xe2 - xe1) * (ye3 - ye1) - (xe3 - xe1) * (ye2 - ye1));
-
-    if(J < 0) printf("bugger: got negative geofac\n");
-    dfloat rx =  (0.5 / J) * (ye3 - ye1);
-    dfloat ry = -(0.5 / J) * (xe3 - xe1);
-    dfloat sx = -(0.5 / J) * (ye2 - ye1);
-    dfloat sy =  (0.5 / J) * (xe2 - xe1);
-
-    /* store geometric factors */
-    mesh->vgeo[mesh->Nvgeo * e + RXID] = rx;
-    mesh->vgeo[mesh->Nvgeo * e + RYID] = ry;
-    mesh->vgeo[mesh->Nvgeo * e + SXID] = sx;
-    mesh->vgeo[mesh->Nvgeo * e + SYID] = sy;
-    mesh->vgeo[mesh->Nvgeo * e +  JID] = J;
-
-    /* store second order geometric factors */
-    mesh->ggeo[mesh->Nggeo * e + G00ID] = J * (rx * rx + ry * ry);
-    mesh->ggeo[mesh->Nggeo * e + G01ID] = J * (rx * sx + ry * sy);
-    mesh->ggeo[mesh->Nggeo * e + G11ID] = J * (sx * sx + sy * sy);
-    mesh->ggeo[mesh->Nggeo * e + GWJID]  = J;
-  }
-}
diff --git a/src/libP/src/meshGeometricFactorsTri3D.c b/src/libP/src/meshGeometricFactorsTri3D.c
deleted file mode 100644
index f76444a65..000000000
--- a/src/libP/src/meshGeometricFactorsTri3D.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-// custom geometric factors specialized for 3D tri on sphere
-
-void meshGeometricFactorsTri3D(mesh_t* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nvgeo = 12; //
-
-  /* note that we have volume geometric factors for each node */
-  mesh->vgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo * mesh->Np, sizeof(dfloat));
-
-  /* number of second order geometric factors */
-  mesh->Nggeo = 7;
-  mesh->ggeo = (dfloat*) calloc(mesh->Nelements * mesh->Nggeo, sizeof(dfloat));
-
-  for(int e = 0; e < mesh->Nelements; ++e) /* for each element */
-
-    for(int n = 0; n < mesh->Np; ++n) {
-      dfloat xn = mesh->x[n + e * mesh->Np];
-      dfloat yn = mesh->y[n + e * mesh->Np];
-      dfloat zn = mesh->z[n + e * mesh->Np];
-
-      dfloat xr = 0, yr = 0, zr = 0;
-      dfloat xs = 0, ys = 0, zs = 0;
-
-      for(int m = 0; m < mesh->Np; ++m) {
-        dfloat Drnm = mesh->Dr[n * mesh->Np + m];
-        dfloat Dsnm = mesh->Ds[n * mesh->Np + m];
-
-        xr += Drnm * mesh->x[m + e * mesh->Np];
-        yr += Drnm * mesh->y[m + e * mesh->Np];
-        zr += Drnm * mesh->z[m + e * mesh->Np];
-
-        xs += Dsnm * mesh->x[m + e * mesh->Np];
-        ys += Dsnm * mesh->y[m + e * mesh->Np];
-        zs += Dsnm * mesh->z[m + e * mesh->Np];
-      }
-
-      dfloat rx = ys * zn - zs * yn; // dXds x X
-      dfloat ry = zs * xn - xs * zn;
-      dfloat rz = xs * yn - ys * xn;
-
-      dfloat sx = zr * yn - yr * zn; // -dXdr x X
-      dfloat sy = xr * zn - zr * xn;
-      dfloat sz = yr * xn - xr * yn;
-
-      dfloat tx = yr * zs - zr * ys; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
-      dfloat ty = zr * xs - xr * zs;
-      dfloat tz = xr * ys - yr * xs;
-
-      dfloat Gx = tx, Gy = ty, Gz = tz;
-
-      dfloat J = xn * tx + yn * ty + zn * tz;
-
-      if(J < 1e-8) {
-        printf("Negative or small Jacobian: %g\n", J);
-        exit(-1);
-      }
-
-      rx /= J;
-      ry /= J;
-      rz /= J;
-
-      sx /= J;
-      sy /= J;
-      sz /= J;
-
-      tx /= J;
-      ty /= J;
-      tz /= J;
-
-      // use this for "volume" Jacobian
-      J = sqrt(Gx * Gx + Gy * Gy + Gz * Gz);
-
-      if(J < 1e-8) {
-        printf("Negative or small Jacobian: %g\n", J);
-        exit(-1);
-      }
-
-      /* store geometric factors */
-      int base = mesh->Nvgeo * mesh->Np * e + n;
-
-      mesh->vgeo[base + mesh->Np * RXID] = rx;
-      mesh->vgeo[base + mesh->Np * RYID] = ry;
-      mesh->vgeo[base + mesh->Np * RZID] = rz;
-      mesh->vgeo[base + mesh->Np * SXID] = sx;
-      mesh->vgeo[base + mesh->Np * SYID] = sy;
-      mesh->vgeo[base + mesh->Np * SZID] = sz;
-      mesh->vgeo[base + mesh->Np * TXID] = tx;
-      mesh->vgeo[base + mesh->Np * TYID] = ty;
-      mesh->vgeo[base + mesh->Np * TZID] = tz;
-      mesh->vgeo[base + mesh->Np * JID]  = J;
-
-      mesh->ggeo[mesh->Nggeo * e + G00ID] = J * (rx * rx + ry * ry + rz * rz);
-      mesh->ggeo[mesh->Nggeo * e + G01ID] = J * (rx * sx + ry * sy + rz * sz);
-      mesh->ggeo[mesh->Nggeo * e + G11ID] = J * (sx * sx + sy * sy + sz * sz);
-      mesh->ggeo[mesh->Nggeo * e + GWJID]  = J;
-    }
-}
diff --git a/src/libP/src/meshGeometricPartition2D.c b/src/libP/src/meshGeometricPartition2D.c
deleted file mode 100644
index 8746e31d1..000000000
--- a/src/libP/src/meshGeometricPartition2D.c
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh2D.h"
-
-#define bitRange 15
-
-#if 0
-
-/// THIS SECTION ------------------------------------------------------------------------------------>
-// taken from: http://and-what-happened.blogspot.com/2011/08/fast-2d-and-3d-hilbert-curves-and.html
-
-unsigned int Morton_2D_Encode_16bit( unsigned int index1, unsigned int index2 )
-{ // pack 2 16-bit indices into a 32-bit Morton code
-  index1 &= 0x0000ffff;
-  index2 &= 0x0000ffff;
-  index1 |= ( index1 << 8 );
-  index2 |= ( index2 << 8 );
-  index1 &= 0x00ff00ff;
-  index2 &= 0x00ff00ff;
-  index1 |= ( index1 << 4 );
-  index2 |= ( index2 << 4 );
-  index1 &= 0x0f0f0f0f;
-  index2 &= 0x0f0f0f0f;
-  index1 |= ( index1 << 2 );
-  index2 |= ( index2 << 2 );
-  index1 &= 0x33333333;
-  index2 &= 0x33333333;
-  index1 |= ( index1 << 1 );
-  index2 |= ( index2 << 1 );
-  index1 &= 0x55555555;
-  index2 &= 0x55555555;
-  return index1 | ( index2 << 1 );
-}
-
-unsigned int MortonToHilbert2D( const unsigned int morton, const unsigned int bits )
-{
-  unsigned int hilbert = 0;
-  unsigned int remap = 0xb4;
-  unsigned int block = ( bits << 1 );
-  while( block ) {
-    block -= 2;
-    unsigned int mcode = ( ( morton >> block ) & 3 );
-    unsigned int hcode = ( ( remap >> ( mcode << 1 ) ) & 3 );
-    remap ^= ( 0x82000028 >> ( hcode << 3 ) );
-    hilbert = ( ( hilbert << 2 ) + hcode );
-  }
-  return hilbert;
-}
-
-unsigned int hilbert2D(unsigned int index1, unsigned int index2)
-{
-  unsigned int morton = Morton_2D_Encode_16bit(index1,index2);
-
-  return MortonToHilbert2D(morton, 16);
-}
-
-/// THIS SECTION TO HERE <--------------------------------------------------------------------------------
-
-// spread bits of i by introducing zeros between binary bits
-unsigned long long int bitSplitter(unsigned int i)
-{
-  unsigned long long int mask = 1;
-  unsigned long long int li = i;
-  unsigned long long int lj = 0;
-
-  for(int b = 0; b < bitRange; ++b) {
-    lj |=  (li & mask) << b;
-    mask <<= 1;
-  }
-
-  return lj;
-}
-
-// compute Morton index of (ix,iy) relative to a bitRange x bitRange  Morton lattice
-unsigned long long int mortonIndex2D(unsigned int ix, unsigned int iy)
-{
-  // spread bits of ix apart (introduce zeros)
-  unsigned long long int sx = bitSplitter(ix);
-  unsigned long long int sy = bitSplitter(iy);
-
-  // interleave bits of ix and iy
-  unsigned long long int mi = sx | (sy << 1);
-
-  return mi;
-}
-
-#else  /* if 0 */
-
-// from: https://en.wikipedia.org/wiki/Hilbert_curve
-
-//rotate/flip a quadrant appropriately
-void rot(unsigned int n, unsigned int* x, unsigned int* y, unsigned int rx, unsigned int ry)
-{
-  if (ry == 0) {
-    if (rx == 1) {
-      *x = n - 1 - *x;
-      *y = n - 1 - *y;
-    }
-
-    //Swap x and y
-    int t  = *x;
-    *x = *y;
-    *y = t;
-  }
-}
-
-//convert (x,y) to d
-unsigned int hilbert2D (unsigned int n, unsigned int x, unsigned int y)
-{
-  unsigned int rx, ry, s, d = 0;
-  for (s = n / 2; s > 0; s /= 2) {
-    rx = (x & s) > 0;
-    ry = (y & s) > 0;
-    d += s * s * ((3 * rx) ^ ry);
-    rot(s, &x, &y, rx, ry);
-  }
-  return d;
-}
-
-#endif
-
-// capsule for element vertices + Morton index
-typedef struct
-{
-  unsigned long long int index;
-
-  dlong element;
-
-  int type;
-
-  // 4 for maximum number of vertices per element in 2D
-  hlong v[4];
-
-  dfloat EX[4], EY[4];
-}element_t;
-
-// compare the Morton indices for two element capsules
-int compareElements2D(const void* a, const void* b)
-{
-  element_t* ea = (element_t*) a;
-  element_t* eb = (element_t*) b;
-
-  if(ea->index < eb->index) return -1;
-  if(ea->index > eb->index) return 1;
-
-  return 0;
-}
-
-// stub for the match function needed by parallelSort
-void bogusMatch(void* a, void* b){ }
-
-// geometric partition of elements in 2D mesh using Morton ordering + parallelSort
-void meshGeometricPartition2D(mesh2D* mesh)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  dlong maxNelements;
-  MPI_Allreduce(&(mesh->Nelements), &maxNelements, 1, MPI_DLONG, MPI_MAX, mesh->comm);
-  maxNelements = 2 * ((maxNelements + 1) / 2);
-
-  // fix maxNelements
-  element_t* elements
-    = (element_t*) calloc(maxNelements, sizeof(element_t));
-
-  // local bounding box of element centers
-  dfloat mincx = 1e9, maxcx = -1e9;
-  dfloat mincy = 1e9, maxcy = -1e9;
-
-  // compute element centers on this process
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    dfloat cx = 0, cy = 0;
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      cx += mesh->EX[e * mesh->Nverts + n];
-      cy += mesh->EY[e * mesh->Nverts + n];
-    }
-    cx /= mesh->Nverts;
-    cy /= mesh->Nverts;
-
-    mincx = mymin(mincx, cx);
-    maxcx = mymax(maxcx, cx);
-    mincy = mymin(mincy, cy);
-    maxcy = mymax(maxcy, cy);
-  }
-
-  dfloat delta = 1e-1;
-  mincx -= delta;
-  mincy -= delta;
-  maxcx += delta;
-  maxcy += delta;
-
-  // find global bounding box of element centers
-  dfloat gmincx, gmincy, gmaxcx, gmaxcy;
-  MPI_Allreduce(&mincx, &gmincx, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&mincy, &gmincy, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&maxcx, &gmaxcx, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-  MPI_Allreduce(&maxcy, &gmaxcy, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-
-  dfloat maxlength = mymax(gmaxcx - gmincx, gmaxcy - gmincy);
-
-  // choose sub-range of Morton lattice coordinates to embed element centers in
-  unsigned int Nboxes = (((unsigned int)1) << (bitRange));
-
-  // compute Morton index for each element
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    // element center coordinates
-    dfloat cx = 0, cy = 0;
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      cx += mesh->EX[e * mesh->Nverts + n];
-      cy += mesh->EY[e * mesh->Nverts + n];
-    }
-    cx /= mesh->Nverts;
-    cy /= mesh->Nverts;
-
-    // encapsulate element, vertices, Morton index, vertex coordinates
-    elements[e].element = e;
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      elements[e].v[n] = mesh->EToV[e * mesh->Nverts + n];
-      elements[e].EX[n] = mesh->EX[e * mesh->Nverts + n];
-      elements[e].EY[n] = mesh->EY[e * mesh->Nverts + n];
-    }
-
-    elements[e].type = mesh->elementInfo[e];
-
-    unsigned int ix = (cx - gmincx) * Nboxes / maxlength;
-    unsigned int iy = (cy - gmincy) * Nboxes / maxlength;
-
-    //elements[e].index = mortonIndex2D(ix, iy);
-    elements[e].index = hilbert2D(Nboxes, ix, iy);
-  }
-
-  // pad element array with dummy elements
-  for(dlong e = mesh->Nelements; e < maxNelements; ++e) {
-    elements[e].element = -1;
-
-    elements[e].index = hilbert2D(Nboxes, Nboxes - 1, Nboxes - 1);
-
-    //    elements[e].index = hilbert2D(Nboxes+1, Nboxes+1);
-    //    elements[e].index = mortonIndex2D(Nboxes+1, Nboxes+1);
-  }
-
-  // odd-even parallel sort of element capsules based on their Morton index
-  parallelSort(mesh->size, mesh->rank, mesh->comm,
-               maxNelements, elements, sizeof(element_t),
-               compareElements2D,
-               bogusMatch);
-
-  // compress and renumber elements
-  dlong sk  = 0;
-  for(dlong e = 0; e < maxNelements; ++e)
-    if(elements[e].element != -1) {
-      elements[sk] = elements[e];
-      ++sk;
-    }
-
-  dlong localNelements = sk;
-
-  /// redistribute elements to improve balancing
-  // TODO: We need a safer version of this for very large meshes.
-  // if dlong is a long long int Nsend and/or sendOffsets may overflow int
-  dlong* globalNelements = (dlong*) calloc(size,sizeof(dlong));
-  hlong* starts = (hlong*) calloc(size + 1,sizeof(hlong));
-
-  MPI_Allgather(&localNelements, 1, MPI_DLONG, globalNelements, 1,  MPI_DLONG, mesh->comm);
-
-  for(int r = 0; r < size; ++r)
-    starts[r + 1] = starts[r] + globalNelements[r];
-
-  hlong allNelements = starts[size];
-
-  // decide how many to keep on each process
-  hlong chunk = allNelements / size;
-  int remainder = (int) (allNelements - chunk * size);
-
-  int* Nsend = (int*) calloc(size, sizeof(int));
-  int* Nrecv = (int*) calloc(size, sizeof(int));
-  // int *Ncount = (int *) calloc(size, sizeof(int));
-  int* sendOffsets = (int*) calloc(size, sizeof(int));
-  int* recvOffsets = (int*) calloc(size, sizeof(int));
-
-  // Make the MPI_ELEMENT_T data type
-  MPI_Datatype MPI_ELEMENT_T;
-  MPI_Datatype dtype[6] = {MPI_LONG_LONG_INT, MPI_DLONG, MPI_INT,
-                           MPI_HLONG, MPI_DFLOAT, MPI_DFLOAT};
-  int blength[6] = {1, 1, 1, 4, 4, 4};
-  MPI_Aint addr[6], displ[6];
-  MPI_Get_address ( &(elements[0]        ), addr + 0);
-  MPI_Get_address ( &(elements[0].element), addr + 1);
-  MPI_Get_address ( &(elements[0].type   ), addr + 2);
-  MPI_Get_address ( &(elements[0].v[0]   ), addr + 3);
-  MPI_Get_address ( &(elements[0].EX[0]  ), addr + 4);
-  MPI_Get_address ( &(elements[0].EY[0]  ), addr + 5);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  displ[5] = addr[5] - addr[0];
-  MPI_Type_create_struct (6, blength, displ, dtype, &MPI_ELEMENT_T);
-  MPI_Type_commit (&MPI_ELEMENT_T);
-
-  for(dlong e = 0; e < localNelements; ++e) {
-    // global element index
-    elements[e].element = starts[rank] + e;
-
-    // 0, chunk+1, 2*(chunk+1) ..., remainder*(chunk+1), remainder*(chunk+1) + chunk
-    int r;
-    if(elements[e].element < remainder * (chunk + 1))
-      r = elements[e].element / (chunk + 1);
-    else
-      r = remainder + ((elements[e].element - remainder * (chunk + 1)) / chunk);
-
-    ++Nsend[r];
-  }
-
-  // find send offsets
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  // count incoming clusters
-  dlong newNelements = 0;
-  for(int r = 0; r < size; ++r)
-    newNelements += Nrecv[r];
-
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  element_t* tmpElements = (element_t*) calloc(newNelements, sizeof(element_t));
-
-  // exchange parallel clusters
-  MPI_Alltoallv(elements, Nsend, sendOffsets, MPI_ELEMENT_T,
-                tmpElements, Nrecv, recvOffsets, MPI_ELEMENT_T, mesh->comm);
-
-  MPI_Barrier(mesh->comm);
-  MPI_Type_free(&MPI_ELEMENT_T);
-
-  // replace elements with inbound elements
-  if (elements) free(elements);
-  elements = tmpElements;
-
-  // reset number of elements and element-to-vertex connectivity from returned capsules
-  free(mesh->EToV);
-  free(mesh->EX);
-  free(mesh->EY);
-  free(mesh->elementInfo);
-
-  mesh->Nelements = newNelements;
-  mesh->EToV = (hlong*) calloc(newNelements * mesh->Nverts, sizeof(hlong));
-  mesh->EX = (dfloat*) calloc(newNelements * mesh->Nverts, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(newNelements * mesh->Nverts, sizeof(dfloat));
-  mesh->elementInfo = (hlong*) calloc(newNelements, sizeof(hlong));
-
-  for(dlong e = 0; e < newNelements; ++e) {
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      mesh->EToV[e * mesh->Nverts + n] = elements[e].v[n];
-      mesh->EX[e * mesh->Nverts + n]   = elements[e].EX[n];
-      mesh->EY[e * mesh->Nverts + n]   = elements[e].EY[n];
-    }
-    mesh->elementInfo[e] = elements[e].type;
-  }
-  if (elements) free(elements);
-}
diff --git a/src/libP/src/meshGeometricPartition3D.c b/src/libP/src/meshGeometricPartition3D.c
deleted file mode 100644
index cfd78c012..000000000
--- a/src/libP/src/meshGeometricPartition3D.c
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "mesh3D.h"
-
-// 20 bits per coordinate
-#define bitRange 20
-
-// spread bits of i by introducing zeros between binary bits
-unsigned long long int bitSplitter3D(unsigned int i)
-{
-  unsigned long long int mask = 1;
-  unsigned long long int li = i;
-  unsigned long long int lj = 0;
-
-  for(int b = 0; b < bitRange; ++b) {
-    lj |= ((li & mask) << 2 * b); // bit b moves to bit 3b
-    mask <<= 1;
-  }
-
-  return lj;
-}
-
-// compute Morton index of (ix,iy) relative to a bitRange x bitRange  Morton lattice
-unsigned long long int mortonIndex3D(unsigned int ix, unsigned int iy, unsigned int iz)
-{
-  // spread bits of ix apart (introduce zeros)
-  unsigned long long int sx = bitSplitter3D(ix);
-  unsigned long long int sy = bitSplitter3D(iy);
-  unsigned long long int sz = bitSplitter3D(iz);
-
-  // interleave bits of ix and iy
-  unsigned long long int mi = sx | (sy << 1) | (sz << 2);
-
-  return mi;
-}
-
-// capsule for element vertices + Morton index
-typedef struct
-{
-  unsigned long long int index;
-
-  dlong element;
-
-  int type;
-
-  // use 8 for maximum vertices per element
-  hlong v[8];
-
-  dfloat EX[8], EY[8], EZ[8];
-}element_t;
-
-// compare the Morton indices for two element capsules
-int compareElements(const void* a, const void* b)
-{
-  element_t* ea = (element_t*) a;
-  element_t* eb = (element_t*) b;
-
-  if(ea->index < eb->index) return -1;
-  if(ea->index > eb->index) return 1;
-
-  return 0;
-}
-
-// stub for the match function needed by parallelSort
-void bogusMatch3D(void* a, void* b){ }
-
-// geometric partition of elements in 3D mesh using Morton ordering + parallelSort
-void meshGeometricPartition3D(mesh3D* mesh)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  dlong maxNelements;
-  MPI_Allreduce(&(mesh->Nelements), &maxNelements, 1, MPI_DLONG, MPI_MAX,
-                mesh->comm);
-  maxNelements = 2 * ((maxNelements + 1) / 2);
-
-  // fix maxNelements
-  element_t* elements
-    = (element_t*) calloc(maxNelements, sizeof(element_t));
-
-  // local bounding box of element centers
-  dfloat minvx = 1e9, maxvx = -1e9;
-  dfloat minvy = 1e9, maxvy = -1e9;
-  dfloat minvz = 1e9, maxvz = -1e9;
-
-  // compute element centers on this process
-  for(dlong n = 0; n < mesh->Nverts * mesh->Nelements; ++n) {
-    minvx = mymin(minvx, mesh->EX[n]);
-    maxvx = mymax(maxvx, mesh->EX[n]);
-    minvy = mymin(minvy, mesh->EY[n]);
-    maxvy = mymax(maxvy, mesh->EY[n]);
-    minvz = mymin(minvz, mesh->EZ[n]);
-    maxvz = mymax(maxvz, mesh->EZ[n]);
-  }
-
-  // find global bounding box of element centers
-  dfloat gminvx, gminvy, gminvz, gmaxvx, gmaxvy, gmaxvz;
-  MPI_Allreduce(&minvx, &gminvx, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&minvy, &gminvy, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&minvz, &gminvz, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&maxvx, &gmaxvx, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-  MPI_Allreduce(&maxvy, &gmaxvy, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-  MPI_Allreduce(&maxvz, &gmaxvz, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
-
-  // choose sub-range of Morton lattice coordinates to embed element centers in
-  unsigned long long int Nboxes = (((unsigned long long int)1) << (bitRange - 1));
-
-  // compute Morton index for each element
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    // element center coordinates
-    dfloat cx = 0, cy = 0, cz = 0;
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      cx += mesh->EX[e * mesh->Nverts + n];
-      cy += mesh->EY[e * mesh->Nverts + n];
-      cz += mesh->EZ[e * mesh->Nverts + n];
-    }
-    cx /= mesh->Nverts;
-    cy /= mesh->Nverts;
-    cz /= mesh->Nverts;
-
-    // encapsulate element, vertices, Morton index, vertex coordinates
-    elements[e].element = e;
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      elements[e].v[n] = mesh->EToV[e * mesh->Nverts + n];
-      elements[e].EX[n] = mesh->EX[e * mesh->Nverts + n];
-      elements[e].EY[n] = mesh->EY[e * mesh->Nverts + n];
-      elements[e].EZ[n] = mesh->EZ[e * mesh->Nverts + n];
-    }
-
-    elements[e].type = mesh->elementInfo[e];
-
-    dfloat maxlength = mymax(gmaxvx - gminvx, mymax(gmaxvy - gminvy, gmaxvz - gminvz));
-
-    // avoid stretching axes
-    unsigned long long int ix = (cx - gminvx) * Nboxes / maxlength;
-    unsigned long long int iy = (cy - gminvy) * Nboxes / maxlength;
-    unsigned long long int iz = (cz - gminvz) * Nboxes / maxlength;
-
-    elements[e].index = mortonIndex3D(ix, iy, iz);
-  }
-
-  // pad element array with dummy elements
-  for(dlong e = mesh->Nelements; e < maxNelements; ++e) {
-    elements[e].element = -1;
-    elements[e].index = mortonIndex3D(Nboxes + 1, Nboxes + 1, Nboxes + 1);
-  }
-
-  // odd-even parallel sort of element capsules based on their Morton index
-  parallelSort(mesh->size, mesh->rank, mesh->comm,
-               maxNelements, elements, sizeof(element_t),
-               compareElements,
-               bogusMatch3D);
-
-#if 0
-  // count number of elements that end up on this process
-  int cnt = 0;
-  for(int e = 0; e < maxNelements; ++e)
-    cnt += (elements[e].element != -1);
-
-  // reset number of elements and element-to-vertex connectivity from returned capsules
-  free(mesh->EToV);
-  free(mesh->EX);
-  free(mesh->EY);
-  free(mesh->EZ);
-
-  mesh->Nelements = cnt;
-  mesh->EToV = (int*) calloc(cnt * mesh->Nverts, sizeof(int));
-  mesh->EX = (dfloat*) calloc(cnt * mesh->Nverts, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(cnt * mesh->Nverts, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(cnt * mesh->Nverts, sizeof(dfloat));
-
-  cnt = 0;
-  for(int e = 0; e < maxNelements; ++e)
-    if(elements[e].element != -1) {
-      for(int n = 0; n < mesh->Nverts; ++n) {
-        mesh->EToV[cnt * mesh->Nverts + n] = elements[e].v[n];
-        mesh->EX[cnt * mesh->Nverts + n]   = elements[e].EX[n];
-        mesh->EY[cnt * mesh->Nverts + n]   = elements[e].EY[n];
-        mesh->EZ[cnt * mesh->Nverts + n]   = elements[e].EZ[n];
-      }
-      ++cnt;
-    }
-
-#else  /* if 0 */
-  // compress and renumber elements
-  dlong sk  = 0;
-  for(dlong e = 0; e < maxNelements; ++e)
-    if(elements[e].element != -1) {
-      elements[sk] = elements[e];
-      ++sk;
-    }
-
-  dlong localNelements = sk;
-
-  /// redistribute elements to improve balancing
-  dlong* globalNelements = (dlong*) calloc(size,sizeof(dlong));
-  hlong* starts = (hlong*) calloc(size + 1,sizeof(hlong));
-
-  MPI_Allgather(&localNelements, 1, MPI_DLONG, globalNelements, 1,  MPI_DLONG, mesh->comm);
-
-  for(int r = 0; r < size; ++r)
-    starts[r + 1] = starts[r] + globalNelements[r];
-
-  hlong allNelements = starts[size];
-
-  // decide how many to keep on each process
-  hlong chunk = allNelements / size;
-  int remainder = (int) (allNelements - chunk * size);
-
-  int* Nsend = (int*) calloc(size, sizeof(int));
-  int* Nrecv = (int*) calloc(size, sizeof(int));
-  // int *Ncount = (int *) calloc(size, sizeof(int));
-  int* sendOffsets = (int*) calloc(size, sizeof(int));
-  int* recvOffsets = (int*) calloc(size, sizeof(int));
-
-  // Make the MPI_ELEMENT_T data type
-  MPI_Datatype MPI_ELEMENT_T;
-  MPI_Datatype dtype[7] = {MPI_LONG_LONG_INT, MPI_DLONG, MPI_INT,
-                           MPI_HLONG, MPI_DFLOAT, MPI_DFLOAT, MPI_DFLOAT};
-  int blength[7] = {1, 1, 1, 8, 8, 8, 8};
-  MPI_Aint addr[7], displ[7];
-  MPI_Get_address ( &(elements[0]        ), addr + 0);
-  MPI_Get_address ( &(elements[0].element), addr + 1);
-  MPI_Get_address ( &(elements[0].type   ), addr + 2);
-  MPI_Get_address ( &(elements[0].v[0]   ), addr + 3);
-  MPI_Get_address ( &(elements[0].EX[0]  ), addr + 4);
-  MPI_Get_address ( &(elements[0].EY[0]  ), addr + 5);
-  MPI_Get_address ( &(elements[0].EZ[0]  ), addr + 6);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  displ[5] = addr[5] - addr[0];
-  displ[6] = addr[6] - addr[0];
-  MPI_Type_create_struct (7, blength, displ, dtype, &MPI_ELEMENT_T);
-  MPI_Type_commit (&MPI_ELEMENT_T);
-
-  for(dlong e = 0; e < localNelements; ++e) {
-    // global element index
-    elements[e].element = starts[rank] + e;
-
-    // 0, chunk+1, 2*(chunk+1) ..., remainder*(chunk+1), remainder*(chunk+1) + chunk
-    int r;
-    if(elements[e].element < remainder * (chunk + 1))
-      r = elements[e].element / (chunk + 1);
-    else
-      r = remainder + ((elements[e].element - remainder * (chunk + 1)) / chunk);
-
-    ++Nsend[r];
-  }
-
-  // find send offsets
-  for(int r = 1; r < size; ++r)
-    sendOffsets[r] = sendOffsets[r - 1] + Nsend[r - 1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, mesh->comm);
-
-  // count incoming clusters
-  dlong newNelements = 0;
-  for(int r = 0; r < size; ++r)
-    newNelements += Nrecv[r];
-
-  for(int r = 1; r < size; ++r)
-    recvOffsets[r] = recvOffsets[r - 1] + Nrecv[r - 1];
-
-  element_t* tmpElements = (element_t*) calloc(newNelements, sizeof(element_t));
-
-  // exchange parallel clusters
-  MPI_Alltoallv(elements, Nsend, sendOffsets, MPI_ELEMENT_T,
-                tmpElements, Nrecv, recvOffsets, MPI_ELEMENT_T, mesh->comm);
-
-  MPI_Barrier(mesh->comm);
-  MPI_Type_free(&MPI_ELEMENT_T);
-
-  // replace elements with inbound elements
-  if (elements) free(elements);
-  elements = tmpElements;
-
-  // reset number of elements and element-to-vertex connectivity from returned capsules
-  free(mesh->EToV);
-  free(mesh->EX);
-  free(mesh->EY);
-  free(mesh->EZ);
-  free(mesh->elementInfo);
-
-  mesh->Nelements = newNelements;
-  mesh->EToV = (hlong*) calloc(newNelements * mesh->Nverts, sizeof(hlong));
-  mesh->EX = (dfloat*) calloc(newNelements * mesh->Nverts, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(newNelements * mesh->Nverts, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(newNelements * mesh->Nverts, sizeof(dfloat));
-  mesh->elementInfo = (hlong*) calloc(newNelements, sizeof(hlong));
-
-  for(dlong e = 0; e < newNelements; ++e) {
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      mesh->EToV[e * mesh->Nverts + n] = elements[e].v[n];
-      mesh->EX[e * mesh->Nverts + n]   = elements[e].EX[n];
-      mesh->EY[e * mesh->Nverts + n]   = elements[e].EY[n];
-      mesh->EZ[e * mesh->Nverts + n]   = elements[e].EZ[n];
-    }
-    mesh->elementInfo[e] = elements[e].type;
-  }
-  if (elements) free(elements);
-#endif
-}
diff --git a/src/libP/src/meshGradientTensorProductQuad2D.c b/src/libP/src/meshGradientTensorProductQuad2D.c
deleted file mode 100644
index da16c4c79..000000000
--- a/src/libP/src/meshGradientTensorProductQuad2D.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-// baseline tensor product mesh Gradient for quadrilateral elements
-void meshGradientTensorProductQuad2D(mesh2D* mesh,
-                                     dfloat* q,
-                                     dfloat* dqdx,
-                                     dfloat* dqdy)
-{
-  // loop over elements
-  for(int e = 0; e < mesh->Nelements; ++e)
-
-    // compute gradient at each node
-    for(int j = 0; j < mesh->N + 1; ++j)
-      for(int i = 0; i < mesh->N + 1; ++i) {
-        // local node index
-        int n = i + (mesh->N + 1) * j;
-
-        // load geometric factors
-        int gid = mesh->Np * mesh->Nvgeo * e + n;
-        float drdx = vgeo[gid + mesh->Np * RXID];
-        float drdy = vgeo[gid + mesh->Np * RYID];
-        float dsdx = vgeo[gid + mesh->Np * SXID];
-        float dsdy = vgeo[gid + mesh->Np * SYID];
-
-        // matrix-vector multiplies
-        dfloat dqdr = 0, dqds = 0;
-        for(int m = 0; m < mesh->N + 1; ++m) {
-          dqdr += mesh->D[i * (mesh->N + 1) + m] * q[m + j * (mesh->N + 1) + e * mesh->Np];
-          dqds += mesh->D[j * (mesh->N + 1) + m] * q[i + m * (mesh->N + 1) + e * mesh->Np];
-        }
-
-        // chain rule
-        dqdx[n + e * mesh->Np] = drdx * dqdr + dsdx * dqds;
-        dqdy[n + e * mesh->Np] = drdy * dqdr + dsdy * dqds;
-      }
-}
diff --git a/src/libP/src/meshLoadReferenceNodesQuad2D.c b/src/libP/src/meshLoadReferenceNodesQuad2D.c
deleted file mode 100644
index 2eaa4def0..000000000
--- a/src/libP/src/meshLoadReferenceNodesQuad2D.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-void meshLoadReferenceNodesQuad2D(mesh2D* mesh, int N)
-{
-  char fname[BUFSIZ];
-  sprintf(fname, DHOLMES "/nodes/quadrilateralN%02d.dat", N);
-
-  FILE* fp = fopen(fname, "r");
-
-  if (!fp) {
-    printf("ERROR: Cannot open file: '%s'\n", fname);
-    exit(-1);
-  }
-
-  mesh->N = N;
-  mesh->Nfp = N + 1;
-  mesh->Nq = (N + 1);
-  mesh->Np = (N + 1) * (N + 1);
-
-  int Nrows, Ncols;
-
-  /* Nodal Data */
-  readDfloatArray(fp, "Nodal r-coordinates", &(mesh->r),&Nrows,&Ncols);
-  readDfloatArray(fp, "Nodal s-coordinates", &(mesh->s),&Nrows,&Ncols);
-  readDfloatArray(fp, "Nodal Dr differentiation matrix", &(mesh->Dr), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Ds differentiation matrix", &(mesh->Ds), &Nrows, &Ncols);
-  readIntArray   (fp, "Nodal Face nodes", &(mesh->faceNodes), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Lift Matrix", &(mesh->LIFT), &Nrows, &Ncols);
-
-  readDfloatArray(fp, "Nodal 1D GLL Nodes", &(mesh->gllz), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal 1D GLL Weights", &(mesh->gllw), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal 1D differentiation matrix", &(mesh->D), &Nrows, &Ncols);
-
-  readDfloatArray(fp, "1D degree raise matrix", &(mesh->interpRaise), &Nrows, &Ncols);
-  readDfloatArray(fp, "1D degree lower matrix", &(mesh->interpLower), &Nrows, &Ncols);
-
-  /* Plotting data */
-  readDfloatArray(fp, "Plotting r-coordinates", &(mesh->plotR),&Nrows,&Ncols);
-  readDfloatArray(fp, "Plotting s-coordinates", &(mesh->plotS),&Nrows,&Ncols);
-  mesh->plotNp = Nrows;
-
-  readDfloatArray(fp, "Plotting Interpolation Matrix", &(mesh->plotInterp),&Nrows,&Ncols);
-  readIntArray   (fp, "Plotting triangulation", &(mesh->plotEToV), &Nrows, &Ncols);
-  mesh->plotNelements = Nrows;
-  mesh->plotNverts = Ncols;
-
-  /* Quadrature data */
-  readDfloatArray(fp, "Quadrature r-coordinates", &(mesh->cubr),&Nrows,&Ncols);
-  readDfloatArray(fp, "Quadrature weights", &(mesh->cubw),&Nrows,&Ncols);
-  mesh->cubNq = Nrows;
-  mesh->cubNp = mesh->cubNq * mesh->cubNq;
-
-  readDfloatArray(fp, "Quadrature Interpolation Matrix", &(mesh->cubInterp),&Nrows,&Ncols);
-  readDfloatArray(fp,
-                  "Quadrature Differentiation Interpolation Matrix",
-                  &(mesh->cubDiffInterp),
-                  &Nrows,
-                  &Ncols);
-  readDfloatArray(fp, "Quadrature Weak D Differentiation Matrix", &(mesh->cubDW),&Nrows,&Ncols);
-  readDfloatArray(fp, "Quadrature Projection Matrix", &(mesh->cubProject),&Nrows,&Ncols);
-
-  /* Cubature data */
-  // readDfloatArray(fp, "Cubature r-coordinates", &(mesh->cubr),&Nrows,&Ncols);
-  // readDfloatArray(fp, "Cubature s-coordinates", &(mesh->cubs),&Nrows,&Ncols);
-  // readDfloatArray(fp, "Cubature weights", &(mesh->cubw),&Nrows,&Ncols);
-  // mesh->cubNp = Nrows;
-
-  // readDfloatArray(fp, "Cubature Interpolation Matrix", &(mesh->cubInterp),&Nrows,&Ncols);
-  // readDfloatArray(fp, "Cubature Weak Dr Differentiation Matrix", &(mesh->cubDrW),&Nrows,&Ncols);
-  // readDfloatArray(fp, "Cubature Weak Ds Differentiation Matrix", &(mesh->cubDsW),&Nrows,&Ncols);
-  // readDfloatArray(fp, "Cubature Projection Matrix", &(mesh->cubProject),&Nrows,&Ncols);
-  // readDfloatArray(fp, "Cubature Surface Interpolation Matrix", &(mesh->intInterp),&Nrows,&Ncols);
-  // mesh->intNfp = Nrows/mesh->Nfaces; //number of interpolation points per face
-
-  // readDfloatArray(fp, "Cubature Surface Lift Matrix", &(mesh->intLIFT),&Nrows,&Ncols);
-
-  mesh->max_EL_nnz = 0;
-  mesh->intNfp = 0;
-
-  /* C0 patch data */
-  readDfloatArray(fp, "C0 overlapping patch forward matrix", &(mesh->oasForward), &Nrows, &Ncols);
-  readDfloatArray(fp, "C0 overlapping patch diagonal scaling", &(mesh->oasDiagOp), &Nrows, &Ncols);
-  readDfloatArray(fp, "C0 overlapping patch backward matrix", &(mesh->oasBack), &Nrows, &Ncols);
-  /* IPDG patch data */
-  readDfloatArray(fp, "IPDG overlapping patch forward matrix", &(mesh->oasForwardDg), &Nrows,
-                  &Ncols);
-  readDfloatArray(fp,
-                  "IPDG overlapping patch diagonal scaling",
-                  &(mesh->oasDiagOpDg),
-                  &Nrows,
-                  &Ncols);
-  readDfloatArray(fp, "IPDG overlapping patch backward matrix", &(mesh->oasBackDg), &Nrows, &Ncols);
-  mesh->NpP = Nrows; //overlapping patch size
-
-  readIntArray   (fp, "SEMFEM reference mesh", &(mesh->FEMEToV), &Nrows, &Ncols);
-  mesh->NelFEM = Nrows;
-  mesh->NpFEM = mesh->Np;
-
-  fclose(fp);
-
-  // find node indices of vertex nodes
-  dfloat NODETOL = 1e-6;
-  mesh->vertexNodes = (int*) calloc(mesh->Nverts, sizeof(int));
-  for(int n = 0; n < mesh->Np; ++n) {
-    if( (mesh->r[n] + 1) * (mesh->r[n] + 1) + (mesh->s[n] + 1) * (mesh->s[n] + 1) < NODETOL)
-      mesh->vertexNodes[0] = n;
-    if( (mesh->r[n] - 1) * (mesh->r[n] - 1) + (mesh->s[n] + 1) * (mesh->s[n] + 1) < NODETOL)
-      mesh->vertexNodes[1] = n;
-    if( (mesh->r[n] - 1) * (mesh->r[n] - 1) + (mesh->s[n] - 1) * (mesh->s[n] - 1) < NODETOL)
-      mesh->vertexNodes[2] = n;
-    if( (mesh->r[n] + 1) * (mesh->r[n] + 1) + (mesh->s[n] - 1) * (mesh->s[n] - 1) < NODETOL)
-      mesh->vertexNodes[3] = n;
-  }
-}
diff --git a/src/libP/src/meshLoadReferenceNodesTet3D.c b/src/libP/src/meshLoadReferenceNodesTet3D.c
deleted file mode 100644
index 74ee065e2..000000000
--- a/src/libP/src/meshLoadReferenceNodesTet3D.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-void meshLoadReferenceNodesTet3D(mesh3D* mesh, int N)
-{
-  char fname[BUFSIZ];
-  sprintf(fname, DHOLMES "/nodes/tetN%02d.dat", N);
-
-  FILE* fp = fopen(fname, "r");
-
-  if (!fp) {
-    printf("ERROR: Cannot open file: '%s'\n", fname);
-    exit(-1);
-  }
-
-  mesh->N = N;
-  mesh->Np = ((N + 1) * (N + 2) * (N + 3)) / 6;
-  mesh->Nfp = ((N + 1) * (N + 2)) / 2;
-
-  int Nrows, Ncols;
-
-  /* Nodal Data */
-  readDfloatArray(fp, "Nodal r-coordinates", &(mesh->r),&Nrows,&Ncols);
-  readDfloatArray(fp, "Nodal s-coordinates", &(mesh->s),&Nrows,&Ncols);
-  readDfloatArray(fp, "Nodal t-coordinates", &(mesh->t),&Nrows,&Ncols);
-  readDfloatArray(fp, "Nodal Dr differentiation matrix", &(mesh->Dr), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Ds differentiation matrix", &(mesh->Ds), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Dt differentiation matrix", &(mesh->Dt), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Mass Matrix", &(mesh->MM), &Nrows, &Ncols);
-  readIntArray   (fp, "Nodal Face nodes", &(mesh->faceNodes), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Lift Matrix", &(mesh->LIFT), &Nrows, &Ncols);
-  //readIntArray   (fp, "Nodal rotation permutations", &(mesh->rmapP), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal degree raise matrix", &(mesh->interpRaise), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal degree lower matrix", &(mesh->interpLower), &Nrows, &Ncols);
-
-  /* Plotting data */
-  readDfloatArray(fp, "Plotting r-coordinates", &(mesh->plotR),&Nrows,&Ncols);
-  readDfloatArray(fp, "Plotting s-coordinates", &(mesh->plotS),&Nrows,&Ncols);
-  readDfloatArray(fp, "Plotting t-coordinates", &(mesh->plotT),&Nrows,&Ncols);
-  mesh->plotNp = Nrows;
-
-  readDfloatArray(fp, "Plotting Interpolation Matrix", &(mesh->plotInterp),&Nrows,&Ncols);
-  readIntArray   (fp, "Plotting triangulation", &(mesh->plotEToV), &Nrows, &Ncols);
-  mesh->plotNelements = Nrows;
-  mesh->plotNverts = Ncols;
-
-  readIntArray(fp,"Contour plot EToV", &(mesh->contourEToV), &Nrows, &Ncols);
-  readDfloatArray(fp,"Contour plot VX", &(mesh->contourVX), &Nrows, &Ncols);
-  readDfloatArray(fp,"Contour plot VY", &(mesh->contourVY), &Nrows, &Ncols);
-  readDfloatArray(fp,"Contour plot VZ", &(mesh->contourVZ), &Nrows, &Ncols);
-
-  readDfloatArray(fp, "Contour plot Interpolation",&(mesh->contourInterp), &Nrows, &Ncols);
-  readDfloatArray(fp, "Contour plot Linear Interpolation",&(mesh->contourInterp1), &Nrows, &Ncols);
-  readDfloatArray(fp, "Contour plot Filter",&(mesh->contourFilter), &Nrows, &Ncols);
-
-  /* Cubature data */
-  if (N < 7) {
-    readDfloatArray(fp, "Cubature r-coordinates", &(mesh->cubr),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature s-coordinates", &(mesh->cubs),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature t-coordinates", &(mesh->cubt),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature weights", &(mesh->cubw),&Nrows,&Ncols);
-    mesh->cubNp = Nrows;
-
-    readDfloatArray(fp, "Cubature Interpolation Matrix", &(mesh->cubInterp),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature Weak Dr Differentiation Matrix", &(mesh->cubDrW),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature Weak Ds Differentiation Matrix", &(mesh->cubDsW),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature Weak Dt Differentiation Matrix", &(mesh->cubDtW),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature Projection Matrix", &(mesh->cubProject),&Nrows,&Ncols);
-    readDfloatArray(fp, "Cubature Surface Interpolation Matrix", &(mesh->intInterp),&Nrows,&Ncols);
-    mesh->intNfp = Nrows / mesh->Nfaces; //number of interpolation points per face
-
-    readDfloatArray(fp, "Cubature Surface Lift Matrix", &(mesh->intLIFT),&Nrows,&Ncols);
-  }
-
-  /* Bernstein-Bezier data */
-  readDfloatArray(fp, "Bernstein-Bezier Vandermonde Matrix", &(mesh->VB),&Nrows,&Ncols);
-  readDfloatArray(fp, "Bernstein-Bezier Inverse Vandermonde Matrix", &(mesh->invVB),&Nrows,&Ncols);
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D0 differentiation ids",
-                  &(mesh->D0ids),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 4
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D1 differentiation ids",
-                  &(mesh->D1ids),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 4
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D2 differentiation ids",
-                  &(mesh->D2ids),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 4
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D3 differentiation ids",
-                  &(mesh->D3ids),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 4
-  readDfloatArray(fp,
-                  "Bernstein-Bezier sparse D differentiation values",
-                  &(mesh->Dvals),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 4
-
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D0T transpose differentiation ids",
-                  &(mesh->D0Tids),
-                  &Nrows,
-                  &Ncols);                                                                                            //Ncols should be 4
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D1T transpose differentiation ids",
-                  &(mesh->D1Tids),
-                  &Nrows,
-                  &Ncols);                                                                                            //Ncols should be 4
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D2T transpose differentiation ids",
-                  &(mesh->D2Tids),
-                  &Nrows,
-                  &Ncols);                                                                                            //Ncols should be 4
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D3T transpose differentiation ids",
-                  &(mesh->D3Tids),
-                  &Nrows,
-                  &Ncols);                                                                                            //Ncols should be 4
-  readDfloatArray(fp,
-                  "Bernstein-Bezier sparse DT transpose differentiation values",
-                  &(mesh->DTvals),
-                  &Nrows,
-                  &Ncols);                                                                                            //Ncols should be 4
-
-  readIntArray   (fp, "Bernstein-Bezier L0 Matrix ids", &(mesh->L0ids), &Nrows, &Ncols);
-  readDfloatArray(fp, "Bernstein-Bezier L0 Matrix values", &(mesh->L0vals), &Nrows, &Ncols); //Ncols should be 7
-  readIntArray   (fp, "Bernstein-Bezier EL lift ids", &(mesh->ELids), &Nrows, &Ncols);
-  readDfloatArray(fp, "Bernstein-Bezier EL lift values", &(mesh->ELvals), &Nrows, &Ncols);
-  mesh->max_EL_nnz = Ncols;
-
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse 2D degree raise ids",
-                  &(mesh->BBRaiseids),
-                  &Nrows,
-                  &Ncols);                                                                                     //Ncols should be 3
-  readDfloatArray(fp,
-                  "Bernstein-Bezier sparse 2D degree raise values",
-                  &(mesh->BBRaiseVals),
-                  &Nrows,
-                  &Ncols);                                                                                     //Ncols should be 3
-  readDfloatArray(fp,
-                  "Bernstein-Bezier sparse 2D degree lower matrix",
-                  &(mesh->BBLower),
-                  &Nrows,
-                  &Ncols);
-
-  /* IPDG patch data */
-  readDfloatArray(fp, "IPDG overlapping patch forward matrix", &(mesh->oasForwardDg), &Nrows,
-                  &Ncols);
-  readDfloatArray(fp,
-                  "IPDG overlapping patch diagonal scaling",
-                  &(mesh->oasDiagOpDg),
-                  &Nrows,
-                  &Ncols);
-  readDfloatArray(fp, "IPDG overlapping patch backward matrix", &(mesh->oasBackDg), &Nrows, &Ncols);
-  mesh->NpP = Nrows; //overlapping patch size
-
-  /* SEMFEM data */
-  readDfloatArray(fp, "SEMFEM r-coordinates", &(mesh->rFEM),&Nrows,&Ncols);
-  readDfloatArray(fp, "SEMFEM s-coordinates", &(mesh->sFEM),&Nrows,&Ncols);
-  readDfloatArray(fp, "SEMFEM t-coordinates", &(mesh->tFEM),&Nrows,&Ncols);
-  mesh->NpFEM = Nrows;
-
-  readIntArray   (fp, "SEMFEM reference mesh", &(mesh->FEMEToV), &Nrows, &Ncols);
-  mesh->NelFEM = Nrows;
-
-  readDfloatArray(fp, "SEMFEM interpolation matrix", &(mesh->SEMFEMInterp),&Nrows,&Ncols);
-
-  fclose(fp);
-
-  // find node indices of vertex nodes
-  dfloat NODETOL = 1e-6;
-  mesh->vertexNodes = (int*) calloc(mesh->Nverts, sizeof(int));
-  for(int n = 0; n < mesh->Np; ++n) {
-    if( (mesh->r[n] + 1) * (mesh->r[n] + 1) + (mesh->s[n] + 1) * (mesh->s[n] + 1) +
-        (mesh->t[n] + 1) * (mesh->t[n] + 1) < NODETOL)
-      mesh->vertexNodes[0] = n;
-    if( (mesh->r[n] - 1) * (mesh->r[n] - 1) + (mesh->s[n] + 1) * (mesh->s[n] + 1) +
-        (mesh->t[n] + 1) * (mesh->t[n] + 1) < NODETOL)
-      mesh->vertexNodes[1] = n;
-    if( (mesh->r[n] + 1) * (mesh->r[n] + 1) + (mesh->s[n] - 1) * (mesh->s[n] - 1) +
-        (mesh->t[n] + 1) * (mesh->t[n] + 1) < NODETOL)
-      mesh->vertexNodes[2] = n;
-    if( (mesh->r[n] + 1) * (mesh->r[n] + 1) + (mesh->s[n] + 1) * (mesh->s[n] + 1) +
-        (mesh->t[n] - 1) * (mesh->t[n] - 1) < NODETOL)
-      mesh->vertexNodes[3] = n;
-  }
-}
diff --git a/src/libP/src/meshLoadReferenceNodesTri2D.c b/src/libP/src/meshLoadReferenceNodesTri2D.c
deleted file mode 100644
index a3433e6a6..000000000
--- a/src/libP/src/meshLoadReferenceNodesTri2D.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-void meshLoadReferenceNodesTri2D(mesh2D* mesh, int N)
-{
-  char fname[BUFSIZ];
-  sprintf(fname, DHOLMES "/nodes/triangleN%02d.dat", N);
-
-  FILE* fp = fopen(fname, "r");
-
-  if (!fp) {
-    printf("ERROR: Cannot open file: '%s'\n", fname);
-    exit(-1);
-  }
-
-  mesh->N = N;
-  mesh->Nfp = N + 1;
-  mesh->Np = (N + 1) * (N + 2) / 2;
-
-  int Nrows, Ncols;
-
-  /* Nodal Data */
-  readDfloatArray(fp, "Nodal r-coordinates", &(mesh->r),&Nrows,&Ncols);
-  readDfloatArray(fp, "Nodal s-coordinates", &(mesh->s),&Nrows,&Ncols);
-  readDfloatArray(fp, "Nodal Dr differentiation matrix", &(mesh->Dr), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Ds differentiation matrix", &(mesh->Ds), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Mass Matrix", &(mesh->MM), &Nrows, &Ncols);
-  readIntArray   (fp, "Nodal Face nodes", &(mesh->faceNodes), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal Lift Matrix", &(mesh->LIFT), &Nrows, &Ncols);
-  readIntArray   (fp, "Nodal rotation permutations", &(mesh->rmapP), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal degree raise matrix", &(mesh->interpRaise), &Nrows, &Ncols);
-  readDfloatArray(fp, "Nodal degree lower matrix", &(mesh->interpLower), &Nrows, &Ncols);
-
-  /* Plotting data */
-  readDfloatArray(fp, "Plotting r-coordinates", &(mesh->plotR),&Nrows,&Ncols);
-  readDfloatArray(fp, "Plotting s-coordinates", &(mesh->plotS),&Nrows,&Ncols);
-  mesh->plotNp = Nrows;
-
-  readDfloatArray(fp, "Plotting Interpolation Matrix", &(mesh->plotInterp),&Nrows,&Ncols);
-  readIntArray   (fp, "Plotting triangulation", &(mesh->plotEToV), &Nrows, &Ncols);
-  mesh->plotNelements = Nrows;
-  mesh->plotNverts = Ncols;
-
-  /* Cubature data */
-  readDfloatArray(fp, "Cubature r-coordinates", &(mesh->cubr),&Nrows,&Ncols);
-  readDfloatArray(fp, "Cubature s-coordinates", &(mesh->cubs),&Nrows,&Ncols);
-  readDfloatArray(fp, "Cubature weights", &(mesh->cubw),&Nrows,&Ncols);
-  mesh->cubNp = Nrows;
-
-  readDfloatArray(fp, "Cubature Interpolation Matrix", &(mesh->cubInterp),&Nrows,&Ncols);
-  readDfloatArray(fp, "Cubature Weak Dr Differentiation Matrix", &(mesh->cubDrW),&Nrows,&Ncols);
-  readDfloatArray(fp, "Cubature Weak Ds Differentiation Matrix", &(mesh->cubDsW),&Nrows,&Ncols);
-  readDfloatArray(fp, "Cubature Projection Matrix", &(mesh->cubProject),&Nrows,&Ncols);
-  readDfloatArray(fp, "Cubature Surface Interpolation Matrix", &(mesh->intInterp),&Nrows,&Ncols);
-  mesh->intNfp = Nrows / mesh->Nfaces; //number of interpolation points per face
-  readDfloatArray(fp, "Cubature Surface Lift Matrix", &(mesh->intLIFT),&Nrows,&Ncols);
-
-  /* Bernstein-Bezier data */
-  readDfloatArray(fp, "Bernstein-Bezier Vandermonde Matrix", &(mesh->VB),&Nrows,&Ncols);
-  readDfloatArray(fp, "Bernstein-Bezier Inverse Vandermonde Matrix", &(mesh->invVB),&Nrows,&Ncols);
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D1 differentiation ids",
-                  &(mesh->D1ids),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 3
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D2 differentiation ids",
-                  &(mesh->D2ids),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 3
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse D3 differentiation ids",
-                  &(mesh->D3ids),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 3
-  readDfloatArray(fp,
-                  "Bernstein-Bezier sparse D differentiation values",
-                  &(mesh->Dvals),
-                  &Nrows,
-                  &Ncols);                                                                                //Ncols should be 3
-  readDfloatArray(fp, "Cubature Bernstein-Bezier Interpolation Matrix", &(mesh->VBq), &Nrows,
-                  &Ncols);
-  readDfloatArray(fp, "Cubature Bernstein-Bezier Projection Matrix", &(mesh->PBq), &Nrows, &Ncols);
-  readDfloatArray(fp, "Bernstein-Bezier L0 Matrix values", &(mesh->L0vals), &Nrows, &Ncols); //Ncols should be 3 (tridiagonal)
-  readIntArray   (fp, "Bernstein-Bezier EL lift ids", &(mesh->ELids), &Nrows, &Ncols);
-  readDfloatArray(fp, "Bernstein-Bezier EL lift values", &(mesh->ELvals), &Nrows, &Ncols);
-  mesh->max_EL_nnz = Ncols;
-
-  readIntArray   (fp,
-                  "Bernstein-Bezier sparse 1D degree raise ids",
-                  &(mesh->BBRaiseids),
-                  &Nrows,
-                  &Ncols);                                                                                     //Ncols should be 2
-  readDfloatArray(fp,
-                  "Bernstein-Bezier sparse 1D degree raise values",
-                  &(mesh->BBRaiseVals),
-                  &Nrows,
-                  &Ncols);                                                                                     //Ncols should be 2
-  readDfloatArray(fp,
-                  "Bernstein-Bezier sparse 1D degree lower matrix",
-                  &(mesh->BBLower),
-                  &Nrows,
-                  &Ncols);
-
-  /* IPDG patch data */
-  readDfloatArray(fp, "IPDG overlapping patch forward matrix", &(mesh->oasForwardDg), &Nrows,
-                  &Ncols);
-  readDfloatArray(fp,
-                  "IPDG overlapping patch diagonal scaling",
-                  &(mesh->oasDiagOpDg),
-                  &Nrows,
-                  &Ncols);
-  readDfloatArray(fp, "IPDG overlapping patch backward matrix", &(mesh->oasBackDg), &Nrows, &Ncols);
-  mesh->NpP = Nrows; //overlapping patch size
-
-  readDfloatArray(fp, "IPDG full reference patch inverse matrix", &(mesh->invAP), &Nrows, &Ncols);
-
-  if (N < 13) { //data only generated for N<13
-    /* SEMFEM data */
-    readDfloatArray(fp, "SEMFEM r-coordinates", &(mesh->rFEM),&Nrows,&Ncols);
-    readDfloatArray(fp, "SEMFEM s-coordinates", &(mesh->sFEM),&Nrows,&Ncols);
-    mesh->NpFEM = Nrows;
-
-    readIntArray   (fp, "SEMFEM reference mesh", &(mesh->FEMEToV), &Nrows, &Ncols);
-    mesh->NelFEM = Nrows;
-
-    readDfloatArray(fp, "SEMFEM interpolation matrix", &(mesh->SEMFEMInterp),&Nrows,&Ncols);
-  }
-
-  /* Sparse basis data */
-  readDfloatArray(fp, "Sparse basis Vandermonde", &(mesh->sparseV), &Nrows, &Ncols);
-  readDfloatArray(fp, "Sparse basis mass matrix", &(mesh->sparseMM), &Nrows, &Ncols);
-  readIntArray   (fp, "Sparse basis face modes", &(mesh->FaceModes), &Nrows, &Ncols);
-  readIntArray   (fp, "Sparse differentiation matrix ids", &(mesh->sparseStackedNZ), &Nrows,
-                  &Ncols);
-  readDfloatArray(fp, "Sparse differentiation Srr values", &(mesh->sparseSrrT), &Nrows, &Ncols);
-  readDfloatArray(fp, "Sparse differentiation Srs values", &(mesh->sparseSrsT), &Nrows, &Ncols);
-  readDfloatArray(fp, "Sparse differentiation Sss values", &(mesh->sparseSssT), &Nrows, &Ncols);
-  mesh->SparseNnzPerRow = Nrows;
-
-  fclose(fp);
-
-  // find node indices of vertex nodes
-  dfloat NODETOL = 1e-6;
-  mesh->vertexNodes = (int*) calloc(mesh->Nverts, sizeof(int));
-  for(int n = 0; n < mesh->Np; ++n) {
-    if( (mesh->r[n] + 1) * (mesh->r[n] + 1) + (mesh->s[n] + 1) * (mesh->s[n] + 1) < NODETOL)
-      mesh->vertexNodes[0] = n;
-    if( (mesh->r[n] - 1) * (mesh->r[n] - 1) + (mesh->s[n] + 1) * (mesh->s[n] + 1) < NODETOL)
-      mesh->vertexNodes[1] = n;
-    if( (mesh->r[n] + 1) * (mesh->r[n] + 1) + (mesh->s[n] - 1) * (mesh->s[n] - 1) < NODETOL)
-      mesh->vertexNodes[2] = n;
-  }
-}
diff --git a/src/libP/src/meshMRABSetup2D.c b/src/libP/src/meshMRABSetup2D.c
deleted file mode 100644
index 54dce7ee1..000000000
--- a/src/libP/src/meshMRABSetup2D.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh2D.h"
-
-dfloat meshMRABSetup2D(mesh2D* mesh, dfloat* EToDT, int maxLevels, dfloat finalTime)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  //find global min and max dt
-  dfloat dtmin, dtmax;
-  dtmin = EToDT[0];
-  dtmax = EToDT[0];
-  for (dlong e = 1; e < mesh->Nelements; e++) {
-    dtmin = mymin(dtmin,EToDT[e]);
-    dtmax = mymax(dtmax,EToDT[e]);
-  }
-  dfloat dtGmin, dtGmax;
-  MPI_Allreduce(&dtmin, &dtGmin, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&dtmax, &dtGmax, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-
-  if (rank == 0) {
-    printf("----------- MRAB Setup ------------------------------\n");
-    printf("dtmin = %g, dtmax = %g\n", dtGmin, dtGmax);
-  }
-
-  //number of levels
-  mesh->MRABNlevels = mymin(floor(log2(dtGmax / dtGmin)) + 1,maxLevels);
-
-  //shift dtGmin so that we have an integer number of steps
-  int NtimeSteps = finalTime / (pow(2,mesh->MRABNlevels - 1) * dtGmin);
-  dtGmin = finalTime / (pow(2,mesh->MRABNlevels - 1) * NtimeSteps);
-
-  //compute the level of each element
-  mesh->MRABlevel = (dlong*) calloc(mesh->Nelements + mesh->totalHaloPairs,sizeof(int));
-  int* MRABsendBuffer;
-  for(int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    dfloat dtlev = dtGmin * pow(2,lev);
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      if(EToDT[e] >= dtlev)
-        mesh->MRABlevel[e] = lev;
-  }
-
-  //enforce one level difference between neighbours
-  if (mesh->totalHaloPairs)
-    MRABsendBuffer = (int*) calloc(mesh->totalHaloPairs,sizeof(int));
-
-  for (int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    if (mesh->totalHaloPairs)
-      meshHaloExchange(mesh,
-                       sizeof(int),
-                       mesh->MRABlevel,
-                       MRABsendBuffer,
-                       mesh->MRABlevel + mesh->Nelements);
-    for (dlong e = 0; e < mesh->Nelements; e++)
-      if (mesh->MRABlevel[e] > lev + 1) { //find elements at least 2 levels higher than lev
-        for (int f = 0; f < mesh->Nfaces; f++) { //check for a level lev neighbour
-          int eP = mesh->EToE[mesh->Nfaces * e + f];
-          if (eP > -1)
-            if (mesh->MRABlevel[eP] == lev)
-              mesh->MRABlevel[e] = lev + 1;  //if one exists, lower the level of this element to lev-1
-        }
-      }
-  }
-
-  if (mesh->totalHaloPairs) free(MRABsendBuffer);
-
-  //this could change the number of MRAB levels there are, so find the new max level
-  mesh->MRABNlevels = 0;
-  for (dlong e = 0; e < mesh->Nelements; e++)
-    mesh->MRABNlevels =
-      (mesh->MRABlevel[e] > mesh->MRABNlevels) ? mesh->MRABlevel[e] : mesh->MRABNlevels;
-  mesh->MRABNlevels++;
-  int localNlevels = mesh->MRABNlevels;
-  MPI_Allreduce(&localNlevels, &(mesh->MRABNlevels), 1, MPI_INT, MPI_MAX, mesh->comm);
-  // mesh->NtimeSteps = mesh->finalTime/(pow(2,mesh->MRABNlevels-1)*dtGmin);
-
-  printf("MRABNlevels %d \n", mesh->MRABNlevels);
-
-  //now we need to perform a weighted repartitioning of the mesh to optimize MRAB
-  if (size > 1) {
-    //for the moment, just weigth the elements by the number or RHS evals per MRAB step
-    // TODO: We should make this an input parameter later to handle other problems.
-    dfloat* weights = (dfloat*) calloc(mesh->Nelements,sizeof(dfloat));
-    for (dlong e = 0; e < mesh->Nelements; e++)
-      weights[e] = pow(2,mesh->MRABNlevels - mesh->MRABlevel[e]);
-
-    if (rank == 0) printf("Repartitioning for MRAB...\n");
-    meshMRABWeightedPartition2D(mesh,weights,mesh->MRABNlevels, mesh->MRABlevel);
-  }
-
-  //construct element and halo lists
-  mesh->MRABelementIds = (int**) calloc(mesh->MRABNlevels,sizeof(int*));
-  mesh->MRABhaloIds = (int**) calloc(mesh->MRABNlevels,sizeof(int*));
-
-  mesh->MRABNelements = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-  mesh->MRABNhaloElements = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-
-  for (dlong e = 0; e < mesh->Nelements; e++) {
-    mesh->MRABNelements[mesh->MRABlevel[e]]++;
-    for (int f = 0; f < mesh->Nfaces; f++) {
-      int eP = mesh->EToE[mesh->Nfaces * e + f];
-      if (eP > -1) {
-        if (mesh->MRABlevel[eP] == mesh->MRABlevel[e] - 1) {//check for a level lev-1 neighbour
-          mesh->MRABNhaloElements[mesh->MRABlevel[e]]++;
-          break;
-        }
-      }
-    }
-  }
-
-  for (int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    mesh->MRABelementIds[lev] = (int*) calloc(mesh->MRABNelements[lev],sizeof(int));
-    mesh->MRABhaloIds[lev] = (int*) calloc(mesh->MRABNhaloElements[lev],sizeof(int));
-    int cnt  = 0;
-    int cnt2 = 0;
-    for (dlong e = 0; e < mesh->Nelements; e++)
-      if (mesh->MRABlevel[e] == lev) {
-        mesh->MRABelementIds[lev][cnt++] = e;
-
-        for (int f = 0; f < mesh->Nfaces; f++) {
-          dlong eP = mesh->EToE[mesh->Nfaces * e + f];
-          if (eP > -1) {
-            if (mesh->MRABlevel[eP] == lev - 1) {//check for a level lev-1 neighbour
-              mesh->MRABhaloIds[lev][cnt2++] = e;
-              break;
-            }
-          }
-        }
-      }
-  }
-
-  //offset index
-  mesh->MRABshiftIndex = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-
-  if (rank == 0) {
-    printf("| Rank | Level | Nelements | Level/Level Boundary Elements | \n");
-    printf("------------------------------------------------------------\n");
-  }
-  MPI_Barrier(mesh->comm);
-  for (int r = 0; r < size; r++) {
-    if (r == rank) {
-      for (int lev = 0; lev < mesh->MRABNlevels; lev++)
-        printf("|  %d,    %d,      %d,        %d     \n",
-               rank,
-               lev,
-               mesh->MRABNelements[lev],
-               mesh->MRABNhaloElements[lev]);
-      printf("------------------------------------------------------------\n");
-    }
-    MPI_Barrier(mesh->comm);
-  }
-  MPI_Barrier(mesh->comm);
-
-  return dtGmin;
-}
-
-#if 0
-
-void meshMRABSetup2D(mesh2D* mesh, dfloat* EToDT, int maxLevels)
-{
-  int rank, size;
-  MPI_Comm_rank(mesh->comm, &rank);
-  MPI_Comm_size(mesh->comm, &size);
-
-  //find global min and max dt
-  dfloat dtmin, dtmax;
-  dtmin = EToDT[0];
-  dtmax = EToDT[0];
-  for (int e = 1; e < mesh->Nelements; e++) {
-    dtmin = mymin(dtmin,EToDT[e]);
-    dtmax = mymax(dtmax,EToDT[e]);
-  }
-  dfloat dtGmin, dtGmax;
-  MPI_Allreduce(&dtmin, &dtGmin, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&dtmax, &dtGmax, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-
-  if (rank == 0) {
-    printf("----------- MRAB Setup ------------------------------\n");
-    printf("dtmin = %g, dtmax = %g\n", dtGmin, dtGmax);
-  }
-
-  //number of levels
-  mesh->MRABNlevels = mymin(floor(log2(dtGmax / dtGmin)) + 1,maxLevels);
-
-  //shift dtGmin so that we have an integer number of steps
-  mesh->NtimeSteps = mesh->finalTime / (pow(2,mesh->MRABNlevels - 1) * dtGmin);
-  dtGmin = mesh->finalTime / (pow(2,mesh->MRABNlevels - 1) * mesh->NtimeSteps);
-
-  mesh->dt = dtGmin;
-
-  //compute the level of each element
-  mesh->MRABlevel = (int*) calloc(mesh->Nelements + mesh->totalHaloPairs,sizeof(int));
-  int* MRABsendBuffer;
-  for(int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    dfloat dtlev = dtGmin * pow(2,lev);
-    for(int e = 0; e < mesh->Nelements; ++e)
-      if(EToDT[e] >= dtlev)
-        mesh->MRABlevel[e] = lev;
-  }
-
-  //enforce one level difference between neighbours
-  if (mesh->totalHaloPairs)
-    MRABsendBuffer = (int*) calloc(mesh->totalHaloPairs,sizeof(int));
-
-  for (int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    if (mesh->totalHaloPairs)
-      meshHaloExchange(mesh,
-                       sizeof(int),
-                       mesh->MRABlevel,
-                       MRABsendBuffer,
-                       mesh->MRABlevel + mesh->Nelements);
-    for (int e = 0; e < mesh->Nelements; e++)
-      if (mesh->MRABlevel[e] > lev + 1) { //find elements at least 2 levels higher than lev
-        for (int f = 0; f < mesh->Nfaces; f++) { //check for a level lev neighbour
-          int eP = mesh->EToE[mesh->Nfaces * e + f];
-          if (eP > -1)
-            if (mesh->MRABlevel[eP] == lev)
-              mesh->MRABlevel[e] = lev + 1;  //if one exists, lower the level of this element to lev-1
-        }
-      }
-  }
-
-  if (mesh->totalHaloPairs) free(MRABsendBuffer);
-
-  //this could change the number of MRAB levels there are, so find the new max level
-  mesh->MRABNlevels = 0;
-  for (int e = 0; e < mesh->Nelements; e++)
-    mesh->MRABNlevels =
-      (mesh->MRABlevel[e] > mesh->MRABNlevels) ? mesh->MRABlevel[e] : mesh->MRABNlevels;
-  mesh->MRABNlevels++;
-  int localNlevels = mesh->MRABNlevels;
-  MPI_Allreduce(&localNlevels, &(mesh->MRABNlevels), 1, MPI_INT, MPI_MAX, mesh->comm);
-  mesh->NtimeSteps = mesh->finalTime / (pow(2,mesh->MRABNlevels - 1) * dtGmin);
-
-  printf("MRABNlevels %d \n", mesh->MRABNlevels);
-
-  //now we need to perform a weighted repartitioning of the mesh to optimize MRAB
-  if (size > 1) {
-    //for the moment, just weigth the elements by the number or RHS evals per MRAB step
-    // TODO: We should make this an input parameter later to handle other problems.
-    dfloat* weights = (dfloat*) calloc(mesh->Nelements,sizeof(dfloat));
-    for (int e = 0; e < mesh->Nelements; e++)
-      weights[e] = pow(2,mesh->MRABNlevels - mesh->MRABlevel[e]);
-
-    if (rank == 0) printf("Repartitioning for MRAB...\n");
-    meshMRABWeightedPartitionTri2D(mesh,weights,mesh->MRABNlevels, mesh->MRABlevel);
-  }
-
-  //construct element and halo lists
-  mesh->MRABelementIds = (int**) calloc(mesh->MRABNlevels,sizeof(int*));
-  mesh->MRABhaloIds = (int**) calloc(mesh->MRABNlevels,sizeof(int*));
-
-  mesh->MRABNelements = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-  mesh->MRABNhaloElements = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-
-  for (int e = 0; e < mesh->Nelements; e++) {
-    mesh->MRABNelements[mesh->MRABlevel[e]]++;
-    for (int f = 0; f < mesh->Nfaces; f++) {
-      int eP = mesh->EToE[mesh->Nfaces * e + f];
-      if (eP > -1) {
-        if (mesh->MRABlevel[eP] == mesh->MRABlevel[e] - 1) {//check for a level lev-1 neighbour
-          mesh->MRABNhaloElements[mesh->MRABlevel[e]]++;
-          break;
-        }
-      }
-    }
-  }
-
-  for (int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    mesh->MRABelementIds[lev] = (int*) calloc(mesh->MRABNelements[lev],sizeof(int));
-    mesh->MRABhaloIds[lev] = (int*) calloc(mesh->MRABNhaloElements[lev],sizeof(int));
-    int cnt  = 0;
-    int cnt2 = 0;
-    for (int e = 0; e < mesh->Nelements; e++)
-      if (mesh->MRABlevel[e] == lev) {
-        mesh->MRABelementIds[lev][cnt++] = e;
-
-        for (int f = 0; f < mesh->Nfaces; f++) {
-          int eP = mesh->EToE[mesh->Nfaces * e + f];
-          if (eP > -1) {
-            if (mesh->MRABlevel[eP] == lev - 1) {//check for a level lev-1 neighbour
-              mesh->MRABhaloIds[lev][cnt2++] = e;
-              break;
-            }
-          }
-        }
-      }
-  }
-
-  //offset index
-  mesh->MRABshiftIndex = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-
-  if (rank == 0) {
-    printf("| Rank | Level | Nelements | Level/Level Boundary Elements | \n");
-    printf("------------------------------------------------------------\n");
-  }
-  MPI_Barrier(mesh->comm);
-  for (int r = 0; r < size; r++) {
-    if (r == rank) {
-      for (int lev = 0; lev < mesh->MRABNlevels; lev++)
-        printf("|  %d,    %d,      %d,        %d     \n",
-               rank,
-               lev,
-               mesh->MRABNelements[lev],
-               mesh->MRABNhaloElements[lev]);
-      printf("------------------------------------------------------------\n");
-    }
-    MPI_Barrier(mesh->comm);
-  }
-  MPI_Barrier(mesh->comm);
-}
-#endif
diff --git a/src/libP/src/meshMRABSetup3D.c b/src/libP/src/meshMRABSetup3D.c
deleted file mode 100644
index 1131959bd..000000000
--- a/src/libP/src/meshMRABSetup3D.c
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh3D.h"
-
-dfloat meshMRABSetup3D(mesh3D* mesh, dfloat* EToDT, int maxLevels, dfloat finalTime)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  //find global min and max dt
-  dfloat dtmin, dtmax;
-  dtmin = EToDT[0];
-  dtmax = EToDT[0];
-  for (dlong e = 1; e < mesh->Nelements; e++) {
-    dtmin = mymin(dtmin,EToDT[e]);
-    dtmax = mymax(dtmax,EToDT[e]);
-  }
-  dfloat dtGmin, dtGmax;
-  MPI_Allreduce(&dtmin, &dtGmin, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-  MPI_Allreduce(&dtmax, &dtGmax, 1, MPI_DFLOAT, MPI_MIN, mesh->comm);
-
-  if (rank == 0) {
-    printf("----------- MRAB Setup ------------------------------\n");
-    printf("dtmin = %g, dtmax = %g\n", dtGmin, dtGmax);
-  }
-
-  //number of levels
-  mesh->MRABNlevels = mymin(floor(log2(dtGmax / dtGmin)) + 1,maxLevels);
-
-  //shift dtGmin so that we have an integer number of steps
-  mesh->NtimeSteps = finalTime / (pow(2,mesh->MRABNlevels - 1) * dtGmin);
-  dtGmin = finalTime / (pow(2,mesh->MRABNlevels - 1) * mesh->NtimeSteps);
-
-  //compute the level of each element
-  mesh->MRABlevel = (dlong*) calloc(mesh->Nelements + mesh->totalHaloPairs,sizeof(int));
-  int* MRABsendBuffer = (int*) calloc(mesh->totalHaloPairs,sizeof(int));
-  for(int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    dfloat dtlev = dtGmin * pow(2,lev);
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      if(EToDT[e] >= dtlev)
-        mesh->MRABlevel[e] = lev;
-  }
-
-  //enforce one level difference between neighbours
-  for (int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    if (mesh->totalHaloPairs)
-      meshHaloExchange(mesh,
-                       sizeof(int),
-                       mesh->MRABlevel,
-                       MRABsendBuffer,
-                       mesh->MRABlevel + mesh->Nelements);
-    for (dlong e = 0; e < mesh->Nelements; e++)
-      if (mesh->MRABlevel[e] > lev + 1) { //find elements at least 2 levels higher than lev
-        for (int f = 0; f < mesh->Nfaces; f++) { //check for a level lev neighbour
-          dlong eP = mesh->EToE[mesh->Nfaces * e + f];
-          if (eP > -1)
-            if (mesh->MRABlevel[eP] == lev)
-              mesh->MRABlevel[e] = lev + 1;  //if one exists, lower the level of this element to lev-1
-        }
-      }
-  }
-  if (mesh->totalHaloPairs) free(MRABsendBuffer);
-
-  //this could change the number of MRAB levels there are, so find the new max level
-  mesh->MRABNlevels = 0;
-  for (dlong e = 0; e < mesh->Nelements; e++)
-    mesh->MRABNlevels =
-      (mesh->MRABlevel[e] > mesh->MRABNlevels) ? mesh->MRABlevel[e] : mesh->MRABNlevels;
-  mesh->MRABNlevels++;
-  int localNlevels = mesh->MRABNlevels;
-  MPI_Allreduce(&localNlevels, &(mesh->MRABNlevels), 1, MPI_INT, MPI_MAX, mesh->comm);
-  mesh->NtimeSteps = mesh->finalTime / (pow(2,mesh->MRABNlevels - 1) * dtGmin);
-
-  //now we need to perform a weighted repartitioning of the mesh to optimize MRAB
-  if (size > 1) {
-    //for the moment, just weigth the elements by the number or RHS evals per MRAB step
-    // TODO: We should make this an input parameter later to handle other problems.
-    dfloat* weights = (dfloat*) calloc(mesh->Nelements,sizeof(dfloat));
-    for (dlong e = 0; e < mesh->Nelements; e++)
-      weights[e] = pow(2,mesh->MRABNlevels - mesh->MRABlevel[e]);
-
-    if (rank == 0) printf("Repartitioning for MRAB...\n");
-    meshMRABWeightedPartition3D(mesh,weights,mesh->MRABNlevels, mesh->MRABlevel);
-  }
-
-  //construct element and halo lists
-  mesh->MRABelementIds = (int**) calloc(mesh->MRABNlevels,sizeof(int*));
-  mesh->MRABhaloIds = (int**) calloc(mesh->MRABNlevels,sizeof(int*));
-
-  mesh->MRABNelements = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-  mesh->MRABNhaloElements = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-
-  for (dlong e = 0; e < mesh->Nelements; e++) {
-    mesh->MRABNelements[mesh->MRABlevel[e]]++;
-    for (int f = 0; f < mesh->Nfaces; f++) {
-      dlong eP = mesh->EToE[mesh->Nfaces * e + f];
-      if (eP > -1) {
-        if (mesh->MRABlevel[eP] == mesh->MRABlevel[e] - 1) {//check for a level lev-1 neighbour
-          mesh->MRABNhaloElements[mesh->MRABlevel[e]]++;
-          break;
-        }
-      }
-    }
-  }
-
-  for (int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    mesh->MRABelementIds[lev] = (int*) calloc(mesh->MRABNelements[lev],sizeof(int));
-    mesh->MRABhaloIds[lev] = (int*) calloc(mesh->MRABNhaloElements[lev],sizeof(int));
-    int cnt  = 0;
-    int cnt2 = 0;
-    for (dlong e = 0; e < mesh->Nelements; e++)
-      if (mesh->MRABlevel[e] == lev) {
-        mesh->MRABelementIds[lev][cnt++] = e;
-
-        for (int f = 0; f < mesh->Nfaces; f++) {
-          dlong eP = mesh->EToE[mesh->Nfaces * e + f];
-          if (eP > -1) {
-            if (mesh->MRABlevel[eP] == lev - 1) {//check for a level lev-1 neighbour
-              mesh->MRABhaloIds[lev][cnt2++] = e;
-              break;
-            }
-          }
-        }
-      }
-  }
-
-  //offset index
-  mesh->MRABshiftIndex = (int*) calloc(mesh->MRABNlevels,sizeof(int));
-
-  if (rank == 0) {
-    printf("| Rank | Level | Nelements | Level/Level Boundary Elements | \n");
-    printf("------------------------------------------------------------\n");
-  }
-  MPI_Barrier(mesh->comm);
-  for (int r = 0; r < size; r++) {
-    if (r == rank) {
-      for (int lev = 0; lev < mesh->MRABNlevels; lev++)
-        printf("|  %d,    %d,      %d,        %d     \n",
-               rank,
-               lev,
-               mesh->MRABNelements[lev],
-               mesh->MRABNhaloElements[lev]);
-      printf("------------------------------------------------------------\n");
-    }
-    MPI_Barrier(mesh->comm);
-  }
-  MPI_Barrier(mesh->comm);
-
-  return dtGmin;
-}
diff --git a/src/libP/src/meshMRABWeightedPartition2D.c b/src/libP/src/meshMRABWeightedPartition2D.c
deleted file mode 100644
index 2d1e4e8e8..000000000
--- a/src/libP/src/meshMRABWeightedPartition2D.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh2D.h"
-
-typedef struct
-{
-  int id;
-  int level;
-  dfloat weight;
-
-  // 4 for maximum number of vertices per element in 2D
-  int v[4];
-  dfloat EX[4], EY[4];
-
-  int cRank;
-  int cId;
-  int type;
-} cElement_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-} cluster_t;
-
-void meshBuildMRABClusters2D(mesh2D* mesh,
-                             int lev,
-                             dfloat* weights,
-                             int* levels,
-                             int* Nclusters,
-                             cluster_t** clusters,
-                             int* Nelements,
-                             cElement_t** newElements);
-
-// geometric partition of clusters of elements in 2D mesh using Morton ordering + parallelSort
-dfloat meshClusteredGeometricPartition2D(mesh2D* mesh, int Nclusters, cluster_t* clusters,
-                                         int* Nelements, cElement_t** elements);
-
-/* ---------------------------------------------------------
-
-   This function is a bit spaghetti, but the general idea is
-   we cluster low-MRAB-level elements together along with a
-   halo and partition the mesh of clusters. This reduces the MPI
-   costs of communicating on the low levels.
-
-   The algorithm performs the following steps
-   - cluster elements of level lev or lower
-   - put clusters together a single 'owning' process
-   - sort the list of clusters using a space-filling curve
-   - partition the SFC between the processors, exchange the
-    elements along the processor boundaries to improve the
-    partitioning.
-   - If the resulting partition is acceptable, save it.
-   - If not, return to the last acceptable partition, and rerun
-    the mesh setup.
-
-   ------------------------------------------------------------ */
-
-void meshMRABWeightedPartition2D(mesh2D* mesh, dfloat* weights,
-                                 int numLevels, int* levels)
-{
-  const dfloat TOL = 0.8; //tolerance on what partitions are ruled 'acceptable'
-                          // min_{ranks} totalWeight > TOL*max_{ranks} totalWeight => accepted
-
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  int Nelements, Nclusters;
-
-  cElement_t* elements, * acceptedPartition;
-  cluster_t* clusters;
-
-  if (!levels) numLevels = 1;
-
-  //perform the first weigthed partitioning with no clustering
-  meshBuildMRABClusters2D(mesh, -1, weights, levels, &Nclusters, &clusters, &Nelements, &elements);
-  meshClusteredGeometricPartition2D(mesh, Nclusters, clusters, &Nelements, &elements);
-
-  //initialize the accepted partition
-  int acceptedNelements = Nelements;
-  acceptedPartition = elements;
-
-  for (int lev = 0; lev < 1; lev++) {
-    if (rank == 0) printf("Clustering level %d...", lev);
-    meshBuildMRABClusters2D(mesh, lev, weights, levels, &Nclusters, &clusters, &Nelements,
-                            &elements);
-    if (rank == 0) printf("done.\n");
-    dfloat partQuality = meshClusteredGeometricPartition2D(mesh,
-                                                           Nclusters,
-                                                           clusters,
-                                                           &Nelements,
-                                                           &elements);
-
-    if (partQuality > TOL) {
-      if (rank == 0) printf("Accepting level %d clustered partition...(quality = %g)\n",
-                            lev,
-                            partQuality);
-      free(acceptedPartition); //discard the old partition
-      acceptedNelements = Nelements;
-      acceptedPartition = elements; //good partition
-    } else {
-      if (rank == 0) printf("Regecting level %d clustered partition...(quality = %g)\n",
-                            lev,
-                            partQuality);
-      free(elements); //discard this partition
-      break;
-    }
-  }
-
-  //save this partition, and perform the mesh setup again.
-  mesh->Nelements = acceptedNelements;
-
-  mesh->EToV = (hlong*) realloc(mesh->EToV, mesh->Nelements * mesh->Nverts * sizeof(hlong));
-  mesh->EX = (dfloat*) realloc(mesh->EX, mesh->Nelements * mesh->Nverts * sizeof(dfloat));
-  mesh->EY = (dfloat*) realloc(mesh->EY, mesh->Nelements * mesh->Nverts * sizeof(dfloat));
-  mesh->elementInfo = (int*) realloc(mesh->elementInfo,mesh->Nelements * sizeof(int));
-  mesh->MRABlevel = (int*) realloc(mesh->MRABlevel,mesh->Nelements * sizeof(int));
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      mesh->EToV[e * mesh->Nverts + n] = acceptedPartition[e].v[n];
-      mesh->EX  [e * mesh->Nverts + n] = acceptedPartition[e].EX[n];
-      mesh->EY  [e * mesh->Nverts + n] = acceptedPartition[e].EY[n];
-    }
-    mesh->elementInfo[e] = acceptedPartition[e].type;
-    mesh->MRABlevel[e] = acceptedPartition[e].level;
-  }
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-  if(mesh->dim == 2 && mesh->Nverts == 3) { // Triangle
-    // compute physical (x,y) locations of the element nodes
-    meshPhysicalNodesTri2D(mesh);
-    // compute geometric factors
-    meshGeometricFactorsTri2D(mesh);
-  }else if(mesh->dim == 2 && mesh->Nverts == 4) {      // Quad2D
-    meshPhysicalNodesQuad2D(mesh);
-    meshGeometricFactorsQuad2D(mesh);
-  }
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  if(mesh->dim == 2 && mesh->Nverts == 3) { // Triangle
-    meshConnectFaceNodes2D(mesh);
-    meshSurfaceGeometricFactorsTri2D(mesh);
-  }else if(mesh->dim == 2 && mesh->Nverts == 4) {      // Quad2D
-    meshConnectFaceNodes2D(mesh);
-    meshSurfaceGeometricFactorsQuad2D(mesh);
-  }
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  if (mesh->totalHaloPairs) {
-    mesh->MRABlevel = (int*) realloc(mesh->MRABlevel,
-                                     (mesh->Nelements + mesh->totalHaloPairs) * sizeof(int));
-    int* MRABsendBuffer = (int*) calloc(mesh->totalHaloPairs,sizeof(int));
-    meshHaloExchange(mesh,
-                     sizeof(int),
-                     mesh->MRABlevel,
-                     MRABsendBuffer,
-                     mesh->MRABlevel + mesh->Nelements);
-    free(MRABsendBuffer);
-  }
-
-  free(acceptedPartition);
-}
diff --git a/src/libP/src/meshMRABWeightedPartition3D.c b/src/libP/src/meshMRABWeightedPartition3D.c
deleted file mode 100644
index fb03fa4bd..000000000
--- a/src/libP/src/meshMRABWeightedPartition3D.c
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh3D.h"
-
-typedef struct
-{
-  int id;
-  int level;
-  dfloat weight;
-
-  // 8 for maximum number of vertices per element in 3D
-  int v[8];
-  dfloat EX[8], EY[8], EZ[8];
-
-  int cRank;
-  int cId;
-  int type;
-} cElement_t;
-
-typedef struct
-{
-  int Nelements;
-  int offSet;
-} cluster_t;
-
-void meshBuildMRABClusters3D(mesh3D* mesh,
-                             int lev,
-                             dfloat* weights,
-                             int* levels,
-                             int* Nclusters,
-                             cluster_t** clusters,
-                             int* Nelements,
-                             cElement_t** newElements);
-
-// geometric partition of clusters of elements in 3D mesh using Morton ordering + parallelSort
-dfloat meshClusteredGeometricPartition3D(mesh3D* mesh, int Nclusters, cluster_t* clusters,
-                                         int* Nelements, cElement_t** elements);
-
-/* ---------------------------------------------------------
-
-   This function is a bit spaghetti, but the general idea is
-   we cluster low-MRAB-level elements together along with a
-   halo and partition the mesh of clusters. This reduces the MPI
-   costs of communicating on the low levels.
-
-   The algorithm performs the following steps
-   - cluster elements of level lev or lower
-   - put clusters together a single 'owning' process
-   - sort the list of clusters using a space-filling curve
-   - partition the SFC between the processors, exchange the
-    elements along the processor boundaries to improve the
-    partitioning.
-   - If the resulting partition is acceptable, save it.
-   - If not, return to the last acceptable partition, and rerun
-    the mesh setup.
-
-   ------------------------------------------------------------ */
-void meshMRABWeightedPartition3D(mesh3D* mesh, dfloat* weights,
-                                 int numLevels, int* levels)
-{
-  const dfloat TOL = 0.8; //tolerance on what partitions are ruled 'acceptable'
-                          // min_{ranks} totalWeight > TOL*max_{ranks} totalWeight => accepted
-
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  int Nelements, Nclusters;
-
-  cElement_t* elements, * acceptedPartition;
-  cluster_t* clusters;
-
-  if (!levels) numLevels = 1;
-
-  //perform the first weigthed partitioning with no clustering
-  meshBuildMRABClusters3D(mesh, -1, weights, levels, &Nclusters, &clusters, &Nelements, &elements);
-  meshClusteredGeometricPartition3D(mesh, Nclusters, clusters, &Nelements, &elements);
-
-  //initialize the accepted partition
-  int acceptedNelements = Nelements;
-  acceptedPartition = elements;
-
-  for (int lev = 0; lev < mesh->MRABNlevels; lev++) {
-    if (rank == 0) printf("Clustering level %d...", lev);
-    meshBuildMRABClusters3D(mesh, lev, weights, levels, &Nclusters, &clusters, &Nelements,
-                            &elements);
-    if (rank == 0) printf("done.\n");
-    dfloat partQuality = meshClusteredGeometricPartition3D(mesh,
-                                                           Nclusters,
-                                                           clusters,
-                                                           &Nelements,
-                                                           &elements);
-
-    if (partQuality > TOL) {
-      if (rank == 0) printf("Accepting level %d clustered partition...(quality = %g)\n",
-                            lev,
-                            partQuality);
-      free(acceptedPartition); //discard the old partition
-      acceptedNelements = Nelements;
-      acceptedPartition = elements; //good partition
-    } else {
-      if (rank == 0) printf("Regecting level %d clustered partition...(quality = %g)\n",
-                            lev,
-                            partQuality);
-      free(elements); //discard this partition
-      break;
-    }
-  }
-
-  //save this partition, and perform the mesh setup again.
-  mesh->Nelements = acceptedNelements;
-
-  mesh->EToV = (hlong*) realloc(mesh->EToV,mesh->Nelements * mesh->Nverts * sizeof(hlong));
-  mesh->EX = (dfloat*) realloc(mesh->EX,mesh->Nelements * mesh->Nverts * sizeof(dfloat));
-  mesh->EY = (dfloat*) realloc(mesh->EY,mesh->Nelements * mesh->Nverts * sizeof(dfloat));
-  mesh->EZ = (dfloat*) realloc(mesh->EZ,mesh->Nelements * mesh->Nverts * sizeof(dfloat));
-  mesh->elementInfo = (int*) realloc(mesh->elementInfo,mesh->Nelements * sizeof(int));
-  mesh->MRABlevel = (int*) realloc(mesh->MRABlevel,mesh->Nelements * sizeof(int));
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      mesh->EToV[e * mesh->Nverts + n] = acceptedPartition[e].v[n];
-      mesh->EX  [e * mesh->Nverts + n] = acceptedPartition[e].EX[n];
-      mesh->EY  [e * mesh->Nverts + n] = acceptedPartition[e].EY[n];
-      mesh->EZ  [e * mesh->Nverts + n] = acceptedPartition[e].EZ[n];
-    }
-    mesh->elementInfo[e] = acceptedPartition[e].type;
-    mesh->MRABlevel[e] = acceptedPartition[e].level;
-  }
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-  if(mesh->NfaceVertices == 2) { // Quad 3D
-    meshLoadReferenceNodesQuad2D(mesh, mesh->N);
-    meshPhysicalNodesQuad3D(mesh);
-    meshGeometricFactorsQuad3D(mesh);
-  }else if(mesh->NfaceVertices == 4) {    // tet
-    // compute physical (x,y) locations of the element nodes
-    meshPhysicalNodesTet3D(mesh);
-    // compute geometric factors
-    meshGeometricFactorsTet3D(mesh);
-  }else{                         // Hex
-    // compute physical (x,y) locations of the element nodes
-    meshPhysicalNodesHex3D(mesh);
-    // compute geometric factors
-    meshGeometricFactorsHex3D(mesh);
-  }
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectFaceNodes3D(mesh);
-
-  // compute surface geofacs
-  if(mesh->NfaceVertices == 2)
-    meshSurfaceGeometricFactorsQuad3D(mesh);
-  else if(mesh->NfaceVertices == 3)
-    meshSurfaceGeometricFactorsTet3D(mesh);
-  else
-    meshSurfaceGeometricFactorsHex3D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  if (mesh->totalHaloPairs) {
-    mesh->MRABlevel = (int*) realloc(mesh->MRABlevel,
-                                     (mesh->Nelements + mesh->totalHaloPairs) * sizeof(int));
-    int* MRABsendBuffer = (int*) calloc(mesh->totalHaloPairs,sizeof(int));
-    meshHaloExchange(mesh,
-                     sizeof(int),
-                     mesh->MRABlevel,
-                     MRABsendBuffer,
-                     mesh->MRABlevel + mesh->Nelements);
-    free(MRABsendBuffer);
-  }
-
-  free(acceptedPartition);
-}
diff --git a/src/libP/src/meshOccaSetup2D.c b/src/libP/src/meshOccaSetup2D.c
deleted file mode 100644
index 192d373f1..000000000
--- a/src/libP/src/meshOccaSetup2D.c
+++ /dev/null
@@ -1,588 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh2D.h"
-
-void meshOccaSetup2D(mesh2D* mesh, setupAide &newOptions, occa::properties &kernelInfo)
-{
-  //make seperate stream for halo exchange
-  mesh->defaultStream = mesh->device.getStream();
-  mesh->dataStream = mesh->device.createStream();
-  mesh->device.setStream(mesh->defaultStream);
-
-  // find elements that have all neighbors on this process
-  dlong* internalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-  dlong* notInternalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  dlong Ninterior = 0, NnotInterior = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    int flag = 0;
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      if(mesh->EToP[e * mesh->Nfaces + f] != -1)
-        flag = 1;
-    if(!flag)
-      internalElementIds[Ninterior++] = e;
-    else
-      notInternalElementIds[NnotInterior++] = e;
-  }
-
-  //printf("NinteriorElements = %d, NnotInternalElements = %d\n", Ninterior, NnotInterior);
-
-  mesh->NinternalElements = Ninterior;
-  mesh->NnotInternalElements = NnotInterior;
-  if(Ninterior)
-    mesh->o_internalElementIds    = mesh->device.malloc(Ninterior * sizeof(dlong),
-                                                        internalElementIds);
-
-  if(NnotInterior > 0)
-    mesh->o_notInternalElementIds = mesh->device.malloc(NnotInterior * sizeof(dlong),
-                                                        notInternalElementIds);
-
-  // // OCCA allocate device memory (remember to go back for halo)
-  // mesh->o_q =
-  //   mesh->device.malloc(mesh->Np*(mesh->totalHaloPairs+mesh->Nelements)*mesh->Nfields*sizeof(dfloat), mesh->q);
-  // mesh->o_rhsq =
-  //   mesh->device.malloc(mesh->Np*mesh->Nelements*mesh->Nfields*sizeof(dfloat), mesh->rhsq);
-  // mesh->o_resq =
-  //   mesh->device.malloc(mesh->Np*mesh->Nelements*mesh->Nfields*sizeof(dfloat), mesh->resq);
-
-  if (mesh->Nverts == 3) {
-    // build Dr, Ds, LIFT transposes
-    dfloat* DrT = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-    dfloat* DsT = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        DrT[n + m * mesh->Np] = mesh->Dr[n * mesh->Np + m];
-        DsT[n + m * mesh->Np] = mesh->Ds[n * mesh->Np + m];
-      }
-
-    // build Dr, Ds transposes
-    dfloat* DrsT = (dfloat*) calloc(2 * mesh->Np * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        DrsT[n + m * mesh->Np] = mesh->Dr[n * mesh->Np + m];
-        DrsT[n + m * mesh->Np + mesh->Np * mesh->Np] = mesh->Ds[n * mesh->Np + m];
-      }
-
-    dfloat* LIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Nfaces * mesh->Nfp; ++m)
-        LIFTT[n + m * mesh->Np] = mesh->LIFT[n * mesh->Nfp * mesh->Nfaces + m];
-
-    // build volume cubature matrix transposes
-    int cubNpBlocked = mesh->Np * ((mesh->cubNp + mesh->Np - 1) / mesh->Np);
-    dfloat* cubDrWT = (dfloat*) calloc(cubNpBlocked * mesh->Np, sizeof(dfloat));
-    dfloat* cubDsWT = (dfloat*) calloc(cubNpBlocked * mesh->Np, sizeof(dfloat));
-    dfloat* cubDrsWT = (dfloat*) calloc(2 * mesh->cubNp * mesh->Np, sizeof(dfloat));
-    dfloat* cubProjectT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-    dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->cubNp; ++m) {
-        cubDrWT[n + m * mesh->Np] = mesh->cubDrW[n * mesh->cubNp + m];
-        cubDsWT[n + m * mesh->Np] = mesh->cubDsW[n * mesh->cubNp + m];
-
-        cubDrsWT[n + m * mesh->Np] = mesh->cubDrW[n * mesh->cubNp + m];
-        cubDrsWT[n + m * mesh->Np + mesh->cubNp * mesh->Np] = mesh->cubDsW[n * mesh->cubNp + m];
-
-        cubProjectT[n + m * mesh->Np] = mesh->cubProject[n * mesh->cubNp + m];
-        cubInterpT[m + n * mesh->cubNp] = mesh->cubInterp[m * mesh->Np + n];
-        //      printf("%g @ ", cubInterpT[m+n*mesh->cubNp]);
-      }
-
-    // build surface integration matrix transposes
-    dfloat* intLIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-    dfloat* intInterpT = (dfloat*) calloc(mesh->Nfp * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Nfaces * mesh->intNfp; ++m)
-        intLIFTT[n + m * mesh->Np] = mesh->intLIFT[n * mesh->intNfp * mesh->Nfaces + m];
-    for(int n = 0; n < mesh->intNfp * mesh->Nfaces; ++n)
-      for(int m = 0; m < mesh->Nfp; ++m)
-        intInterpT[n + m * mesh->Nfaces * mesh->intNfp] = mesh->intInterp[n * mesh->Nfp + m];
-
-    // =============== BB operators [added by JC] ===============
-    // deriv operators: transpose from row major to column major
-    int* D1ids = (int*) calloc(mesh->Np * 3,sizeof(int));
-    int* D2ids = (int*) calloc(mesh->Np * 3,sizeof(int));
-    int* D3ids = (int*) calloc(mesh->Np * 3,sizeof(int));
-    dfloat* Dvals = (dfloat*) calloc(mesh->Np * 3,sizeof(dfloat));
-
-    dfloat* VBq = (dfloat*) calloc(mesh->Np * mesh->cubNp,sizeof(dfloat));
-    dfloat* PBq = (dfloat*) calloc(mesh->Np * mesh->cubNp,sizeof(dfloat));
-
-    dfloat* L0vals = (dfloat*) calloc(mesh->Nfp * 3,sizeof(dfloat)); // tridiag
-    int* ELids = (int*) calloc(1 + mesh->Np * mesh->max_EL_nnz,sizeof(int));
-    dfloat* ELvals = (dfloat*) calloc(1 + mesh->Np * mesh->max_EL_nnz,sizeof(dfloat));
-
-    for (int i = 0; i < mesh->Np; ++i)
-      for (int j = 0; j < 3; ++j) {
-        D1ids[i + j * mesh->Np] = mesh->D1ids[j + i * 3];
-        D2ids[i + j * mesh->Np] = mesh->D2ids[j + i * 3];
-        D3ids[i + j * mesh->Np] = mesh->D3ids[j + i * 3];
-        Dvals[i + j * mesh->Np] = mesh->Dvals[j + i * 3];
-      }
-
-    for (int i = 0; i < mesh->cubNp; ++i)
-      for (int j = 0; j < mesh->Np; ++j) {
-        VBq[i + j * mesh->cubNp] = mesh->VBq[j + i * mesh->Np];
-        PBq[j + i * mesh->Np] = mesh->PBq[i + j * mesh->cubNp];
-      }
-
-
-    for (int i = 0; i < mesh->Nfp; ++i)
-      for (int j = 0; j < 3; ++j)
-        L0vals[i + j * mesh->Nfp] = mesh->L0vals[j + i * 3];
-
-    for (int i = 0; i < mesh->Np; ++i)
-      for (int j = 0; j < mesh->max_EL_nnz; ++j) {
-        ELids[i + j * mesh->Np] = mesh->ELids[j + i * mesh->max_EL_nnz];
-        ELvals[i + j * mesh->Np] = mesh->ELvals[j + i * mesh->max_EL_nnz]; // ???
-      }
-
-    //BB mass matrix
-    mesh->BBMM = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    for (int n = 0; n < mesh->Np; ++n)
-      for (int m = 0; m < mesh->Np; ++m)
-        for (int i = 0; i < mesh->Np; ++i)
-          for (int j = 0; j < mesh->Np; ++j)
-            mesh->BBMM[n + m * mesh->Np] += mesh->VB[m + j * mesh->Np] *
-                                            mesh->MM[i + j * mesh->Np] * mesh->VB[n + i * mesh->Np];
-
-    // =============== end BB stuff =============================
-
-    //build element stiffness matrices
-    dfloat* SrrT, * SrsT, * SsrT, * SssT;
-    if (mesh->Nverts == 3) {
-      mesh->Srr = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      mesh->Srs = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      mesh->Ssr = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      mesh->Sss = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      for (int n = 0; n < mesh->Np; n++)
-        for (int m = 0; m < mesh->Np; m++)
-          for (int k = 0; k < mesh->Np; k++)
-            for (int l = 0; l < mesh->Np; l++) {
-              mesh->Srr[m + n * mesh->Np] += mesh->Dr[n + l * mesh->Np] *
-                                             mesh->MM[k + l * mesh->Np] *
-                                             mesh->Dr[m + k * mesh->Np];
-              mesh->Srs[m + n * mesh->Np] += mesh->Dr[n + l * mesh->Np] *
-                                             mesh->MM[k + l * mesh->Np] *
-                                             mesh->Ds[m + k * mesh->Np];
-              mesh->Ssr[m + n * mesh->Np] += mesh->Ds[n + l * mesh->Np] *
-                                             mesh->MM[k + l * mesh->Np] *
-                                             mesh->Dr[m + k * mesh->Np];
-              mesh->Sss[m + n * mesh->Np] += mesh->Ds[n + l * mesh->Np] *
-                                             mesh->MM[k + l * mesh->Np] *
-                                             mesh->Ds[m + k * mesh->Np];
-            }
-      SrrT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      SrsT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      SsrT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      SssT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-      for (int n = 0; n < mesh->Np; n++)
-        for (int m = 0; m < mesh->Np; m++) {
-          SrrT[m + n * mesh->Np] = mesh->Srr[n + m * mesh->Np];
-          SrsT[m + n * mesh->Np] = mesh->Srs[n + m * mesh->Np];
-          SsrT[m + n * mesh->Np] = mesh->Ssr[n + m * mesh->Np];
-          SssT[m + n * mesh->Np] = mesh->Sss[n + m * mesh->Np];
-        }
-    }
-
-    dfloat* ST = (dfloat*) calloc(3 * mesh->Np * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        ST[n + m * mesh->Np + 0 * mesh->Np * mesh->Np] = mesh->Srr[n * mesh->Np + m];
-        ST[n + m * mesh->Np + 1 * mesh->Np * mesh->Np] = mesh->Srs[n * mesh->Np + m] +
-                                                         mesh->Ssr[n * mesh->Np + m];
-        ST[n + m * mesh->Np + 2 * mesh->Np * mesh->Np] = mesh->Sss[n * mesh->Np + m];
-      }
-
-    mesh->intx = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-    mesh->inty = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      for(int f = 0; f < mesh->Nfaces; ++f)
-        for(int n = 0; n < mesh->intNfp; ++n) {
-          dfloat ix = 0, iy = 0;
-          for(int m = 0; m < mesh->Nfp; ++m) {
-            dlong vid = mesh->vmapM[m + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces];
-            dfloat xm = mesh->x[vid];
-            dfloat ym = mesh->y[vid];
-            //dfloat Inm = mesh->intInterp[n+f*mesh->intNfp+m*mesh->intNfp*mesh->Nfaces];
-            dfloat Inm = mesh->intInterp[m + n * mesh->Nfp + f * mesh->intNfp * mesh->Nfp]; // Fixed
-            ix += Inm * xm;
-            iy += Inm * ym;
-          }
-          dlong id = n + f * mesh->intNfp + e * mesh->Nfaces * mesh->intNfp;
-          mesh->intx[id] = ix;
-          mesh->inty[id] = iy;
-        }
-
-    mesh->o_Dr = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                     mesh->Dr);
-
-    mesh->o_Ds = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                     mesh->Ds);
-
-    mesh->o_DrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                      DrT);
-
-    mesh->o_DsT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                      DsT);
-
-    mesh->o_DtT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                      DsT); // note: dummy allocated with DsT
-
-    mesh->o_Dmatrices = mesh->device.malloc(2 * mesh->Np * mesh->Np * sizeof(dfloat), DrsT);
-
-    mesh->o_LIFT =
-      mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->Nfp * sizeof(dfloat),
-                          mesh->LIFT);
-
-    mesh->o_LIFTT =
-      mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->Nfp * sizeof(dfloat),
-                          LIFTT);
-
-    mesh->o_vgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * sizeof(dfloat),
-                          mesh->vgeo);
-
-    mesh->o_cubvgeo =   mesh->device.malloc(sizeof(dfloat));// dummy
-
-    mesh->o_sgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nsgeo * sizeof(dfloat),
-                          mesh->sgeo);
-
-    mesh->o_ggeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nggeo * sizeof(dfloat),
-                          mesh->ggeo);
-
-    mesh->o_cubsgeo = mesh->o_sgeo; //dummy cubature geo factors
-
-    mesh->o_SrrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SrrT);
-    mesh->o_SrsT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SrsT);
-    mesh->o_SsrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SsrT);
-    mesh->o_SssT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SssT);
-    mesh->o_Smatrices = mesh->device.malloc(3 * mesh->Np * mesh->Np * sizeof(dfloat), ST);
-
-    mesh->o_D1ids = mesh->device.malloc(mesh->Np * 3 * sizeof(int),D1ids);
-    mesh->o_D2ids = mesh->device.malloc(mesh->Np * 3 * sizeof(int),D2ids);
-    mesh->o_D3ids = mesh->device.malloc(mesh->Np * 3 * sizeof(int),D3ids);
-    mesh->o_Dvals = mesh->device.malloc(mesh->Np * 3 * sizeof(dfloat),Dvals);
-
-    mesh->o_BBMM = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),mesh->BBMM);
-
-    mesh->o_VBq = mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),VBq);
-    mesh->o_PBq = mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),PBq);
-
-    mesh->o_L0vals = mesh->device.malloc(mesh->Nfp * 3 * sizeof(dfloat),L0vals);
-    mesh->o_ELids =
-      mesh->device.malloc(mesh->Np * mesh->max_EL_nnz * sizeof(int),ELids);
-    mesh->o_ELvals =
-      mesh->device.malloc(mesh->Np * mesh->max_EL_nnz * sizeof(dfloat),ELvals);
-
-    mesh->o_cubInterpT =
-      mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                          cubInterpT);
-
-    mesh->o_cubProjectT =
-      mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                          cubProjectT);
-
-    mesh->o_cubDrWT =
-      mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                          cubDrWT);
-
-    mesh->o_cubDsWT =
-      mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                          cubDsWT);
-
-    mesh->o_cubDtWT =
-      mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                          cubDsWT); // dummy to align with 3d
-
-    mesh->o_cubDWmatrices = mesh->device.malloc(2 * mesh->cubNp * mesh->Np * sizeof(dfloat),
-                                                cubDrsWT);
-
-    mesh->o_intInterpT =
-      mesh->device.malloc(mesh->Nfp * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                          intInterpT);
-
-    mesh->o_intLIFTT =
-      mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                          intLIFTT);
-
-    mesh->o_intx =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                          mesh->intx);
-
-    mesh->o_inty =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                          mesh->inty);
-
-    mesh->o_intz =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                          mesh->inty); // dummy to align with 3d
-
-    free(DrsT);
-    free(ST);
-  } else if (mesh->Nverts == 4) {//quads
-    //lumped mass matrix
-    mesh->MM = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-    for (int j = 0; j < mesh->Nq; j++)
-      for (int i = 0; i < mesh->Nq; i++) {
-        int n = i + j * mesh->Nq;
-        mesh->MM[n + n * mesh->Np] = mesh->gllw[i] * mesh->gllw[j];
-      }
-
-    dfloat* cubDWT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-    dfloat* cubDiffInterpT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-    dfloat* cubProjectT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-    dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-    for(int n = 0; n < mesh->Nq; ++n)
-      for(int m = 0; m < mesh->cubNq; ++m) {
-        cubDWT[n + m * mesh->Nq] = mesh->cubDW[n * mesh->cubNq + m];
-        cubProjectT[n + m * mesh->Nq] = mesh->cubProject[n * mesh->cubNq + m];
-        cubDiffInterpT[m + n * mesh->cubNq] = mesh->cubDiffInterp[m * mesh->Nq + n];
-        cubInterpT[m + n * mesh->cubNq] = mesh->cubInterp[m * mesh->Nq + n];
-      }
-
-    mesh->intx = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq, sizeof(dfloat));
-    mesh->inty = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq, sizeof(dfloat));
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      for(int f = 0; f < mesh->Nfaces; ++f)
-        for(int n = 0; n < mesh->cubNq; ++n) {
-          dfloat ix = 0, iy = 0;
-          for(int m = 0; m < mesh->Nq; ++m) {
-            dlong vid = mesh->vmapM[m + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces];
-            dfloat xm = mesh->x[vid];
-            dfloat ym = mesh->y[vid];
-
-            dfloat Inm = mesh->cubInterp[m + n * mesh->Nq];
-            ix += Inm * xm;
-            iy += Inm * ym;
-          }
-          dlong id = n + f * mesh->cubNq + e * mesh->Nfaces * mesh->cubNq;
-          mesh->intx[id] = ix;
-          mesh->inty[id] = iy;
-        }
-
-    mesh->o_D = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D);
-
-    mesh->o_Dmatrices = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D);
-    mesh->o_Smatrices = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D); //dummy
-
-    mesh->o_vgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->Np * sizeof(dfloat),
-                          mesh->vgeo);
-    mesh->o_sgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nfp * mesh->Nsgeo * sizeof(dfloat),
-                          mesh->sgeo);
-    mesh->o_ggeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(dfloat),
-                          mesh->ggeo);
-
-    mesh->o_cubvgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp * sizeof(dfloat),
-                          mesh->cubvgeo);
-
-    mesh->o_cubsgeo =
-      mesh->device.malloc(
-        mesh->Nelements * mesh->Nfaces * mesh->cubNq * mesh->Nsgeo * sizeof(dfloat),
-        mesh->cubsgeo);
-
-    mesh->o_cubInterpT =
-      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                          cubInterpT);
-
-    mesh->o_cubProjectT =
-      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                          cubProjectT);
-
-    mesh->o_cubDWT =
-      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                          cubDWT);
-
-    mesh->o_cubDiffInterpT =
-      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                          cubDiffInterpT);
-
-    mesh->o_cubDWmatrices = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat), cubDWT);
-
-    dfloat* LIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
-
-    mesh->o_LIFTT =
-      mesh->device.malloc(1 * sizeof(dfloat)); // dummy
-
-    mesh->o_intx =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq * sizeof(dfloat),
-                          mesh->intx);
-
-    mesh->o_inty =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq * sizeof(dfloat),
-                          mesh->inty);
-
-    // dummy int z variable (using y)
-    mesh->o_intz =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq * sizeof(dfloat),
-                          mesh->inty);
-
-    //dummy quadrature lifter operators
-    mesh->o_intInterpT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
-    mesh->o_intInterpT.copyFrom(mesh->o_cubInterpT);
-
-    mesh->o_intLIFTT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
-    mesh->o_intLIFTT.copyFrom(mesh->o_cubProjectT);
-  }
-
-  mesh->o_MM =
-    mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                        mesh->MM);
-
-  mesh->o_vmapM =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapM);
-
-  mesh->o_vmapP =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapP);
-
-  mesh->o_EToB =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),
-                        mesh->EToB);
-
-  mesh->o_x =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->x);
-
-  mesh->o_y =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->y);
-
-  // dummy z variables (note used y)
-  mesh->o_z =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->y);
-
-  if(mesh->totalHaloPairs > 0) {
-    // copy halo element list to DEVICE
-    mesh->o_haloElementList =
-      mesh->device.malloc(mesh->totalHaloPairs * sizeof(dlong), mesh->haloElementList);
-
-    // temporary DEVICE buffer for halo (maximum size Nfields*Np for dfloat)
-    mesh->o_haloBuffer =
-      mesh->device.malloc(mesh->totalHaloPairs * mesh->Np * mesh->Nfields * sizeof(dfloat));
-
-    mesh->o_haloGetNodeIds =
-      mesh->device.malloc(mesh->Nfp * mesh->totalHaloPairs * sizeof(dlong), mesh->haloGetNodeIds);
-
-    mesh->o_haloPutNodeIds =
-      mesh->device.malloc(mesh->Nfp * mesh->totalHaloPairs * sizeof(dlong), mesh->haloPutNodeIds);
-  }
-
-  //-------------------------------------
-  // NBN: 2 streams for async MPI updates
-  // {Vol, Surf, update}  run on q[0]
-  // {halo-get, copy} run on q[1]
-  //-------------------------------------
-  mesh->stream0 = mesh->device.getStream();
-#ifdef USE_2_STREAMS
-  mesh->stream1 = mesh->device.createStream();  // NBN: second stream
-#else
-  mesh->stream1 = mesh->stream0;                // NBN: stream1 == stream0
-#endif
-  mesh->device.setStream(mesh->stream0);
-  //-------------------------------------
-
-  kernelInfo["defines/" "p_Nfields"] = mesh->Nfields;
-  kernelInfo["defines/" "p_N"] = mesh->N;
-  kernelInfo["defines/" "p_Nq"] = mesh->N + 1;
-  kernelInfo["defines/" "p_Np"] = mesh->Np;
-  kernelInfo["defines/" "p_Nfp"] = mesh->Nfp;
-  kernelInfo["defines/" "p_Nfaces"] = mesh->Nfaces;
-  kernelInfo["defines/" "p_NfacesNfp"] = mesh->Nfp * mesh->Nfaces;
-  kernelInfo["defines/" "p_Nvgeo"] = mesh->Nvgeo;
-  kernelInfo["defines/" "p_Nsgeo"] = mesh->Nsgeo;
-  kernelInfo["defines/" "p_Nggeo"] = mesh->Nggeo;
-
-  kernelInfo["defines/" "p_NXID"] = NXID;
-  kernelInfo["defines/" "p_NYID"] = NYID;
-  kernelInfo["defines/" "p_SJID"] = SJID;
-  kernelInfo["defines/" "p_IJID"] = IJID;
-  kernelInfo["defines/" "p_IHID"] = IHID;
-  kernelInfo["defines/" "p_WIJID"] = WIJID;
-  kernelInfo["defines/" "p_WSJID"] = WSJID;
-
-  kernelInfo["defines/" "p_max_EL_nnz"] = mesh->max_EL_nnz; // for Bernstein Bezier lift
-
-  kernelInfo["defines/" "p_cubNq"] = mesh->cubNq;
-  kernelInfo["defines/" "p_cubNp"] = mesh->cubNp;
-  kernelInfo["defines/" "p_intNfp"] = mesh->intNfp;
-  kernelInfo["defines/" "p_intNfpNfaces"] = mesh->intNfp * mesh->Nfaces;
-
-  if(sizeof(dfloat) == 4) {
-    kernelInfo["defines/" "dfloat"] = "float";
-    kernelInfo["defines/" "dfloat2"] = "float2";
-    kernelInfo["defines/" "dfloat4"] = "float4";
-    kernelInfo["defines/" "dfloat8"] = "float8";
-  }
-  if(sizeof(dfloat) == 8) {
-    kernelInfo["defines/" "dfloat"] = "double";
-    kernelInfo["defines/" "dfloat2"] = "double2";
-    kernelInfo["defines/" "dfloat4"] = "double4";
-    kernelInfo["defines/" "dfloat8"] = "double8";
-  }
-
-  if(sizeof(dlong) == 4)
-    kernelInfo["defines/" "dlong"] = "int";
-  if(sizeof(dlong) == 8)
-    kernelInfo["defines/" "dlong"] = "long long int";
-
-  if(mesh->device.mode() == "CUDA") { // add backend compiler optimization for CUDA
-    kernelInfo["compiler_flags"] += " --ftz=true ";
-    kernelInfo["compiler_flags"] += " --prec-div=false ";
-    kernelInfo["compiler_flags"] += " --prec-sqrt=false ";
-    kernelInfo["compiler_flags"] += " --use_fast_math ";
-    kernelInfo["compiler_flags"] += " --fmad=true "; // compiler option for cuda
-  }
-
-  kernelInfo["defines/" "p_G00ID"] = G00ID;
-  kernelInfo["defines/" "p_G01ID"] = G01ID;
-  kernelInfo["defines/" "p_G11ID"] = G11ID;
-  kernelInfo["defines/" "p_GWJID"] = GWJID;
-
-  kernelInfo["defines/" "p_RXID"] = RXID;
-  kernelInfo["defines/" "p_SXID"] = SXID;
-
-  kernelInfo["defines/" "p_RYID"] = RYID;
-  kernelInfo["defines/" "p_SYID"] = SYID;
-
-  kernelInfo["defines/" "p_JID"] = JID;
-  kernelInfo["defines/" "p_JWID"] = JWID;
-  kernelInfo["defines/" "p_IJWID"] = IJWID;
-}
diff --git a/src/libP/src/meshOccaSetup3D.c b/src/libP/src/meshOccaSetup3D.c
deleted file mode 100644
index 7e1052d48..000000000
--- a/src/libP/src/meshOccaSetup3D.c
+++ /dev/null
@@ -1,785 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh3D.h"
-
-void reportMemoryUsage(occa::device &device, const char* mess)
-{
-  size_t bytes = device.memoryAllocated();
-
-  printf("%s: bytes allocated = %lu\n", mess, bytes);
-}
-
-void meshOccaPopulateDevice3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo)
-{
-  // find elements that have all neighbors on this process
-  dlong* internalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-  dlong* notInternalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  dlong Ninterior = 0, NnotInterior = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    int flag = 0;
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      if(mesh->EToP[e * mesh->Nfaces + f] != -1)
-        flag = 1;
-    if(!flag)
-      internalElementIds[Ninterior++] = e;
-    else
-      notInternalElementIds[NnotInterior++] = e;
-  }
-
-  //  printf("NinteriorElements = %d, NnotInternalElements = %d\n", Ninterior, NnotInterior);
-
-  mesh->NinternalElements = Ninterior;
-  mesh->NnotInternalElements = NnotInterior;
-  if(Ninterior)
-    mesh->o_internalElementIds    = mesh->device.malloc(Ninterior * sizeof(dlong),
-                                                        internalElementIds);
-
-  if(NnotInterior > 0)
-    mesh->o_notInternalElementIds = mesh->device.malloc(NnotInterior * sizeof(dlong),
-                                                        notInternalElementIds);
-
-  // // OCCA allocate device memory (remember to go back for halo)
-  // mesh->o_q =
-  //   mesh->device.malloc(mesh->Np*(mesh->totalHaloPairs+mesh->Nelements)*mesh->Nfields*sizeof(dfloat), mesh->q);
-  // mesh->o_rhsq =
-  //   mesh->device.malloc(mesh->Np*mesh->Nelements*mesh->Nfields*sizeof(dfloat), mesh->rhsq);
-  // mesh->o_resq =
-  //   mesh->device.malloc(mesh->Np*mesh->Nelements*mesh->Nfields*sizeof(dfloat), mesh->resq);
-
-  //  reportMemoryUsage(mesh->device, "meshOccaSetup3D: before operators ");
-
-  if(mesh->Nfaces == 4) {
-    // build Dr, Ds, LIFT transposes
-    dfloat* DrT = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-    dfloat* DsT = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-    dfloat* DtT = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        DrT[n + m * mesh->Np] = mesh->Dr[n * mesh->Np + m];
-        DsT[n + m * mesh->Np] = mesh->Ds[n * mesh->Np + m];
-        DtT[n + m * mesh->Np] = mesh->Dt[n * mesh->Np + m];
-      }
-
-    // build Dr, Ds transposes
-    dfloat* DrstT = (dfloat*) calloc(3 * mesh->Np * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        DrstT[n + m * mesh->Np] = mesh->Dr[n * mesh->Np + m];
-        DrstT[n + m * mesh->Np + mesh->Np * mesh->Np] = mesh->Ds[n * mesh->Np + m];
-        DrstT[n + m * mesh->Np + 2 * mesh->Np * mesh->Np] = mesh->Dt[n * mesh->Np + m];
-      }
-
-    dfloat* LIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Nfaces * mesh->Nfp; ++m)
-        LIFTT[n + m * mesh->Np] = mesh->LIFT[n * mesh->Nfp * mesh->Nfaces + m];
-
-    // =============== BB operators [added by NC] ===============
-
-    // deriv operators: transpose from row major to column major
-    int* D0ids = (int*) calloc(mesh->Np * 4,sizeof(int));
-    int* D1ids = (int*) calloc(mesh->Np * 4,sizeof(int));
-    int* D2ids = (int*) calloc(mesh->Np * 4,sizeof(int));
-    int* D3ids = (int*) calloc(mesh->Np * 4,sizeof(int));
-    dfloat* Dvals = (dfloat*) calloc(mesh->Np * 4,sizeof(dfloat));
-
-    int* L0ids = (int*) calloc(mesh->Nfp * 7,sizeof(int));
-    dfloat* L0vals = (dfloat*) calloc(mesh->Nfp * 7,sizeof(dfloat)); // tridiag
-    int* ELids = (int*) calloc(mesh->Np * mesh->max_EL_nnz,sizeof(int));
-    dfloat* ELvals = (dfloat*) calloc(mesh->Np * mesh->max_EL_nnz,sizeof(dfloat));
-
-    for (int i = 0; i < mesh->Np; ++i)
-      for (int j = 0; j < 4; ++j) {
-        D0ids[i + j * mesh->Np] = mesh->D0ids[j + i * 4];
-        D1ids[i + j * mesh->Np] = mesh->D1ids[j + i * 4];
-        D2ids[i + j * mesh->Np] = mesh->D2ids[j + i * 4];
-        D3ids[i + j * mesh->Np] = mesh->D3ids[j + i * 4];
-        Dvals[i + j * mesh->Np] = mesh->Dvals[j + i * 4];
-      }
-
-    for (int i = 0; i < mesh->Nfp; ++i)
-      for (int j = 0; j < 7; ++j) {
-        L0ids [i + j * mesh->Nfp] = mesh->L0ids [j + i * 7];
-        L0vals[i + j * mesh->Nfp] = mesh->L0vals[j + i * 7];
-      }
-
-    for (int i = 0; i < mesh->Np; ++i)
-      for (int j = 0; j < mesh->max_EL_nnz; ++j) {
-        ELids [i + j * mesh->Np] = mesh->ELids [j + i * mesh->max_EL_nnz];
-        ELvals[i + j * mesh->Np] = mesh->ELvals[j + i * mesh->max_EL_nnz];
-      }
-    // =============== end BB stuff =============================
-
-    if(mesh->cubNp) {
-      dfloat* cubDrWT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-      dfloat* cubDsWT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-      dfloat* cubDtWT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-      dfloat* cubDrstWT = (dfloat*) calloc(3 * mesh->cubNp * mesh->Np, sizeof(dfloat));
-      dfloat* cubProjectT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-      dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-      for(int n = 0; n < mesh->Np; ++n)
-        for(int m = 0; m < mesh->cubNp; ++m) {
-          cubDrWT[n + m * mesh->Np] = mesh->cubDrW[n * mesh->cubNp + m];
-          cubDsWT[n + m * mesh->Np] = mesh->cubDsW[n * mesh->cubNp + m];
-          cubDtWT[n + m * mesh->Np] = mesh->cubDtW[n * mesh->cubNp + m];
-
-          cubDrstWT[n + m * mesh->Np] = mesh->cubDrW[n * mesh->cubNp + m];
-          cubDrstWT[n + m * mesh->Np + mesh->cubNp * mesh->Np] = mesh->cubDsW[n * mesh->cubNp + m];
-          cubDrstWT[n + m * mesh->Np + 2 * mesh->cubNp *
-                    mesh->Np] = mesh->cubDtW[n * mesh->cubNp + m];
-
-          cubProjectT[n + m * mesh->Np] = mesh->cubProject[n * mesh->cubNp + m];
-          cubInterpT[m + n * mesh->cubNp] = mesh->cubInterp[m * mesh->Np + n];
-        }
-
-      mesh->o_cubInterpT =
-        mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                            cubInterpT);
-
-      mesh->o_cubProjectT =
-        mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                            cubProjectT);
-
-      mesh->o_cubDrWT =
-        mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                            cubDrWT);
-
-      mesh->o_cubDsWT =
-        mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                            cubDsWT);
-
-      mesh->o_cubDtWT =
-        mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                            cubDtWT);
-
-      mesh->o_cubDWmatrices = mesh->device.malloc(3 * mesh->cubNp * mesh->Np * sizeof(dfloat),
-                                                  cubDrstWT);
-    }
-
-    if(mesh->intNfp) {
-      // build surface integration matrix transposes
-      dfloat* intLIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-      dfloat* intInterpT =
-        (dfloat*) calloc(mesh->Nfp * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-      for(int n = 0; n < mesh->Np; ++n)
-        for(int m = 0; m < mesh->Nfaces * mesh->intNfp; ++m)
-          intLIFTT[n + m * mesh->Np] = mesh->intLIFT[n * mesh->intNfp * mesh->Nfaces + m];
-      for(int n = 0; n < mesh->intNfp * mesh->Nfaces; ++n)
-        for(int m = 0; m < mesh->Nfp; ++m)
-          intInterpT[n + m * mesh->Nfaces * mesh->intNfp] = mesh->intInterp[n * mesh->Nfp + m];
-
-      mesh->o_intInterpT =
-        mesh->device.malloc(mesh->Nfp * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                            intInterpT);
-
-      mesh->o_intLIFTT =
-        mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                            intLIFTT);
-
-      // printf("Integration number of points: %d \n",mesh->intNfp);
-      mesh->intx = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-      mesh->inty = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-      mesh->intz = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-
-      for(dlong e = 0; e < mesh->Nelements; ++e)
-        for(int f = 0; f < mesh->Nfaces; ++f)
-          for(int n = 0; n < mesh->intNfp; ++n) {
-            dfloat ix = 0, iy = 0, iz = 0;
-            for(int m = 0; m < mesh->Nfp; ++m) {
-              dlong vid = mesh->vmapM[m + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces];
-              dfloat xm = mesh->x[vid];
-              dfloat ym = mesh->y[vid];
-              dfloat zm = mesh->z[vid];
-              dfloat Inm = mesh->intInterp[m + n * mesh->Nfp + f * mesh->intNfp * mesh->Nfp]; // Fixed
-              ix += Inm * xm;
-              iy += Inm * ym;
-              iz += Inm * zm;
-            }
-            dlong id = n + f * mesh->intNfp + e * mesh->Nfaces * mesh->intNfp;
-            mesh->intx[id] = ix;
-            mesh->inty[id] = iy;
-            mesh->intz[id] = iz;
-          }
-
-      mesh->o_intx =
-        mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                            mesh->intx);
-
-      mesh->o_inty =
-        mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                            mesh->inty);
-
-      mesh->o_intz =
-        mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                            mesh->intz);
-    }
-
-    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: after operators and integration grids ");
-
-    // =============== Bernstein-Bezier allocations [added by NC] ============
-
-    // create packed BB indexes
-    mesh->o_D0ids = mesh->device.malloc(mesh->Np * 4 * sizeof(int),D0ids);
-    mesh->o_D1ids = mesh->device.malloc(mesh->Np * 4 * sizeof(int),D1ids);
-    mesh->o_D2ids = mesh->device.malloc(mesh->Np * 4 * sizeof(int),D2ids);
-    mesh->o_D3ids = mesh->device.malloc(mesh->Np * 4 * sizeof(int),D3ids);
-    mesh->o_Dvals = mesh->device.malloc(mesh->Np * 4 * sizeof(dfloat),Dvals);
-
-    unsigned char* packedDids = (unsigned char*) malloc(mesh->Np * 3 * 4 * sizeof(unsigned char));
-
-    for(int n = 0; n < 4 * mesh->Np; ++n) {
-      if(D1ids[n] < D0ids[n]) printf("bugger: D0id > D1id\n");
-      if(D2ids[n] < D0ids[n]) printf("bugger: D0id > D2id\n");
-      if(D3ids[n] < D0ids[n]) printf("bugger: D0id > D3id\n");
-    }
-
-    for(int n = 0; n < mesh->Np; ++n) {
-      packedDids[n * 4 + 0] = D1ids[n + 0 * mesh->Np] - D0ids[n + 0 * mesh->Np];
-      packedDids[n * 4 + 1] = D1ids[n + 1 * mesh->Np] - D0ids[n + 1 * mesh->Np];
-      packedDids[n * 4 + 2] = D1ids[n + 2 * mesh->Np] - D0ids[n + 2 * mesh->Np];
-      packedDids[n * 4 + 3] = D1ids[n + 3 * mesh->Np] - D0ids[n + 3 * mesh->Np];
-
-      packedDids[4 * mesh->Np + n * 4 + 0] = D2ids[n + 0 * mesh->Np] - D0ids[n + 0 * mesh->Np];
-      packedDids[4 * mesh->Np + n * 4 + 1] = D2ids[n + 1 * mesh->Np] - D0ids[n + 1 * mesh->Np];
-      packedDids[4 * mesh->Np + n * 4 + 2] = D2ids[n + 2 * mesh->Np] - D0ids[n + 2 * mesh->Np];
-      packedDids[4 * mesh->Np + n * 4 + 3] = D2ids[n + 3 * mesh->Np] - D0ids[n + 3 * mesh->Np];
-
-      packedDids[8 * mesh->Np + n * 4 + 0] = D3ids[n + 0 * mesh->Np] - D0ids[n + 0 * mesh->Np];
-      packedDids[8 * mesh->Np + n * 4 + 1] = D3ids[n + 1 * mesh->Np] - D0ids[n + 1 * mesh->Np];
-      packedDids[8 * mesh->Np + n * 4 + 2] = D3ids[n + 2 * mesh->Np] - D0ids[n + 2 * mesh->Np];
-      packedDids[8 * mesh->Np + n * 4 + 3] = D3ids[n + 3 * mesh->Np] - D0ids[n + 3 * mesh->Np];
-    }
-
-    mesh->o_packedDids = mesh->device.malloc(mesh->Np * 3 * 4 * sizeof(unsigned char),packedDids);
-
-    mesh->o_L0ids  = mesh->device.malloc(mesh->Nfp * 7 * sizeof(int),L0ids);
-    mesh->o_L0vals = mesh->device.malloc(mesh->Nfp * 7 * sizeof(dfloat),L0vals);
-    mesh->o_ELids  = mesh->device.malloc(mesh->Np * mesh->max_EL_nnz * sizeof(int),ELids);
-    mesh->o_ELvals = mesh->device.malloc(mesh->Np * mesh->max_EL_nnz * sizeof(dfloat),ELvals);
-    // =============== end Bernstein-Bezier section [added by NC] ============
-
-    //build element stiffness matrices
-    dfloat* SrrT, * SrsT, * SrtT;
-    dfloat* SsrT, * SssT, * SstT;
-    dfloat* StrT, * StsT, * SttT;
-
-    mesh->Srr = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Srs = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Srt = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Ssr = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Sss = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Sst = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Str = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Sts = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Stt = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    for (int n = 0; n < mesh->Np; n++)
-      for (int m = 0; m < mesh->Np; m++)
-        for (int k = 0; k < mesh->Np; k++)
-          for (int l = 0; l < mesh->Np; l++) {
-            mesh->Srr[m + n * mesh->Np] += mesh->Dr[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dr[m + k * mesh->Np];
-            mesh->Srs[m + n * mesh->Np] += mesh->Dr[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Ds[m + k * mesh->Np];
-            mesh->Srt[m + n * mesh->Np] += mesh->Dr[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dt[m + k * mesh->Np];
-            mesh->Ssr[m + n * mesh->Np] += mesh->Ds[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dr[m + k * mesh->Np];
-            mesh->Sss[m + n * mesh->Np] += mesh->Ds[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Ds[m + k * mesh->Np];
-            mesh->Sst[m + n * mesh->Np] += mesh->Ds[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dt[m + k * mesh->Np];
-            mesh->Str[m + n * mesh->Np] += mesh->Dt[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dr[m + k * mesh->Np];
-            mesh->Sts[m + n * mesh->Np] += mesh->Dt[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Ds[m + k * mesh->Np];
-            mesh->Stt[m + n * mesh->Np] += mesh->Dt[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dt[m + k * mesh->Np];
-          }
-    SrrT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SrsT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SrtT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SsrT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SssT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SstT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    StrT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    StsT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SttT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    for (int n = 0; n < mesh->Np; n++) {
-      for (int m = 0; m < mesh->Np; m++) {
-#if 0
-        SrrT[m + n * mesh->Np] = mesh->Srr[n + m * mesh->Np];
-        SrsT[m + n * mesh->Np] = mesh->Srs[n + m * mesh->Np];
-        SrtT[m + n * mesh->Np] = mesh->Srt[n + m * mesh->Np];
-        SsrT[m + n * mesh->Np] = mesh->Ssr[n + m * mesh->Np];
-        SssT[m + n * mesh->Np] = mesh->Sss[n + m * mesh->Np];
-        SstT[m + n * mesh->Np] = mesh->Sst[n + m * mesh->Np];
-        StrT[m + n * mesh->Np] = mesh->Str[n + m * mesh->Np];
-        StsT[m + n * mesh->Np] = mesh->Sts[n + m * mesh->Np];
-        SttT[m + n * mesh->Np] = mesh->Stt[n + m * mesh->Np];
-#else
-        SrrT[m + n * mesh->Np] = mesh->Srr[n + m * mesh->Np];
-        SrsT[m + n * mesh->Np] = mesh->Srs[n + m * mesh->Np] + mesh->Ssr[n + m * mesh->Np];
-        SrtT[m + n * mesh->Np] = mesh->Srt[n + m * mesh->Np] + mesh->Str[n + m * mesh->Np];
-        SssT[m + n * mesh->Np] = mesh->Sss[n + m * mesh->Np];
-        SstT[m + n * mesh->Np] = mesh->Sst[n + m * mesh->Np] + mesh->Sts[n + m * mesh->Np];
-        SttT[m + n * mesh->Np] = mesh->Stt[n + m * mesh->Np];
-#endif
-      }
-    }
-
-    dfloat* ST = (dfloat*) calloc(6 * mesh->Np * mesh->Np, sizeof(dfloat));
-    for(int n = 0; n < mesh->Np; ++n)
-      for(int m = 0; m < mesh->Np; ++m) {
-        ST[n + m * mesh->Np + 0 * mesh->Np * mesh->Np] = mesh->Srr[n * mesh->Np + m];
-        ST[n + m * mesh->Np + 1 * mesh->Np * mesh->Np] = mesh->Srs[n * mesh->Np + m] +
-                                                         mesh->Ssr[n * mesh->Np + m];
-        ST[n + m * mesh->Np + 2 * mesh->Np * mesh->Np] = mesh->Srt[n * mesh->Np + m] +
-                                                         mesh->Str[n * mesh->Np + m];
-        ST[n + m * mesh->Np + 3 * mesh->Np * mesh->Np] = mesh->Sss[n * mesh->Np + m];
-        ST[n + m * mesh->Np + 4 * mesh->Np * mesh->Np] = mesh->Sst[n * mesh->Np + m] +
-                                                         mesh->Sts[n * mesh->Np + m];
-        ST[n + m * mesh->Np + 5 * mesh->Np * mesh->Np] = mesh->Stt[n * mesh->Np + m];
-      }
-
-
-    mesh->o_Dr = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), mesh->Dr);
-    mesh->o_Ds = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), mesh->Ds);
-    mesh->o_Dt = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), mesh->Dt);
-
-    mesh->o_DrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), DrT);
-    mesh->o_DsT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), DsT);
-    mesh->o_DtT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), DtT);
-
-    mesh->o_Dmatrices = mesh->device.malloc(3 * mesh->Np * mesh->Np * sizeof(dfloat), DrstT);
-
-    mesh->o_LIFT =
-      mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->Nfp * sizeof(dfloat),
-                          mesh->LIFT);
-
-    mesh->o_LIFTT =
-      mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->Nfp * sizeof(dfloat),
-                          LIFTT);
-
-    mesh->o_MM =
-      mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                          mesh->MM);
-
-    mesh->o_vgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * sizeof(dfloat),
-                          mesh->vgeo);
-
-    mesh->o_sgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nsgeo * sizeof(dfloat),
-                          mesh->sgeo);
-
-    mesh->o_cubsgeo = mesh->o_sgeo; //dummy cubature geo factors
-
-    mesh->o_ggeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nggeo * sizeof(dfloat),
-                          mesh->ggeo);
-
-    mesh->o_cubvgeo =   mesh->device.malloc(sizeof(dfloat));// dummy
-
-    mesh->o_SrrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SrrT);
-    mesh->o_SrsT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SrsT);
-    mesh->o_SrtT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SrtT);
-    mesh->o_SsrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SsrT);
-    mesh->o_SssT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SssT);
-    mesh->o_SstT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SstT);
-    mesh->o_StrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), StrT);
-    mesh->o_StsT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), StsT);
-    mesh->o_SttT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SttT);
-
-    mesh->o_Smatrices = mesh->device.malloc(6 * mesh->Np * mesh->Np * sizeof(dfloat), ST);
-
-    free(DrstT);
-    free(ST);
-  } else if (mesh->Nverts == 8) {    // hardcoded for hexes
-    //lumped mass matrix
-    mesh->MM = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-    for (int k = 0; k < mesh->Nq; k++)
-      for (int j = 0; j < mesh->Nq; j++)
-        for (int i = 0; i < mesh->Nq; i++) {
-          int n = i + j * mesh->Nq + k * mesh->Nq * mesh->Nq;
-          mesh->MM[n + n * mesh->Np] = mesh->gllw[i] * mesh->gllw[j] * mesh->gllw[k];
-        }
-
-    mesh->LIFT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
-
-    dfloat* cubDWT = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
-    dfloat* cubProjectT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-    dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-    for(int n = 0; n < mesh->Nq; ++n)
-      for(int m = 0; m < mesh->cubNq; ++m) {
-        cubProjectT[n + m * mesh->Nq] = mesh->cubProject[n * mesh->cubNq + m];
-        cubInterpT[m + n * mesh->cubNq] = mesh->cubInterp[m * mesh->Nq + n];
-      }
-    for(int n = 0; n < mesh->cubNq; ++n)
-      for(int m = 0; m < mesh->cubNq; ++m)
-        cubDWT[n + m * mesh->cubNq] = mesh->cubDW[n * mesh->cubNq + m];
-
-    dfloat* LIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
-
-    mesh->o_LIFTT =
-      mesh->device.malloc(1 * sizeof(dfloat)); // dummy
-
-    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: before intX ");
-
-    mesh->intx = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp, sizeof(dfloat));
-    mesh->inty = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp, sizeof(dfloat));
-    mesh->intz = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp, sizeof(dfloat));
-
-    dfloat* ix = (dfloat*) calloc(mesh->cubNq * mesh->Nq,sizeof(dfloat));
-    dfloat* iy = (dfloat*) calloc(mesh->cubNq * mesh->Nq,sizeof(dfloat));
-    dfloat* iz = (dfloat*) calloc(mesh->cubNq * mesh->Nq,sizeof(dfloat));
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      for(int f = 0; f < mesh->Nfaces; ++f) {
-        //interpolate in i
-        for(int ny = 0; ny < mesh->Nq; ++ny)
-          for(int nx = 0; nx < mesh->cubNq; ++nx) {
-            ix[nx + mesh->cubNq * ny] = 0;
-            iy[nx + mesh->cubNq * ny] = 0;
-            iz[nx + mesh->cubNq * ny] = 0;
-
-            for(int m = 0; m < mesh->Nq; ++m) {
-              dlong vid = m + ny * mesh->Nq + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces;
-              dlong idM = mesh->vmapM[vid];
-
-              dfloat xm = mesh->x[idM];
-              dfloat ym = mesh->y[idM];
-              dfloat zm = mesh->z[idM];
-
-              dfloat Inm = mesh->cubInterp[m + nx * mesh->Nq];
-              ix[nx + mesh->cubNq * ny] += Inm * xm;
-              iy[nx + mesh->cubNq * ny] += Inm * ym;
-              iz[nx + mesh->cubNq * ny] += Inm * zm;
-            }
-          }
-
-        //interpolate in j and store
-        for(int ny = 0; ny < mesh->cubNq; ++ny)
-          for(int nx = 0; nx < mesh->cubNq; ++nx) {
-            dfloat x = 0.0, y = 0.0, z = 0.0;
-
-            for(int m = 0; m < mesh->Nq; ++m) {
-              dfloat xm = ix[nx + m * mesh->cubNq];
-              dfloat ym = iy[nx + m * mesh->cubNq];
-              dfloat zm = iz[nx + m * mesh->cubNq];
-
-              dfloat Inm = mesh->cubInterp[m + ny * mesh->Nq];
-              x += Inm * xm;
-              y += Inm * ym;
-              z += Inm * zm;
-            }
-
-            dlong id = nx + ny * mesh->cubNq + f * mesh->cubNfp + e * mesh->Nfaces * mesh->cubNfp;
-            mesh->intx[id] = x;
-            mesh->inty[id] = y;
-            mesh->intz[id] = z;
-          }
-      }
-    free(ix);
-    free(iy);
-    free(iz);
-
-    mesh->LMM = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
-    mesh->o_LMM =
-      mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat));
-    mesh->invLMM = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
-    mesh->o_invLMM =
-      mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat));
-
-    mesh->o_MM =
-      mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                          mesh->MM); //dummy
-
-    mesh->o_D = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D);
-
-    mesh->o_DW = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->DW);
-
-    mesh->o_Dmatrices = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D);
-
-    dfloat* DT = (dfloat*) calloc(mesh->Nq * mesh->Nq,sizeof(dfloat));
-    for(int j = 0; j < mesh->Nq; ++j)
-      for(int i = 0; i < mesh->Nq; ++i)
-        DT[i * mesh->Nq + j] = mesh->D[j * mesh->Nq + i];
-
-    mesh->o_Smatrices = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), DT); //dummy
-
-    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: before geofactors ");
-
-    mesh->o_vgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nvgeo * sizeof(dfloat),
-                          mesh->vgeo);
-
-    mesh->o_sgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nfp * mesh->Nsgeo * sizeof(dfloat),
-                          mesh->sgeo);
-
-    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: before vgeo,sgeo ");
-
-    mesh->o_ggeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(dfloat),
-                          mesh->ggeo);
-
-    mesh->o_cubvgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp * sizeof(dfloat),
-                          mesh->cubvgeo);
-
-    mesh->o_cubsgeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * mesh->Nsgeo *
-                          sizeof(dfloat),
-                          mesh->cubsgeo);
-
-    mesh->o_cubggeo =
-      mesh->device.malloc(mesh->Nelements * mesh->Nggeo * mesh->cubNp * sizeof(dfloat),
-                          mesh->cubggeo);
-
-    mesh->o_cubInterpT =
-      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                          cubInterpT);
-
-    mesh->o_cubProjectT =
-      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                          cubProjectT);
-
-    mesh->o_cubDWT =
-      mesh->device.malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat),
-                          cubDWT);
-
-    mesh->o_cubD =
-      mesh->device.malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat),
-                          mesh->cubD);
-
-    mesh->o_cubDWmatrices = mesh->device.malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat), cubDWT);
-
-    // just neeeded to combine quad and hex cub kernels
-    mesh->o_cubDiffInterpT = mesh->o_cubDWmatrices;
-
-    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: after geofactors ");
-
-    mesh->o_intx =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * sizeof(dfloat),
-                          mesh->intx);
-
-    mesh->o_inty =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * sizeof(dfloat),
-                          mesh->inty);
-
-    mesh->o_intz =
-      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * sizeof(dfloat),
-                          mesh->intz);
-
-    mesh->o_intInterpT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
-    mesh->o_intInterpT.copyFrom(mesh->o_cubInterpT);
-
-    mesh->o_intLIFTT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
-    mesh->o_intLIFTT.copyFrom(mesh->o_cubProjectT);
-
-    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: after intX ");
-  } else {
-    printf("Nverts = %d: unknown element type!\n",mesh->Nverts);
-  }
-
-  mesh->o_vmapM =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapM);
-
-  mesh->o_vmapP =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapP);
-
-  mesh->o_EToB =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),
-                        mesh->EToB);
-
-  mesh->o_x =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat), mesh->x);
-
-  mesh->o_y =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat), mesh->y);
-
-  mesh->o_z =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat), mesh->z);
-
-  if(mesh->totalHaloPairs > 0) {
-    // copy halo element list to DEVICE
-    mesh->o_haloElementList =
-      mesh->device.malloc(mesh->totalHaloPairs * sizeof(dlong), mesh->haloElementList);
-
-    // temporary DEVICE buffer for halo (maximum size Nfields*Np for dfloat)
-    //printf("mesh->Nfields = %d\n", mesh->Nfields);
-    mesh->o_haloBuffer =
-      mesh->device.malloc(mesh->totalHaloPairs * mesh->Np * mesh->Nfields * sizeof(dfloat));
-
-    mesh->o_haloGetNodeIds =
-      mesh->device.malloc(mesh->Nfp * mesh->totalHaloPairs * sizeof(dlong), mesh->haloGetNodeIds);
-
-    mesh->o_haloPutNodeIds =
-      mesh->device.malloc(mesh->Nfp * mesh->totalHaloPairs * sizeof(dlong), mesh->haloPutNodeIds);
-  }
-
-  kernelInfo["defines/" "p_dim"] = 3;
-  kernelInfo["defines/" "p_Nfields"] = mesh->Nfields;
-  kernelInfo["defines/" "p_N"] = mesh->N;
-  kernelInfo["defines/" "p_Nq"] = mesh->N + 1;
-  kernelInfo["defines/" "p_Np"] = mesh->Np;
-  kernelInfo["defines/" "p_Nfp"] = mesh->Nfp;
-  kernelInfo["defines/" "p_Nfaces"] = mesh->Nfaces;
-  kernelInfo["defines/" "p_NfacesNfp"] = mesh->Nfp * mesh->Nfaces;
-  kernelInfo["defines/" "p_Nvgeo"] = mesh->Nvgeo;
-  kernelInfo["defines/" "p_Nsgeo"] = mesh->Nsgeo;
-  kernelInfo["defines/" "p_Nggeo"] = mesh->Nggeo;
-
-  kernelInfo["defines/" "p_max_EL_nnz"] = mesh->max_EL_nnz; // for Bernstein Bezier lift
-
-  kernelInfo["defines/" "p_NXID"] = NXID;
-  kernelInfo["defines/" "p_NYID"] = NYID;
-  kernelInfo["defines/" "p_NZID"] = NZID;
-  kernelInfo["defines/" "p_SJID"] = SJID;
-  kernelInfo["defines/" "p_IJID"] = IJID;
-  kernelInfo["defines/" "p_IHID"] = IHID;
-  kernelInfo["defines/" "p_WSJID"] = WSJID;
-  kernelInfo["defines/" "p_WIJID"] = WIJID;
-  kernelInfo["defines/" "p_STXID"] = STXID;
-  kernelInfo["defines/" "p_STYID"] = STYID;
-  kernelInfo["defines/" "p_STZID"] = STZID;
-  kernelInfo["defines/" "p_SBXID"] = SBXID;
-  kernelInfo["defines/" "p_SBYID"] = SBYID;
-  kernelInfo["defines/" "p_SBZID"] = SBZID;
-
-  int maxNodes = mymax(mesh->Np, (mesh->Nfp * mesh->Nfaces));
-  kernelInfo["defines/" "p_maxNodes"] = maxNodes;
-
-#if 0
-  // TW: these should be defined at the solver setup
-  int NblockV = 256 / mesh->Np; // works for CUDA
-  kernelInfo["defines/" "p_NblockV"] = NblockV;
-
-  int NblockS = 256 / maxNodes; // works for CUDA
-  kernelInfo["defines/" "p_NblockS"] = NblockS;
-#endif
-
-  kernelInfo["defines/" "p_Lambda2"] = 0.5f;
-
-  kernelInfo["defines/" "p_cubNq"] = mesh->cubNq;
-  kernelInfo["defines/" "p_cubNfp"] = mesh->cubNfp;
-  kernelInfo["defines/" "p_cubNp"] = mesh->cubNp;
-  kernelInfo["defines/" "p_intNfp"] = mesh->intNfp;
-  kernelInfo["defines/" "p_intNfpNfaces"] = mesh->intNfp * mesh->Nfaces;
-
-  if(sizeof(dfloat) == 4) {
-    kernelInfo["defines/" "dfloat"] = "float";
-    kernelInfo["defines/" "dfloat4"] = "float4";
-    kernelInfo["defines/" "dfloat8"] = "float8";
-  }
-  if(sizeof(dfloat) == 8) {
-    kernelInfo["defines/" "dfloat"] = "double";
-    kernelInfo["defines/" "dfloat4"] = "double4";
-    kernelInfo["defines/" "dfloat8"] = "double8";
-  }
-
-  if(sizeof(dlong) == 4)
-    kernelInfo["defines/" "dlong"] = "int";
-  if(sizeof(dlong) == 8)
-    kernelInfo["defines/" "dlong"] = "long long int";
-
-  if(mesh->device.mode() == "CUDA") { // add backend compiler optimization for CUDA
-    kernelInfo["compiler_flags"] += "--ftz=true ";
-    kernelInfo["compiler_flags"] += "--prec-div=false ";
-    kernelInfo["compiler_flags"] += "--prec-sqrt=false ";
-    kernelInfo["compiler_flags"] += "--use_fast_math ";
-    kernelInfo["compiler_flags"] += "--fmad=true "; // compiler option for cuda
-    //kernelInfo["compiler_flags"] += "-Xptxas -dlcm=ca";
-  }
-
-  if(mesh->device.mode() == "OpenCL") { // add backend compiler optimization for OPENCL
-    kernelInfo["compiler_flags"] += " -cl-std=CL2.0 ";
-    kernelInfo["compiler_flags"] += " -cl-strict-aliasing ";
-    kernelInfo["compiler_flags"] += " -cl-mad-enable ";
-    kernelInfo["compiler_flags"] += " -cl-no-signed-zeros ";
-    kernelInfo["compiler_flags"] += " -cl-unsafe-math-optimizations ";
-    kernelInfo["compiler_flags"] += " -cl-fast-relaxed-math ";
-  }
-
-  if(mesh->device.mode() == "HIP") { // add backend compiler optimization for HIP
-    kernelInfo["compiler_flags"] += " -O3 ";
-    kernelInfo["compiler_flags"] += " -ffp-contract=fast ";
-    // kernelInfo["compiler_flags"] += " -funsafe-math-optimizations ";
-    // kernelInfo["compiler_flags"] += " -ffast-math ";
-  }
-
-  kernelInfo["defines/" "p_G00ID"] = G00ID;
-  kernelInfo["defines/" "p_G01ID"] = G01ID;
-  kernelInfo["defines/" "p_G02ID"] = G02ID;
-  kernelInfo["defines/" "p_G11ID"] = G11ID;
-  kernelInfo["defines/" "p_G12ID"] = G12ID;
-  kernelInfo["defines/" "p_G22ID"] = G22ID;
-  kernelInfo["defines/" "p_GWJID"] = GWJID;
-
-  kernelInfo["defines/" "p_RXID"] = RXID;
-  kernelInfo["defines/" "p_SXID"] = SXID;
-  kernelInfo["defines/" "p_TXID"] = TXID;
-
-  kernelInfo["defines/" "p_RYID"] = RYID;
-  kernelInfo["defines/" "p_SYID"] = SYID;
-  kernelInfo["defines/" "p_TYID"] = TYID;
-
-  kernelInfo["defines/" "p_RZID"] = RZID;
-  kernelInfo["defines/" "p_SZID"] = SZID;
-  kernelInfo["defines/" "p_TZID"] = TZID;
-
-  kernelInfo["defines/" "p_JID"] = JID;
-  kernelInfo["defines/" "p_JWID"] = JWID;
-  kernelInfo["defines/" "p_IJWID"] = IJWID;
-}
-
-void meshOccaSetup3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo)
-{
-  //make seperate stream for halo exchange
-  mesh->defaultStream = mesh->device.getStream();
-  mesh->dataStream = mesh->device.createStream();
-  mesh->computeStream = mesh->device.createStream();
-  mesh->device.setStream(mesh->defaultStream);
-
-  meshOccaPopulateDevice3D(mesh, newOptions, kernelInfo);
-}
-
-void meshOccaCloneDevice(mesh_t* donorMesh, mesh_t* mesh)
-{
-  mesh->device = donorMesh->device;
-
-  mesh->defaultStream = donorMesh->defaultStream;
-  mesh->dataStream = donorMesh->dataStream;
-  mesh->computeStream = donorMesh->computeStream;
-}
diff --git a/src/libP/src/meshOccaSetupQuad3D.c b/src/libP/src/meshOccaSetupQuad3D.c
deleted file mode 100644
index 568bcdf6a..000000000
--- a/src/libP/src/meshOccaSetupQuad3D.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh3D.h"
-
-void meshOccaSetupQuad3D(mesh_t* mesh, setupAide &newOptions, occa::properties &kernelInfo)
-{
-  //make seperate stream for halo exchange
-  mesh->defaultStream = mesh->device.getStream();
-  mesh->dataStream = mesh->device.createStream();
-  mesh->device.setStream(mesh->defaultStream);
-
-  // find elements that have all neighbors on this process
-  dlong* internalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-  dlong* notInternalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  dlong Ninterior = 0, NnotInterior = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    int flag = 0;
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      if(mesh->EToP[e * mesh->Nfaces + f] != -1)
-        flag = 1;
-    if(!flag)
-      internalElementIds[Ninterior++] = e;
-    else
-      notInternalElementIds[NnotInterior++] = e;
-  }
-
-  //printf("NinteriorElements = %d, NnotInternalElements = %d\n", Ninterior, NnotInterior);
-
-  mesh->NinternalElements = Ninterior;
-  mesh->NnotInternalElements = NnotInterior;
-  if(Ninterior)
-    mesh->o_internalElementIds    = mesh->device.malloc(Ninterior * sizeof(dlong),
-                                                        internalElementIds);
-
-  if(NnotInterior > 0)
-    mesh->o_notInternalElementIds = mesh->device.malloc(NnotInterior * sizeof(dlong),
-                                                        notInternalElementIds);
-
-  // OCCA allocate device memory (remember to go back for halo)
-  mesh->o_q =
-    mesh->device.malloc(
-      mesh->Np * (mesh->totalHaloPairs + mesh->Nelements) * mesh->Nfields * sizeof(dfloat),
-      mesh->q);
-  mesh->o_rhsq =
-    mesh->device.malloc(mesh->Np * mesh->Nelements * mesh->Nfields * sizeof(dfloat), mesh->rhsq);
-  mesh->o_resq =
-    mesh->device.malloc(mesh->Np * mesh->Nelements * mesh->Nfields * sizeof(dfloat), mesh->resq);
-
-  size_t bytes = mesh->device.memoryAllocated();
-  printf("bytes allocated: %lg\n", bytes / 1.e9);
-
-  //lumped mass matrix
-  mesh->MM = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  for (int j = 0; j < mesh->Nq; j++)
-    for (int i = 0; i < mesh->Nq; i++) {
-      int n = i + j * mesh->Nq;
-      mesh->MM[n + n * mesh->Np] = mesh->gllw[i] * mesh->gllw[j];
-    }
-
-  dfloat* cubDWT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-  dfloat* cubProjectT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-  dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
-  for(int n = 0; n < mesh->Nq; ++n)
-    for(int m = 0; m < mesh->cubNq; ++m) {
-      cubDWT[n + m * mesh->Nq] = mesh->cubDW[n * mesh->cubNq + m];
-      cubProjectT[n + m * mesh->Nq] = mesh->cubProject[n * mesh->cubNq + m];
-      cubInterpT[m + n * mesh->cubNq] = mesh->cubInterp[m * mesh->Nq + n];
-    }
-
-  mesh->intx = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq, sizeof(dfloat));
-  mesh->inty = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq, sizeof(dfloat));
-  mesh->intz = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq, sizeof(dfloat));
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      for(int n = 0; n < mesh->cubNq; ++n) {
-        dfloat ix = 0, iy = 0, iz = 0;
-        for(int m = 0; m < mesh->Nq; ++m) {
-          dlong vid = mesh->vmapM[m + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces];
-          dfloat xm = mesh->x[vid];
-          dfloat ym = mesh->y[vid];
-          dfloat zm = mesh->z[vid];
-
-          dfloat Inm = mesh->cubInterp[m + n * mesh->Nq];
-          ix += Inm * xm;
-          iy += Inm * ym;
-          iz += Inm * ym;
-        }
-        dlong id = n + f * mesh->cubNq + e * mesh->Nfaces * mesh->cubNq;
-        mesh->intx[id] = ix;
-        mesh->inty[id] = iy;
-        mesh->intz[id] = iz;
-      }
-
-  mesh->o_D = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D);
-
-  bytes = mesh->device.memoryAllocated();
-  printf("bytes allocated: %lg\n", bytes / 1.e9);
-
-  // bundle D and (W^{-1} D^t W)
-  dfloat* Dmatrices = (dfloat*) calloc(mesh->Nq * mesh->Nq * 2, sizeof(dfloat));
-  for(int n = 0; n < mesh->Nq * mesh->Nq; ++n)
-    Dmatrices[n] = mesh->D[n];
-  for(int j = 0; j < mesh->Nq; ++j)
-    for(int i = 0; i < mesh->Nq; ++i)
-      // note minus
-      Dmatrices[mesh->Nq * mesh->Nq + i + j * mesh->Nq] = -mesh->D[i * mesh->Nq + j] *
-                                                          mesh->gllw[i] / mesh->gllw[j];
-
-  mesh->o_Dmatrices = mesh->device.malloc(2 * mesh->Nq * mesh->Nq * sizeof(dfloat), Dmatrices);
-
-  free(Dmatrices);
-
-  mesh->o_Smatrices = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D); //dummy
-
-  mesh->o_vgeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->Np * sizeof(dfloat),
-                        mesh->vgeo);
-  mesh->o_sgeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nfp * mesh->Nsgeo * sizeof(dfloat),
-                        mesh->sgeo);
-  mesh->o_ggeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(dfloat),
-                        mesh->ggeo);
-
-  bytes = mesh->device.memoryAllocated();
-  printf("bytes allocated: %lg\n", bytes / 1.e9);
-
-  mesh->o_cubvgeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp * sizeof(dfloat),
-                        mesh->cubvgeo);
-
-  mesh->o_cubsgeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq * mesh->Nsgeo * sizeof(dfloat),
-                        mesh->cubsgeo);
-
-  mesh->o_cubInterpT =
-    mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                        cubInterpT);
-
-  mesh->o_cubProjectT =
-    mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                        cubProjectT);
-
-  mesh->o_cubDWT =
-    mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
-                        cubDWT);
-
-  mesh->o_cubDWmatrices = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat), cubDWT);
-
-  dfloat* LIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
-
-  mesh->o_LIFTT =
-    mesh->device.malloc(1 * sizeof(dfloat)); // dummy
-
-  mesh->o_intx =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq * sizeof(dfloat),
-                        mesh->intx);
-
-  mesh->o_inty =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq * sizeof(dfloat),
-                        mesh->inty);
-
-  mesh->o_intz =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNq * sizeof(dfloat),
-                        mesh->intz);
-
-  //dummy quadrature lifter operators
-  mesh->o_intInterpT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
-  mesh->o_intInterpT.copyFrom(mesh->o_cubInterpT);
-
-  mesh->o_intLIFTT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
-  mesh->o_intLIFTT.copyFrom(mesh->o_cubProjectT);
-
-  mesh->o_MM =
-    mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                        mesh->MM);
-
-  mesh->o_vmapM =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapM);
-
-  mesh->o_vmapP =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapP);
-
-  mesh->o_EToB =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),
-                        mesh->EToB);
-
-  mesh->o_x =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->x);
-
-  mesh->o_y =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->y);
-
-  // dummy z variables (note used y)
-  mesh->o_z =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->z);
-
-  if(mesh->totalHaloPairs > 0) {
-    // copy halo element list to DEVICE
-    mesh->o_haloElementList =
-      mesh->device.malloc(mesh->totalHaloPairs * sizeof(dlong), mesh->haloElementList);
-
-    // temporary DEVICE buffer for halo (maximum size Nfields*Np for dfloat)
-    mesh->o_haloBuffer =
-      mesh->device.malloc(mesh->totalHaloPairs * mesh->Np * mesh->Nfields * sizeof(dfloat));
-  }
-
-  //-------------------------------------
-  // NBN: 2 streams for async MPI updates
-  // {Vol, Surf, update}  run on q[0]
-  // {halo-get, copy} run on q[1]
-  //-------------------------------------
-  mesh->stream0 = mesh->device.getStream();
-#ifdef USE_2_STREAMS
-  mesh->stream1 = mesh->device.createStream();  // NBN: second stream
-#else
-  mesh->stream1 = mesh->stream0;                // NBN: stream1 == stream0
-#endif
-  mesh->device.setStream(mesh->stream0);
-  //-------------------------------------
-
-  kernelInfo["defines/" "p_Nfields"] = mesh->Nfields;
-  kernelInfo["defines/" "p_N"] = mesh->N;
-  kernelInfo["defines/" "p_Nq"] = mesh->N + 1;
-  kernelInfo["defines/" "p_Np"] = mesh->Np;
-  kernelInfo["defines/" "p_Nfp"] = mesh->Nfp;
-  kernelInfo["defines/" "p_Nfaces"] = mesh->Nfaces;
-  kernelInfo["defines/" "p_NfacesNfp"] = mesh->Nfp * mesh->Nfaces;
-  kernelInfo["defines/" "p_Nvgeo"] = mesh->Nvgeo;
-  kernelInfo["defines/" "p_Nsgeo"] = mesh->Nsgeo;
-  kernelInfo["defines/" "p_Nggeo"] = mesh->Nggeo;
-
-  kernelInfo["defines/" "p_NXID"] = NXID;
-  kernelInfo["defines/" "p_NYID"] = NYID;
-  kernelInfo["defines/" "p_NZID"] = NZID;
-  kernelInfo["defines/" "p_SJID"] = SJID;
-  kernelInfo["defines/" "p_IJID"] = IJID;
-  kernelInfo["defines/" "p_IHID"] = IHID;
-  kernelInfo["defines/" "p_WIJID"] = WIJID;
-  kernelInfo["defines/" "p_WSJID"] = WSJID;
-
-  kernelInfo["defines/" "p_max_EL_nnz"] = mesh->max_EL_nnz; // for Bernstein Bezier lift
-
-  kernelInfo["defines/" "p_cubNq"] = mesh->cubNq;
-  kernelInfo["defines/" "p_cubNp"] = mesh->cubNp;
-  kernelInfo["defines/" "p_intNfp"] = mesh->intNfp;
-  kernelInfo["defines/" "p_intNfpNfaces"] = mesh->intNfp * mesh->Nfaces;
-
-  if(sizeof(dfloat) == 4) {
-    kernelInfo["defines/" "dfloat"] = "float";
-    kernelInfo["defines/" "dfloat2"] = "float2";
-    kernelInfo["defines/" "dfloat4"] = "float4";
-    kernelInfo["defines/" "dfloat8"] = "float8";
-  }
-  if(sizeof(dfloat) == 8) {
-    kernelInfo["defines/" "dfloat"] = "double";
-    kernelInfo["defines/" "dfloat2"] = "double2";
-    kernelInfo["defines/" "dfloat4"] = "double4";
-    kernelInfo["defines/" "dfloat8"] = "double8";
-  }
-
-  if(sizeof(dlong) == 4)
-    kernelInfo["defines/" "dlong"] = "int";
-  if(sizeof(dlong) == 8)
-    kernelInfo["defines/" "dlong"] = "long long int";
-
-  if(mesh->device.mode() == "CUDA") { // add backend compiler optimization for CUDA
-    kernelInfo["compiler_flags"] += " --ftz=true ";
-    kernelInfo["compiler_flags"] += " --prec-div=false ";
-    kernelInfo["compiler_flags"] += " --prec-sqrt=false ";
-    kernelInfo["compiler_flags"] += " --use_fast_math ";
-    kernelInfo["compiler_flags"] += " --fmad=true "; // compiler option for cuda
-  }
-
-  kernelInfo["defines/" "p_G00ID"] = G00ID;
-  kernelInfo["defines/" "p_G01ID"] = G01ID;
-  kernelInfo["defines/" "p_G02ID"] = G02ID;
-  kernelInfo["defines/" "p_G11ID"] = G11ID;
-  kernelInfo["defines/" "p_G12ID"] = G12ID;
-  kernelInfo["defines/" "p_G22ID"] = G22ID;
-  kernelInfo["defines/" "p_GWJID"] = GWJID;
-
-  kernelInfo["defines/" "p_RXID"] = RXID;
-  kernelInfo["defines/" "p_SXID"] = SXID;
-  kernelInfo["defines/" "p_TXID"] = TXID;
-
-  kernelInfo["defines/" "p_RYID"] = RYID;
-  kernelInfo["defines/" "p_SYID"] = SYID;
-  kernelInfo["defines/" "p_TYID"] = TYID;
-
-  kernelInfo["defines/" "p_RZID"] = RZID;
-  kernelInfo["defines/" "p_SZID"] = SZID;
-  kernelInfo["defines/" "p_TZID"] = TZID;
-
-  kernelInfo["defines/" "p_JID"] = JID;
-  kernelInfo["defines/" "p_JWID"] = JWID;
-  kernelInfo["defines/" "p_IJWID"] = IJWID;
-}
diff --git a/src/libP/src/meshOccaSetupTri3D.c b/src/libP/src/meshOccaSetupTri3D.c
deleted file mode 100644
index 88a0fc2c2..000000000
--- a/src/libP/src/meshOccaSetupTri3D.c
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh3D.h"
-
-void meshOccaSetupTri3D(mesh_t* mesh, setupAide &newOptions, occa::properties &kernelInfo)
-{
-  //make seperate stream for halo exchange
-  mesh->defaultStream = mesh->device.getStream();
-  mesh->dataStream = mesh->device.createStream();
-  mesh->device.setStream(mesh->defaultStream);
-
-  // find elements that have all neighbors on this process
-  dlong* internalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-  dlong* notInternalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
-
-  dlong Ninterior = 0, NnotInterior = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    int flag = 0;
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      if(mesh->EToP[e * mesh->Nfaces + f] != -1)
-        flag = 1;
-    if(!flag)
-      internalElementIds[Ninterior++] = e;
-    else
-      notInternalElementIds[NnotInterior++] = e;
-  }
-
-  //printf("NinteriorElements = %d, NnotInternalElements = %d\n", Ninterior, NnotInterior);
-
-  mesh->NinternalElements = Ninterior;
-  mesh->NnotInternalElements = NnotInterior;
-  if(Ninterior)
-    mesh->o_internalElementIds    = mesh->device.malloc(Ninterior * sizeof(dlong),
-                                                        internalElementIds);
-
-  if(NnotInterior > 0)
-    mesh->o_notInternalElementIds = mesh->device.malloc(NnotInterior * sizeof(dlong),
-                                                        notInternalElementIds);
-
-  // OCCA allocate device memory (remember to go back for halo)
-  mesh->o_q =
-    mesh->device.malloc(
-      mesh->Np * (mesh->totalHaloPairs + mesh->Nelements) * mesh->Nfields * sizeof(dfloat),
-      mesh->q);
-  mesh->o_rhsq =
-    mesh->device.malloc(mesh->Np * mesh->Nelements * mesh->Nfields * sizeof(dfloat), mesh->rhsq);
-  mesh->o_resq =
-    mesh->device.malloc(mesh->Np * mesh->Nelements * mesh->Nfields * sizeof(dfloat), mesh->resq);
-
-  // build Dr, Ds, LIFT transposes
-  dfloat* DrT = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  dfloat* DsT = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Np; ++m) {
-      DrT[n + m * mesh->Np] = mesh->Dr[n * mesh->Np + m];
-      DsT[n + m * mesh->Np] = mesh->Ds[n * mesh->Np + m];
-    }
-
-  // build Dr, Ds transposes
-  dfloat* DrsT = (dfloat*) calloc(2 * mesh->Np * mesh->Np, sizeof(dfloat));
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Np; ++m) {
-      DrsT[n + m * mesh->Np] = mesh->Dr[n * mesh->Np + m];
-      DrsT[n + m * mesh->Np + mesh->Np * mesh->Np] = mesh->Ds[n * mesh->Np + m];
-    }
-
-  dfloat* LIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Nfaces * mesh->Nfp; ++m)
-      LIFTT[n + m * mesh->Np] = mesh->LIFT[n * mesh->Nfp * mesh->Nfaces + m];
-
-  // build volume cubature matrix transposes
-  int cubNpBlocked = mesh->Np * ((mesh->cubNp + mesh->Np - 1) / mesh->Np);
-  dfloat* cubDrWT = (dfloat*) calloc(cubNpBlocked * mesh->Np, sizeof(dfloat));
-  dfloat* cubDsWT = (dfloat*) calloc(cubNpBlocked * mesh->Np, sizeof(dfloat));
-  dfloat* cubDrsWT = (dfloat*) calloc(2 * mesh->cubNp * mesh->Np, sizeof(dfloat));
-  dfloat* cubProjectT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-  dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNp * mesh->Np, sizeof(dfloat));
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->cubNp; ++m) {
-      cubDrWT[n + m * mesh->Np] = mesh->cubDrW[n * mesh->cubNp + m];
-      cubDsWT[n + m * mesh->Np] = mesh->cubDsW[n * mesh->cubNp + m];
-
-      cubDrsWT[n + m * mesh->Np] = mesh->cubDrW[n * mesh->cubNp + m];
-      cubDrsWT[n + m * mesh->Np + mesh->cubNp * mesh->Np] = mesh->cubDsW[n * mesh->cubNp + m];
-
-      cubProjectT[n + m * mesh->Np] = mesh->cubProject[n * mesh->cubNp + m];
-      cubInterpT[m + n * mesh->cubNp] = mesh->cubInterp[m * mesh->Np + n];
-    }
-
-  // build surface integration matrix transposes
-  dfloat* intLIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-  dfloat* intInterpT = (dfloat*) calloc(mesh->Nfp * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Nfaces * mesh->intNfp; ++m)
-      intLIFTT[n + m * mesh->Np] = mesh->intLIFT[n * mesh->intNfp * mesh->Nfaces + m];
-  for(int n = 0; n < mesh->intNfp * mesh->Nfaces; ++n)
-    for(int m = 0; m < mesh->Nfp; ++m)
-      intInterpT[n + m * mesh->Nfaces * mesh->intNfp] = mesh->intInterp[n * mesh->Nfp + m];
-
-  // =============== BB operators [added by JC] ===============
-  // deriv operators: transpose from row major to column major
-  int* D1ids = (int*) calloc(mesh->Np * 3,sizeof(int));
-  int* D2ids = (int*) calloc(mesh->Np * 3,sizeof(int));
-  int* D3ids = (int*) calloc(mesh->Np * 3,sizeof(int));
-  dfloat* Dvals = (dfloat*) calloc(mesh->Np * 3,sizeof(dfloat));
-
-  dfloat* VBq = (dfloat*) calloc(mesh->Np * mesh->cubNp,sizeof(dfloat));
-  dfloat* PBq = (dfloat*) calloc(mesh->Np * mesh->cubNp,sizeof(dfloat));
-
-  dfloat* L0vals = (dfloat*) calloc(mesh->Nfp * 3,sizeof(dfloat)); // tridiag
-  int* ELids = (int*) calloc(1 + mesh->Np * mesh->max_EL_nnz,sizeof(int));
-  dfloat* ELvals = (dfloat*) calloc(1 + mesh->Np * mesh->max_EL_nnz,sizeof(dfloat));
-
-  for (int i = 0; i < mesh->Np; ++i)
-    for (int j = 0; j < 3; ++j) {
-      D1ids[i + j * mesh->Np] = mesh->D1ids[j + i * 3];
-      D2ids[i + j * mesh->Np] = mesh->D2ids[j + i * 3];
-      D3ids[i + j * mesh->Np] = mesh->D3ids[j + i * 3];
-      Dvals[i + j * mesh->Np] = mesh->Dvals[j + i * 3];
-    }
-
-  for (int i = 0; i < mesh->cubNp; ++i)
-    for (int j = 0; j < mesh->Np; ++j) {
-      VBq[i + j * mesh->cubNp] = mesh->VBq[j + i * mesh->Np];
-      PBq[j + i * mesh->Np] = mesh->PBq[i + j * mesh->cubNp];
-    }
-
-
-  for (int i = 0; i < mesh->Nfp; ++i)
-    for (int j = 0; j < 3; ++j)
-      L0vals[i + j * mesh->Nfp] = mesh->L0vals[j + i * 3];
-
-  for (int i = 0; i < mesh->Np; ++i)
-    for (int j = 0; j < mesh->max_EL_nnz; ++j) {
-      ELids[i + j * mesh->Np] = mesh->ELids[j + i * mesh->max_EL_nnz];
-      ELvals[i + j * mesh->Np] = mesh->ELvals[j + i * mesh->max_EL_nnz]; // ???
-    }
-
-  //BB mass matrix
-  mesh->BBMM = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-  for (int n = 0; n < mesh->Np; ++n)
-    for (int m = 0; m < mesh->Np; ++m)
-      for (int i = 0; i < mesh->Np; ++i)
-        for (int j = 0; j < mesh->Np; ++j)
-          mesh->BBMM[n + m * mesh->Np] += mesh->VB[m + j * mesh->Np] * mesh->MM[i + j * mesh->Np] *
-                                          mesh->VB[n + i * mesh->Np];
-
-  // =============== end BB stuff =============================
-
-  //build element stiffness matrices
-  dfloat* SrrT, * SrsT, * SsrT, * SssT;
-  if (mesh->Nverts == 3) {
-    mesh->Srr = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Srs = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Ssr = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    mesh->Sss = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    for (int n = 0; n < mesh->Np; n++)
-      for (int m = 0; m < mesh->Np; m++)
-        for (int k = 0; k < mesh->Np; k++)
-          for (int l = 0; l < mesh->Np; l++) {
-            mesh->Srr[m + n * mesh->Np] += mesh->Dr[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dr[m + k * mesh->Np];
-            mesh->Srs[m + n * mesh->Np] += mesh->Dr[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Ds[m + k * mesh->Np];
-            mesh->Ssr[m + n * mesh->Np] += mesh->Ds[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Dr[m + k * mesh->Np];
-            mesh->Sss[m + n * mesh->Np] += mesh->Ds[n + l * mesh->Np] * mesh->MM[k + l * mesh->Np] *
-                                           mesh->Ds[m + k * mesh->Np];
-          }
-    SrrT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SrsT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SsrT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    SssT = (dfloat*) calloc(mesh->Np * mesh->Np,sizeof(dfloat));
-    for (int n = 0; n < mesh->Np; n++)
-      for (int m = 0; m < mesh->Np; m++) {
-        SrrT[m + n * mesh->Np] = mesh->Srr[n + m * mesh->Np];
-        SrsT[m + n * mesh->Np] = mesh->Srs[n + m * mesh->Np];
-        SsrT[m + n * mesh->Np] = mesh->Ssr[n + m * mesh->Np];
-        SssT[m + n * mesh->Np] = mesh->Sss[n + m * mesh->Np];
-      }
-  }
-
-  dfloat* ST = (dfloat*) calloc(3 * mesh->Np * mesh->Np, sizeof(dfloat));
-  for(int n = 0; n < mesh->Np; ++n)
-    for(int m = 0; m < mesh->Np; ++m) {
-      ST[n + m * mesh->Np + 0 * mesh->Np * mesh->Np] = mesh->Srr[n * mesh->Np + m];
-      ST[n + m * mesh->Np + 1 * mesh->Np * mesh->Np] = mesh->Srs[n * mesh->Np + m] +
-                                                       mesh->Ssr[n * mesh->Np + m];
-      ST[n + m * mesh->Np + 2 * mesh->Np * mesh->Np] = mesh->Sss[n * mesh->Np + m];
-    }
-
-  mesh->intx = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-  mesh->inty = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp, sizeof(dfloat));
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      for(int n = 0; n < mesh->intNfp; ++n) {
-        dfloat ix = 0, iy = 0;
-        for(int m = 0; m < mesh->Nfp; ++m) {
-          dlong vid = mesh->vmapM[m + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces];
-          dfloat xm = mesh->x[vid];
-          dfloat ym = mesh->y[vid];
-          //dfloat Inm = mesh->intInterp[n+f*mesh->intNfp+m*mesh->intNfp*mesh->Nfaces];
-          dfloat Inm = mesh->intInterp[m + n * mesh->Nfp + f * mesh->intNfp * mesh->Nfp]; // Fixed
-          ix += Inm * xm;
-          iy += Inm * ym;
-        }
-        dlong id = n + f * mesh->intNfp + e * mesh->Nfaces * mesh->intNfp;
-        mesh->intx[id] = ix;
-        mesh->inty[id] = iy;
-      }
-
-  mesh->o_Dr = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                   mesh->Dr);
-
-  mesh->o_Ds = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                   mesh->Ds);
-
-  mesh->o_DrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                    DrT);
-
-  mesh->o_DsT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                    DsT);
-
-  mesh->o_DtT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                                    DsT); // note: dummy allocated with DsT
-
-  mesh->o_Dmatrices = mesh->device.malloc(2 * mesh->Np * mesh->Np * sizeof(dfloat), DrsT);
-
-  mesh->o_LIFT =
-    mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->Nfp * sizeof(dfloat),
-                        mesh->LIFT);
-
-  mesh->o_LIFTT =
-    mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->Nfp * sizeof(dfloat),
-                        LIFTT);
-
-  mesh->o_vgeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * sizeof(dfloat),
-                        mesh->vgeo);
-
-  mesh->o_cubvgeo =   mesh->device.malloc(sizeof(dfloat));// dummy
-
-  mesh->o_sgeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nsgeo * sizeof(dfloat),
-                        mesh->sgeo);
-
-  mesh->o_ggeo =
-    mesh->device.malloc(mesh->Nelements * mesh->Nggeo * sizeof(dfloat),
-                        mesh->ggeo);
-
-  mesh->o_cubsgeo = mesh->o_sgeo; //dummy cubature geo factors
-
-  mesh->o_SrrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SrrT);
-  mesh->o_SrsT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SrsT);
-  mesh->o_SsrT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SsrT);
-  mesh->o_SssT = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat), SssT);
-  mesh->o_Smatrices = mesh->device.malloc(3 * mesh->Np * mesh->Np * sizeof(dfloat), ST);
-
-  mesh->o_D1ids = mesh->device.malloc(mesh->Np * 3 * sizeof(int),D1ids);
-  mesh->o_D2ids = mesh->device.malloc(mesh->Np * 3 * sizeof(int),D2ids);
-  mesh->o_D3ids = mesh->device.malloc(mesh->Np * 3 * sizeof(int),D3ids);
-  mesh->o_Dvals = mesh->device.malloc(mesh->Np * 3 * sizeof(dfloat),Dvals);
-
-  mesh->o_BBMM = mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),mesh->BBMM);
-
-  mesh->o_VBq = mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),VBq);
-  mesh->o_PBq = mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),PBq);
-
-  mesh->o_L0vals = mesh->device.malloc(mesh->Nfp * 3 * sizeof(dfloat),L0vals);
-  mesh->o_ELids =
-    mesh->device.malloc(mesh->Np * mesh->max_EL_nnz * sizeof(int),ELids);
-  mesh->o_ELvals =
-    mesh->device.malloc(mesh->Np * mesh->max_EL_nnz * sizeof(dfloat),ELvals);
-
-  mesh->o_cubInterpT =
-    mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                        cubInterpT);
-
-  mesh->o_cubProjectT =
-    mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                        cubProjectT);
-
-  mesh->o_cubDrWT =
-    mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                        cubDrWT);
-
-  mesh->o_cubDsWT =
-    mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                        cubDsWT);
-
-  mesh->o_cubDtWT =
-    mesh->device.malloc(mesh->Np * mesh->cubNp * sizeof(dfloat),
-                        cubDsWT); // dummy to align with 3d
-
-  mesh->o_cubDWmatrices =
-    mesh->device.malloc(2 * mesh->cubNp * mesh->Np * sizeof(dfloat), cubDrsWT);
-
-  mesh->o_intInterpT =
-    mesh->device.malloc(mesh->Nfp * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                        intInterpT);
-
-  mesh->o_intLIFTT =
-    mesh->device.malloc(mesh->Np * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                        intLIFTT);
-
-  mesh->o_intx =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                        mesh->intx);
-
-  mesh->o_inty =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                        mesh->inty);
-
-  mesh->o_intz =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->intNfp * sizeof(dfloat),
-                        mesh->inty); // dummy to align with 3d
-
-  free(DrsT);
-  free(ST);
-
-  mesh->o_MM =
-    mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
-                        mesh->MM);
-
-  mesh->o_vmapM =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapM);
-
-  mesh->o_vmapP =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
-                        mesh->vmapP);
-
-  mesh->o_EToB =
-    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),
-                        mesh->EToB);
-
-  mesh->o_x =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->x);
-
-  mesh->o_y =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->y);
-
-  // dummy z variables (note used y)
-  mesh->o_z =
-    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat),
-                        mesh->z);
-
-  if(mesh->totalHaloPairs > 0) {
-    // copy halo element list to DEVICE
-    mesh->o_haloElementList =
-      mesh->device.malloc(mesh->totalHaloPairs * sizeof(dlong), mesh->haloElementList);
-
-    // temporary DEVICE buffer for halo (maximum size Nfields*Np for dfloat)
-    mesh->o_haloBuffer =
-      mesh->device.malloc(mesh->totalHaloPairs * mesh->Np * mesh->Nfields * sizeof(dfloat));
-  }
-
-  //-------------------------------------
-  // NBN: 2 streams for async MPI updates
-  // {Vol, Surf, update}  run on q[0]
-  // {halo-get, copy} run on q[1]
-  //-------------------------------------
-  mesh->stream0 = mesh->device.getStream();
-#ifdef USE_2_STREAMS
-  mesh->stream1 = mesh->device.createStream();  // NBN: second stream
-#else
-  mesh->stream1 = mesh->stream0;                // NBN: stream1 == stream0
-#endif
-  mesh->device.setStream(mesh->stream0);
-  //-------------------------------------
-
-  kernelInfo["defines/" "p_Nfields"] = mesh->Nfields;
-  kernelInfo["defines/" "p_N"] = mesh->N;
-  kernelInfo["defines/" "p_Nq"] = mesh->N + 1;
-  kernelInfo["defines/" "p_Np"] = mesh->Np;
-  kernelInfo["defines/" "p_Nfp"] = mesh->Nfp;
-  kernelInfo["defines/" "p_Nfaces"] = mesh->Nfaces;
-  kernelInfo["defines/" "p_NfacesNfp"] = mesh->Nfp * mesh->Nfaces;
-  kernelInfo["defines/" "p_Nvgeo"] = mesh->Nvgeo;
-  kernelInfo["defines/" "p_Nsgeo"] = mesh->Nsgeo;
-  kernelInfo["defines/" "p_Nggeo"] = mesh->Nggeo;
-
-  kernelInfo["defines/" "p_NXID"] = NXID;
-  kernelInfo["defines/" "p_NYID"] = NYID;
-  kernelInfo["defines/" "p_NZID"] = NZID;
-  kernelInfo["defines/" "p_SJID"] = SJID;
-  kernelInfo["defines/" "p_IJID"] = IJID;
-  kernelInfo["defines/" "p_IHID"] = IHID;
-  kernelInfo["defines/" "p_WIJID"] = WIJID;
-  kernelInfo["defines/" "p_WSJID"] = WSJID;
-
-  kernelInfo["defines/" "p_max_EL_nnz"] = mesh->max_EL_nnz; // for Bernstein Bezier lift
-
-  kernelInfo["defines/" "p_cubNq"] = mesh->cubNq;
-  kernelInfo["defines/" "p_cubNp"] = mesh->cubNp;
-  kernelInfo["defines/" "p_intNfp"] = mesh->intNfp;
-  kernelInfo["defines/" "p_intNfpNfaces"] = mesh->intNfp * mesh->Nfaces;
-
-  if(sizeof(dfloat) == 4) {
-    kernelInfo["defines/" "dfloat"] = "float";
-    kernelInfo["defines/" "dfloat2"] = "float2";
-    kernelInfo["defines/" "dfloat4"] = "float4";
-    kernelInfo["defines/" "dfloat8"] = "float8";
-  }
-  if(sizeof(dfloat) == 8) {
-    kernelInfo["defines/" "dfloat"] = "double";
-    kernelInfo["defines/" "dfloat2"] = "double2";
-    kernelInfo["defines/" "dfloat4"] = "double4";
-    kernelInfo["defines/" "dfloat8"] = "double8";
-  }
-
-  if(sizeof(dlong) == 4)
-    kernelInfo["defines/" "dlong"] = "int";
-  if(sizeof(dlong) == 8)
-    kernelInfo["defines/" "dlong"] = "long long int";
-
-  if(mesh->device.mode() == "CUDA") { // add backend compiler optimization for CUDA
-    kernelInfo["compiler_flags"] += " --ftz=true ";
-    kernelInfo["compiler_flags"] += " --prec-div=false ";
-    kernelInfo["compiler_flags"] += " --prec-sqrt=false ";
-    kernelInfo["compiler_flags"] += " --use_fast_math ";
-    kernelInfo["compiler_flags"] += " --fmad=true "; // compiler option for cuda
-  }
-
-  kernelInfo["defines/" "p_G00ID"] = G00ID;
-  kernelInfo["defines/" "p_G01ID"] = G01ID;
-  kernelInfo["defines/" "p_G11ID"] = G11ID;
-  kernelInfo["defines/" "p_GWJID"] = GWJID;
-
-  kernelInfo["defines/" "p_RXID"] = RXID;
-  kernelInfo["defines/" "p_SXID"] = SXID;
-  kernelInfo["defines/" "p_TXID"] = TXID;
-
-  kernelInfo["defines/" "p_RYID"] = RYID;
-  kernelInfo["defines/" "p_SYID"] = SYID;
-  kernelInfo["defines/" "p_TYID"] = TYID;
-
-  kernelInfo["defines/" "p_RZID"] = RZID;
-  kernelInfo["defines/" "p_SZID"] = SZID;
-  kernelInfo["defines/" "p_TZID"] = TZID;
-
-  kernelInfo["defines/" "p_JID"] = JID;
-  kernelInfo["defines/" "p_JWID"] = JWID;
-  kernelInfo["defines/" "p_IJWID"] = IJWID;
-}
diff --git a/src/libP/src/meshParallelConnectNodes.c b/src/libP/src/meshParallelConnectNodes.c
deleted file mode 100644
index 15bce9ae3..000000000
--- a/src/libP/src/meshParallelConnectNodes.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-
-#include "mesh.h"
-
-typedef struct
-{
-  int baseRank;
-  hlong baseId;
-}parallelNode_t;
-
-// uniquely label each node with a global index, used for gatherScatter
-void meshParallelConnectNodes(mesh_t* mesh)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  dlong localNodeCount = mesh->Np * mesh->Nelements;
-  dlong* allLocalNodeCounts = (dlong*) calloc(size, sizeof(dlong));
-
-  MPI_Allgather(&localNodeCount,    1, MPI_DLONG,
-                allLocalNodeCounts, 1, MPI_DLONG,
-                mesh->comm);
-
-  hlong gatherNodeStart = 0;
-  for(int r = 0; r < rank; ++r)
-    gatherNodeStart += allLocalNodeCounts[r];
-
-  free(allLocalNodeCounts);
-
-  // form continuous node numbering (local=>virtual gather)
-  parallelNode_t* localNodes =
-    (parallelNode_t*) calloc((mesh->totalHaloPairs + mesh->Nelements) * mesh->Np,
-                             sizeof(parallelNode_t));
-
-  // use local numbering
-  for(dlong e = 0; e < mesh->Nelements; ++e) {
-    for(int n = 0; n < mesh->Np; ++n) {
-      dlong id = e * mesh->Np + n;
-
-      localNodes[id].baseRank = rank;
-      localNodes[id].baseId = 1 + id + mesh->Nnodes + gatherNodeStart;
-    }
-
-    // use vertex ids for vertex nodes to reduce iterations
-    for(int v = 0; v < mesh->Nverts; ++v) {
-      dlong id = e * mesh->Np + mesh->vertexNodes[v];
-      hlong gid = mesh->EToV[e * mesh->Nverts + v] + 1;
-      localNodes[id].baseId = gid;
-    }
-  }
-
-  dlong localChange = 0, gatherChange = 1;
-
-  parallelNode_t* sendBuffer =
-    (parallelNode_t*) calloc(mesh->totalHaloPairs * mesh->Np, sizeof(parallelNode_t));
-
-  // keep comparing numbers on positive and negative traces until convergence
-  while(gatherChange > 0) {
-    // reset change counter
-    localChange = 0;
-
-    // send halo data and recv into extension of buffer
-    meshHaloExchange(mesh, mesh->Np * sizeof(parallelNode_t),
-                     localNodes, sendBuffer, localNodes + localNodeCount);
-
-    // compare trace nodes
-    for(dlong e = 0; e < mesh->Nelements; ++e)
-      for(int n = 0; n < mesh->Nfp * mesh->Nfaces; ++n) {
-        dlong id  = e * mesh->Nfp * mesh->Nfaces + n;
-        dlong idM = mesh->vmapM[id];
-        dlong idP = mesh->vmapP[id];
-        hlong gidM = localNodes[idM].baseId;
-        hlong gidP = localNodes[idP].baseId;
-
-        int baseRankM = localNodes[idM].baseRank;
-        int baseRankP = localNodes[idP].baseRank;
-
-        if(gidM < gidP || (gidP == gidM && baseRankM < baseRankP)) {
-          ++localChange;
-          localNodes[idP].baseRank    = localNodes[idM].baseRank;
-          localNodes[idP].baseId      = localNodes[idM].baseId;
-        }
-
-        if(gidP < gidM || (gidP == gidM && baseRankP < baseRankM)) {
-          ++localChange;
-          localNodes[idM].baseRank    = localNodes[idP].baseRank;
-          localNodes[idM].baseId      = localNodes[idP].baseId;
-        }
-      }
-
-    // sum up changes
-    MPI_Allreduce(&localChange, &gatherChange, 1, MPI_DLONG, MPI_SUM, mesh->comm);
-  }
-
-  //make a locally-ordered version
-  mesh->globalIds = (hlong*) calloc(localNodeCount, sizeof(hlong));
-  for(dlong id = 0; id < localNodeCount; ++id)
-    mesh->globalIds[id] = localNodes[id].baseId;
-
-  free(localNodes);
-  free(sendBuffer);
-}
diff --git a/src/libP/src/meshParallelPrint2D.c b/src/libP/src/meshParallelPrint2D.c
deleted file mode 100644
index 218d3eafb..000000000
--- a/src/libP/src/meshParallelPrint2D.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "mpi.h"
-#include "mesh2D.h"
-
-void meshParallelPrint2D(mesh2D* mesh)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  printf("rank %d: Nelements=" dlongFormat " Nnodes=" hlongFormat "\n",
-         rank, mesh->Nelements, mesh->Nnodes);
-
-#if 0
-  printf("EToV:\n");
-  for(int e = 0; e < mesh->Nelements; ++e)
-    printf("%d %d %d\n",
-           mesh->EToV[e * mesh->Nverts + 0],
-           mesh->EToV[e * mesh->Nverts + 1],
-           mesh->EToV[e * mesh->Nverts + 2]);
-
-#endif
-
-  dlong* otherNelements = (dlong*) calloc(size, sizeof(dlong));
-  MPI_Allgather(&(mesh->Nelements), 1, MPI_DLONG,
-                otherNelements, 1, MPI_DLONG,
-                mesh->comm);
-
-  hlong* elementStarts = (hlong*) calloc(size, sizeof(hlong));
-  for(int r = 1; r < size; ++r)
-    elementStarts[r] = elementStarts[r - 1] + otherNelements[r - 1];
-
-  for(int r1 = 0; r1 < size; ++r1) {
-    MPI_Barrier(mesh->comm);
-    if(rank == r1) {
-      fflush(stdout);
-      if(r1 == 0)
-        printf("EToE:\n");
-      for(dlong e1 = 0; e1 < mesh->Nelements; ++e1) {
-        dlong id = e1 * mesh->Nfaces;
-        for(int f1 = 0; f1 < mesh->Nfaces; ++f1) {
-          hlong e2 = (hlong) mesh->EToE[id + f1];
-          int f2 = mesh->EToF[id + f1];
-          int r2 = mesh->EToP[id + f1];
-          if(e2 == -1 || f2 == -1) {
-            printf("(" hlongFormat " %d )=>X (" hlongFormat ", %d )\n",
-                   e1 + elementStarts[r1], f1, e2, f2);
-          }else{
-            if(r2 != -1)
-              e2 += elementStarts[r2];
-            else
-              e2 += elementStarts[r1];
-
-            printf("(" hlongFormat " %d )=>(" hlongFormat " %d )\n",
-                   e1 + elementStarts[r1], f1, e2, f2);
-          }
-        }
-        fflush(stdout);
-      }
-    }
-    MPI_Barrier(mesh->comm);
-  }
-  free(otherNelements);
-  free(elementStarts);
-}
diff --git a/src/libP/src/meshParallelPrint3D.c b/src/libP/src/meshParallelPrint3D.c
deleted file mode 100644
index 617337ca3..000000000
--- a/src/libP/src/meshParallelPrint3D.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "mesh3D.h"
-
-void meshParallelPrint3D(mesh3D* mesh)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  printf("rank %d: Nelements=" dlongFormat " Nnodes=" hlongFormat "\n",
-         rank, mesh->Nelements, mesh->Nnodes);
-
-#if 0
-  printf("EToV:\n");
-  for(int e = 0; e < mesh->Nelements; ++e)
-    printf("%d %d %d\n",
-           mesh->EToV[e * mesh->Nverts + 0],
-           mesh->EToV[e * mesh->Nverts + 1],
-           mesh->EToV[e * mesh->Nverts + 2]);
-
-#endif
-
-  dlong* otherNelements = (dlong*) calloc(size, sizeof(dlong));
-  MPI_Allgather(&(mesh->Nelements), 1, MPI_DLONG,
-                otherNelements, 1, MPI_DLONG,
-                mesh->comm);
-
-  hlong* elementStarts = (hlong*) calloc(size, sizeof(hlong));
-  for(int r = 1; r < size; ++r)
-    elementStarts[r] = elementStarts[r - 1] + otherNelements[r - 1];
-
-  for(int r1 = 0; r1 < size; ++r1) {
-    MPI_Barrier(mesh->comm);
-    if(rank == r1) {
-      fflush(stdout);
-      if(r1 == 0)
-        printf("EToE:\n");
-      for(dlong e1 = 0; e1 < mesh->Nelements; ++e1) {
-        dlong id = e1 * mesh->Nfaces;
-        for(int f1 = 0; f1 < mesh->Nfaces; ++f1) {
-          hlong e2 = (hlong) mesh->EToE[id + f1];
-          int f2 = mesh->EToF[id + f1];
-          int r2 = mesh->EToP[id + f1];
-          if(e2 == -1 || f2 == -1) {
-            printf("(" hlongFormat " " "%d" ")=>X (" hlongFormat "," "%d" ")\n",
-                   e1 + elementStarts[r1], f1, e2, f2);
-          }else{
-            if(r2 != -1)
-              e2 += elementStarts[r2];
-            else
-              e2 += elementStarts[r1];
-
-            printf("(" hlongFormat " " "%d" ")=>(" hlongFormat " " "%d" ")\n",
-                   e1 + elementStarts[r1], f1, e2, f2);
-          }
-        }
-        fflush(stdout);
-      }
-    }
-    MPI_Barrier(mesh->comm);
-  }
-  free(otherNelements);
-  free(elementStarts);
-}
diff --git a/src/libP/src/meshParallelReaderHex3D.c b/src/libP/src/meshParallelReaderHex3D.c
deleted file mode 100644
index c3d037d85..000000000
--- a/src/libP/src/meshParallelReaderHex3D.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "mesh3D.h"
-
-/*
-   purpose: read gmsh hexrahedra mesh
- */
-mesh3D* meshParallelReaderHex3D(char* fileName)
-{
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  FILE* fp = fopen(fileName, "r");
-
-  char* status;
-
-  mesh_t* mesh = new mesh_t();
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->dim = 3;
-  mesh->Nverts = 8; // number of vertices per element
-  mesh->Nfaces = 6;
-  mesh->NfaceVertices = 4;
-
-  // vertices on each face
-  int faceVertices[6][4] = {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}};
-
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-
-  memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  if(fp == NULL) {
-    printf("meshReaderHex3D: could not load file %s\n", fileName);
-    exit(0);
-  }
-
-  char buf[BUFSIZ];
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &(mesh->Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat* VX = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VY = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VZ = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(hlong n = 0; n < mesh->Nnodes; ++n) {
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-           VX + n, VY + n, VZ + n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of nodes in mesh */
-  hlong Nelements;
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &Nelements);
-
-  /* find # of hexes */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  hlong Nhexes = 0, NboundaryFaces = 0;
-
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 5) ++Nhexes; // hex code is 5
-    if(elementType == 3) ++NboundaryFaces; // quad codes is 3
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  hlong chunk = (hlong) Nhexes / size;
-  int remainder = (int) (Nhexes - chunk * size);
-
-  hlong NhexesLocal = chunk + (rank < remainder);
-
-  /* where do these elements start ? */
-  hlong start = rank * chunk + mymin(rank, remainder);
-  hlong end = start + NhexesLocal - 1;
-
-  /* allocate space for Element node index data */
-
-  mesh->EToV
-    = (hlong*) calloc(NhexesLocal * mesh->Nverts, sizeof(hlong));
-
-  mesh->elementInfo
-    = (hlong*) calloc(NhexesLocal,sizeof(hlong));
-
-  /* scan through file looking for hexrahedra elements */
-  hlong cnt = 0, bcnt = 0;
-  Nhexes = 0;
-
-  mesh->boundaryInfo = (hlong*) calloc(NboundaryFaces * (mesh->NfaceVertices + 1), sizeof(hlong));
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    hlong v1, v2, v3, v4, v5, v6, v7, v8;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-
-    if(elementType == 3) { // quad boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d " hlongFormat hlongFormat hlongFormat hlongFormat,
-             mesh->boundaryInfo + bcnt * 5, &v1, &v2, &v3, &v4);
-
-      mesh->boundaryInfo[bcnt * 5 + 1] = v1 - 1;
-      mesh->boundaryInfo[bcnt * 5 + 2] = v2 - 1;
-      mesh->boundaryInfo[bcnt * 5 + 3] = v3 - 1;
-      mesh->boundaryInfo[bcnt * 5 + 4] = v4 - 1;
-      ++bcnt;
-    }
-
-    if(elementType == 5) { // hex code is 5
-      if(start <= Nhexes && Nhexes <= end) {
-        sscanf(buf,
-               "%*d%*d%*d " hlongFormat " %*d"
-               hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat,
-               mesh->elementInfo + cnt,
-               &v1,
-               &v2,
-               &v3,
-               &v4,
-               &v5,
-               &v6,
-               &v7,
-               &v8);
-
-        mesh->EToV[cnt * mesh->Nverts + 0] = v1 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 1] = v2 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 2] = v3 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 3] = v4 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 4] = v5 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 5] = v6 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 6] = v7 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 7] = v8 - 1;
-
-        //      printf("%d: %d,%d,%d,%d %d,%d,%d,%d", cnt, v1-1, v2-1,v3-1,v4-1,v5-1,v6-1,v7-1,v8-1);
-
-        ++cnt;
-      }
-      ++Nhexes;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  mesh->NboundaryFaces = bcnt;
-
-  /* record number of found hexes */
-  mesh->Nelements = (dlong) NhexesLocal;
-
-  /* collect vertices for each element */
-  mesh->EX = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      hlong vid = mesh->EToV[e * mesh->Nverts + n];
-      mesh->EX[e * mesh->Nverts + n] = VX[vid];
-      mesh->EY[e * mesh->Nverts + n] = VY[vid];
-      mesh->EZ[e * mesh->Nverts + n] = VZ[vid];
-    }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
-
-  return mesh;
-}
diff --git a/src/libP/src/meshParallelReaderQuad2D.c b/src/libP/src/meshParallelReaderQuad2D.c
deleted file mode 100644
index 5fb8d6178..000000000
--- a/src/libP/src/meshParallelReaderQuad2D.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh2D.h"
-
-/*
-   purpose: read gmsh quadrilateral mesh
- */
-mesh2D* meshParallelReaderQuad2D(char* fileName)
-{
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  FILE* fp = fopen(fileName, "r");
-
-  char* status;
-
-  mesh_t* mesh = new mesh_t();
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->dim = 2;
-  mesh->Nverts = 4; // number of vertices per element
-  mesh->Nfaces = 4;
-  mesh->NfaceVertices = 2;
-
-  int faceVertices[4][2] = {{0,1},{1,2},{2,3},{3,0}};
-
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-
-  memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  if(fp == NULL) {
-    printf("meshParallelReaderQuad2D: could not load file %s\n", fileName);
-    exit(0);
-  }
-
-  char buf[BUFSIZ];
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &(mesh->Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat* VX = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VY = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(hlong n = 0; n < mesh->Nnodes; ++n) {
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat, VX + n, VY + n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of nodes in mesh */
-  hlong Nelements;
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &Nelements);
-
-  /* find # of quadrilaterals */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  hlong Nquadrilaterals = 0;
-  hlong NboundaryFaces = 0;
-
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 1) ++NboundaryFaces;
-    if(elementType == 3) ++Nquadrilaterals;
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  hlong chunk = (hlong) Nquadrilaterals / size;
-  int remainder = (int) (Nquadrilaterals - chunk * size);
-
-  hlong NquadrilateralsLocal = chunk + (rank < remainder);
-
-  /* where do these elements start ? */
-  hlong start = rank * chunk + mymin(rank, remainder);
-  hlong end = start + NquadrilateralsLocal - 1;
-
-  /* allocate space for Element node index data */
-
-  mesh->EToV
-    = (hlong*) calloc(NquadrilateralsLocal * mesh->Nverts,
-                      sizeof(hlong));
-
-  mesh->elementInfo
-    = (hlong*) calloc(NquadrilateralsLocal,sizeof(hlong));
-
-  /* scan through file looking for quadrilateral elements */
-  hlong cnt = 0, bcnt = 0;
-  Nquadrilaterals = 0;
-
-  mesh->boundaryInfo = (hlong*) calloc(NboundaryFaces * 3, sizeof(hlong));
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    hlong v1, v2, v3, v4;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-
-    if(elementType == 1) { // boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat,
-             mesh->boundaryInfo + bcnt * 3, &v1, &v2);
-      mesh->boundaryInfo[bcnt * 3 + 1] = v1 - 1;
-      mesh->boundaryInfo[bcnt * 3 + 2] = v2 - 1;
-      ++bcnt;
-    }
-
-    if(elementType == 3) { // quadrilateral
-      if(start <= Nquadrilaterals && Nquadrilaterals <= end) {
-        sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat hlongFormat,
-               mesh->elementInfo + cnt, &v1, &v2, &v3, &v4);
-
-        // check orientation
-        dfloat xe1 = VX[v1 - 1], xe2 = VX[v2 - 1], xe4 = VX[v4 - 1];
-        dfloat ye1 = VY[v1 - 1], ye2 = VY[v2 - 1], ye4 = VY[v4 - 1];
-        dfloat J = 0.25 * ((xe2 - xe1) * (ye4 - ye1) - (xe4 - xe1) * (ye2 - ye1));
-        if(J < 0) {
-          hlong v4tmp = v4;
-          v4 = v2;
-          v2 = v4tmp;
-          //      printf("unwarping element\n");
-        }
-
-        /* read vertex triplet for trianngle */
-        mesh->EToV[cnt * mesh->Nverts + 0] = v1 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 1] = v2 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 2] = v3 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 3] = v4 - 1;
-        ++cnt;
-      }
-      ++Nquadrilaterals;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  mesh->NboundaryFaces = bcnt;
-
-  /* record number of found quadrilaterals */
-  mesh->Nelements = (dlong) NquadrilateralsLocal;
-
-  /* collect vertices for each element */
-  mesh->EX = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      mesh->EX[e * mesh->Nverts + n] = VX[mesh->EToV[e * mesh->Nverts + n]];
-      mesh->EY[e * mesh->Nverts + n] = VY[mesh->EToV[e * mesh->Nverts + n]];
-    }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-
-  return mesh;
-}
diff --git a/src/libP/src/meshParallelReaderQuad3D.c b/src/libP/src/meshParallelReaderQuad3D.c
deleted file mode 100644
index ed3ceed87..000000000
--- a/src/libP/src/meshParallelReaderQuad3D.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh3D.h"
-
-/*
-   purpose: read gmsh quadrilateral mesh
- */
-mesh_t* meshParallelReaderQuad3D(char* fileName)
-{
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  FILE* fp = fopen(fileName, "r");
-  int n;
-
-  //  mesh_t *mesh = (mesh_t*) calloc(1, sizeof(mesh_t));
-  mesh_t* mesh = new mesh_t[1];
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->dim = 3;
-  mesh->Nverts = 4; // number of vertices per element
-  mesh->Nfaces = 4;
-  mesh->NfaceVertices = 2;
-
-  int faceVertices[4][2] = {{0,1},{1,2},{2,3},{3,0}};
-
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-
-  memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  if(fp == NULL) {
-    printf("meshReader2D: could not load file %s\n", fileName);
-    exit(0);
-  }
-
-  char buf[BUFSIZ];
-  do{
-    fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &(mesh->Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat* VX = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VY = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VZ = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(n = 0; n < mesh->Nnodes; ++n) {
-    fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-           VX + n, VY + n, VZ + n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of nodes in mesh */
-  fgets(buf, BUFSIZ, fp);
-  sscanf(buf, "%d", &(mesh->Nelements));
-
-  /* find # of quadrilaterals */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  int Nquadrilaterals = 0;
-
-  int NboundaryFaces = 0;
-  for(n = 0; n < mesh->Nelements; ++n) {
-    int elementType;
-    fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 1) ++NboundaryFaces;
-    if(elementType == 3) ++Nquadrilaterals;
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  int chunk = Nquadrilaterals / size;
-  int remainder = Nquadrilaterals - chunk * size;
-
-  int NquadrilateralsLocal = chunk + (rank < remainder);
-
-  /* where do these elements start ? */
-  int start = rank * chunk + mymin(rank, remainder);
-  int end = start + NquadrilateralsLocal - 1;
-
-  /* allocate space for Element node index data */
-
-  mesh->EToV
-    = (hlong*) calloc(NquadrilateralsLocal * mesh->Nverts,
-                      sizeof(hlong));
-
-  mesh->elementInfo
-    = (hlong*) calloc(NquadrilateralsLocal,sizeof(hlong));
-
-  /* scan through file looking for quadrilateral elements */
-  int cnt = 0, bcnt = 0;
-  Nquadrilaterals = 0;
-
-  mesh->boundaryInfo = (hlong*) calloc(NboundaryFaces * 3, sizeof(hlong));
-  for(n = 0; n < mesh->Nelements; ++n) {
-    int elementType;
-    hlong v1, v2, v3, v4;
-    fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-
-    if(elementType == 1) { // boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d " hlongFormat hlongFormat,
-             mesh->boundaryInfo + bcnt * 3, &v1, &v2);
-      mesh->boundaryInfo[bcnt * 3 + 1] = v1 - 1;
-      mesh->boundaryInfo[bcnt * 3 + 2] = v2 - 1;
-      ++bcnt;
-    }
-
-    if(elementType == 3) { // quadrilateral
-      if(start <= Nquadrilaterals && Nquadrilaterals <= end) {
-        sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat hlongFormat,
-               mesh->elementInfo + cnt, &v1, &v2, &v3, &v4);
-
-#if 0
-        // check orientation
-        dfloat xe1 = VX[v1 - 1], xe2 = VX[v2 - 1], xe4 = VX[v4 - 1];
-        dfloat ye1 = VY[v1 - 1], ye2 = VY[v2 - 1], ye4 = VY[v4 - 1];
-        dfloat J = 0.25 * ((xe2 - xe1) * (ye4 - ye1) - (xe4 - xe1) * (ye2 - ye1));
-        if(J < 0) {
-          int v4tmp = v4;
-          v4 = v2;
-          v2 = v4tmp;
-          printf("unwarping element\n");
-        }
-#endif
-
-        /* read vertex triplet for trianngle */
-        mesh->EToV[cnt * mesh->Nverts + 0] = v1 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 1] = v2 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 2] = v3 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 3] = v4 - 1;
-        ++cnt;
-      }
-      ++Nquadrilaterals;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  mesh->NboundaryFaces = bcnt;
-
-  /* record number of found quadrilaterals */
-  mesh->Nelements = NquadrilateralsLocal;
-
-  /* collect vertices for each element */
-  mesh->EX = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    for(n = 0; n < mesh->Nverts; ++n) {
-      mesh->EX[e * mesh->Nverts + n] = VX[mesh->EToV[e * mesh->Nverts + n]];
-      mesh->EY[e * mesh->Nverts + n] = VY[mesh->EToV[e * mesh->Nverts + n]];
-      mesh->EZ[e * mesh->Nverts + n] = VZ[mesh->EToV[e * mesh->Nverts + n]];
-#if 0
-      printf("e %d v %d %g %g %g\n",
-             e, n,
-             mesh->EX[e * mesh->Nverts + n],
-             mesh->EY[e * mesh->Nverts + n],
-             mesh->EZ[e * mesh->Nverts + n]);
-#endif
-    }
-  }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
-
-  return mesh;
-}
diff --git a/src/libP/src/meshParallelReaderTet3D.c b/src/libP/src/meshParallelReaderTet3D.c
deleted file mode 100644
index 9448502aa..000000000
--- a/src/libP/src/meshParallelReaderTet3D.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh3D.h"
-
-/*
-   purpose: read gmsh tetrahedra mesh
- */
-mesh3D* meshParallelReaderTet3D(char* fileName)
-{
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  FILE* fp = fopen(fileName, "r");
-
-  char* status;
-
-  mesh_t* mesh = new mesh_t();
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->dim = 3;
-  mesh->Nverts = 4; // number of vertices per element
-  mesh->Nfaces = 4;
-
-  // vertices on each face
-  int faceVertices[4][3] = {{0,1,2},{0,1,3},{1,2,3},{2,0,3}};
-  mesh->NfaceVertices = 3;
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-  memcpy(mesh->faceVertices, faceVertices[0], 12 * sizeof(int));
-
-  if(fp == NULL) {
-    printf("meshReaderTet3D: could not load file %s\n", fileName);
-    exit(0);
-  }
-
-  char buf[BUFSIZ];
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &(mesh->Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat* VX = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VY = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VZ = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(hlong n = 0; n < mesh->Nnodes; ++n) {
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-           VX + n, VY + n, VZ + n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of nodes in mesh */
-  hlong Nelements;
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &Nelements);
-
-  /* find # of tets */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  hlong Ntets = 0, NboundaryFaces = 0;
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 4) ++Ntets; // tet code is 4
-    if(elementType == 2) ++NboundaryFaces;
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  hlong chunk = (hlong) Ntets / size;
-  int remainder = (int) (Ntets - chunk * size);
-
-  hlong NtetsLocal = chunk + (rank < remainder);
-
-  /* where do these elements start ? */
-  hlong start = rank * chunk + mymin(rank, remainder);
-  hlong end = start + NtetsLocal - 1;
-
-  /* allocate space for Element node index data */
-
-  mesh->EToV
-    = (hlong*) calloc(NtetsLocal * mesh->Nverts, sizeof(hlong));
-  mesh->elementInfo
-    = (hlong*) calloc(NtetsLocal,sizeof(hlong));
-
-  /* scan through file looking for tetrahedra elements */
-  hlong cnt = 0, bcnt = 0;
-  Ntets = 0;
-
-  mesh->boundaryInfo = (hlong*) calloc(NboundaryFaces * 4, sizeof(hlong));
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    hlong v1, v2, v3, v4;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 2) { // boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat hlongFormat,
-             mesh->boundaryInfo + bcnt * 4, &v1, &v2, &v3);
-      mesh->boundaryInfo[bcnt * 4 + 1] = v1 - 1;
-      mesh->boundaryInfo[bcnt * 4 + 2] = v2 - 1;
-      mesh->boundaryInfo[bcnt * 4 + 3] = v3 - 1;
-      ++bcnt;
-    }
-
-    if(elementType == 4) { // tet code is 4
-      if(start <= Ntets && Ntets <= end) {
-        sscanf(buf,
-               "%*d%*d%*d " hlongFormat " %*d"
-               hlongFormat hlongFormat hlongFormat hlongFormat,
-               mesh->elementInfo + cnt,&v1, &v2, &v3, &v4);
-        /* read vertex triplet for trianngle */
-        mesh->EToV[cnt * mesh->Nverts + 0] = v1 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 1] = v2 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 2] = v3 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 3] = v4 - 1;
-        ++cnt;
-      }
-      ++Ntets;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  mesh->NboundaryFaces = bcnt;
-
-  /* record number of found tets */
-  mesh->Nelements = (dlong) NtetsLocal;
-
-  /* collect vertices for each element */
-  mesh->EX = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      hlong vid = mesh->EToV[e * mesh->Nverts + n];
-      mesh->EX[e * mesh->Nverts + n] = VX[vid];
-      mesh->EY[e * mesh->Nverts + n] = VY[vid];
-      mesh->EZ[e * mesh->Nverts + n] = VZ[vid];
-    }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
-
-  return mesh;
-}
diff --git a/src/libP/src/meshParallelReaderTri2D.c b/src/libP/src/meshParallelReaderTri2D.c
deleted file mode 100644
index 2848d29c0..000000000
--- a/src/libP/src/meshParallelReaderTri2D.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh2D.h"
-
-/*
-   purpose: read gmsh triangle mesh
- */
-mesh2D* meshParallelReaderTri2D(char* fileName)
-{
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  FILE* fp = fopen(fileName, "r");
-
-  char* status;
-
-  //  mesh2D *mesh = (mesh2D*) calloc(1, sizeof(mesh2D));
-  mesh_t* mesh = new mesh_t();
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->dim = 2;
-  mesh->Nverts = 3; // number of vertices per element
-  mesh->Nfaces = 3;
-  mesh->NfaceVertices = 2;
-
-  /* vertices on each face */
-  int faceVertices[4][2] = {{0,1},{1,2},{2,0}};
-
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-
-  memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  if(fp == NULL) {
-    printf("meshParallelReaderTri2D: could not load file %s\n", fileName);
-    exit(0);
-  }
-
-  char buf[BUFSIZ];
-
-  // look for Nodes section
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &(mesh->Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat* VX = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VY = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(hlong n = 0; n < mesh->Nnodes; ++n) {
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat, VX + n, VY + n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    status = fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of elements in mesh */
-  hlong Nelements;
-  status = fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &Nelements);
-
-  /* find # of triangles */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  hlong Ntriangles = 0;
-  hlong NboundaryFaces = 0;
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 1) ++NboundaryFaces;
-    if(elementType == 2) ++Ntriangles;
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  hlong chunk = (hlong) Ntriangles / size;
-  int remainder = (int) (Ntriangles - chunk * size);
-
-  hlong NtrianglesLocal = chunk + (rank < remainder);
-
-  /* where do these elements start ? */
-  hlong start = rank * chunk + mymin(rank, remainder);
-  hlong end   = start + NtrianglesLocal - 1;
-
-  /* allocate space for Element node index data */
-
-  mesh->EToV
-    = (hlong*) calloc(NtrianglesLocal * mesh->Nverts,
-                      sizeof(hlong));
-  mesh->elementInfo
-    = (hlong*) calloc(NtrianglesLocal,sizeof(hlong));
-
-  /* scan through file looking for triangle elements */
-  hlong cnt = 0, bcnt = 0;
-  Ntriangles = 0;
-
-  mesh->boundaryInfo = (hlong*) calloc(NboundaryFaces * 3, sizeof(hlong));
-  for(hlong n = 0; n < Nelements; ++n) {
-    int elementType;
-    hlong v1, v2, v3;
-    status = fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 1) { // boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat,
-             mesh->boundaryInfo + bcnt * 3, &v1, &v2);
-      mesh->boundaryInfo[bcnt * 3 + 1] = v1 - 1;
-      mesh->boundaryInfo[bcnt * 3 + 2] = v2 - 1;
-      ++bcnt;
-    }
-    if(elementType == 2) { // triangle
-      if(start <= Ntriangles && Ntriangles <= end) {
-        sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat,
-               mesh->elementInfo + cnt, &v1, &v2, &v3);
-
-        // check orientation
-        dfloat xe1 = VX[v1 - 1], xe2 = VX[v2 - 1], xe3 = VX[v3 - 1];
-        dfloat ye1 = VY[v1 - 1], ye2 = VY[v2 - 1], ye3 = VY[v3 - 1];
-        dfloat J = 0.25 * ((xe2 - xe1) * (ye3 - ye1) - (xe3 - xe1) * (ye2 - ye1));
-        if(J < 0) {
-          hlong v3tmp = v3;
-          v3 = v2;
-          v2 = v3tmp;
-          //      printf("unwarping element\n");
-        }
-
-        /* read vertex triplet for trianngle */
-        mesh->EToV[cnt * mesh->Nverts + 0] = v1 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 1] = v2 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 2] = v3 - 1;
-
-        ++cnt;
-      }
-      ++Ntriangles;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  mesh->NboundaryFaces = bcnt;
-
-  /* record number of found triangles */
-  mesh->Nelements = (dlong) NtrianglesLocal;
-
-  /* collect vertices for each element */
-  mesh->EX = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Nverts; ++n) {
-      mesh->EX[e * mesh->Nverts + n] = VX[mesh->EToV[e * mesh->Nverts + n]];
-      mesh->EY[e * mesh->Nverts + n] = VY[mesh->EToV[e * mesh->Nverts + n]];
-    }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-
-  return mesh;
-}
diff --git a/src/libP/src/meshParallelReaderTri3D.c b/src/libP/src/meshParallelReaderTri3D.c
deleted file mode 100644
index d671b1483..000000000
--- a/src/libP/src/meshParallelReaderTri3D.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include  "mpi.h"
-
-#include "mesh3D.h"
-
-/*
-   purpose: read gmsh triangle mesh
- */
-mesh3D* meshParallelReaderTri3D(char* fileName)
-{
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  FILE* fp = fopen(fileName, "r");
-  int n;
-
-  //  mesh3D *mesh = (mesh3D*) calloc(1, sizeof(mesh3D));
-  mesh_t* mesh = new mesh_t[1];
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->dim = 3;
-  mesh->Nverts = 3; // number of vertices per element
-  mesh->Nfaces = 3;
-  mesh->NfaceVertices = 2;
-
-  /* vertices on each face */
-  int faceVertices[4][2] = {{0,1},{1,2},{2,0}};
-
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-
-  memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  if(fp == NULL) {
-    printf("meshParallelReaderTri3D: could not load file %s\n", fileName);
-    exit(0);
-  }
-
-  char buf[BUFSIZ];
-
-  // look for Nodes section
-  do{
-    fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  fgets(buf, BUFSIZ, fp);
-  sscanf(buf, hlongFormat, &(mesh->Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat* VX = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VY = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-  dfloat* VZ = (dfloat*) calloc(mesh->Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(n = 0; n < mesh->Nnodes; ++n) {
-    fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-           VX + n, VY + n, VZ + n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    fgets(buf, BUFSIZ, fp);
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of nodes in mesh */
-  fgets(buf, BUFSIZ, fp);
-  sscanf(buf, "%d", &(mesh->Nelements));
-
-  /* find # of triangles */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  int Ntriangles = 0;
-  int NboundaryFaces = 0;
-  for(n = 0; n < mesh->Nelements; ++n) {
-    int elementType;
-    fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 1) ++NboundaryFaces;
-    if(elementType == 2) ++Ntriangles;
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  int chunk = Ntriangles / size;
-  int remainder = Ntriangles - chunk * size;
-
-  int NtrianglesLocal = chunk + (rank < remainder);
-
-  /* where do these elements start ? */
-  int start = rank * chunk + mymin(rank, remainder);
-  int end = start + NtrianglesLocal - 1;
-
-  /* allocate space for Element node index data */
-
-  mesh->EToV
-    = (hlong*) calloc(NtrianglesLocal * mesh->Nverts,
-                      sizeof(hlong));
-  mesh->elementInfo
-    = (hlong*) calloc(NtrianglesLocal,sizeof(hlong));
-
-  /* scan through file looking for triangle elements */
-  int cnt = 0, bcnt = 0;
-  Ntriangles = 0;
-
-  mesh->boundaryInfo = (hlong*) calloc(NboundaryFaces * 3, sizeof(hlong));
-  for(n = 0; n < mesh->Nelements; ++n) {
-    int elementType, v1, v2, v3;
-    fgets(buf, BUFSIZ, fp);
-    sscanf(buf, "%*d%d", &elementType);
-    if(elementType == 1) { // boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d %d%d",
-             mesh->boundaryInfo + bcnt * 3, &v1, &v2);
-      mesh->boundaryInfo[bcnt * 3 + 1] = v1 - 1;
-      mesh->boundaryInfo[bcnt * 3 + 2] = v2 - 1;
-      ++bcnt;
-    }
-    if(elementType == 2) { // triangle
-      if(start <= Ntriangles && Ntriangles <= end) {
-        sscanf(buf, "%*d%*d%*d " hlongFormat " %*d %d%d%d",
-               mesh->elementInfo + cnt, &v1, &v2, &v3);
-
-        // check orientation
-        dfloat xe1 = VX[v1 - 1], xe2 = VX[v2 - 1], xe3 = VX[v3 - 1];
-        dfloat ye1 = VY[v1 - 1], ye2 = VY[v2 - 1], ye3 = VY[v3 - 1];
-        dfloat ze1 = VZ[v1 - 1], ze2 = VZ[v2 - 1], ze3 = VZ[v3 - 1];
-
-#if 0
-        // TW: no idea
-        dfloat J = 0.25 * ((xe2 - xe1) * (ye3 - ye1) - (xe3 - xe1) * (ye2 - ye1));
-        if(J < 0) {
-          int v3tmp = v3;
-          v3 = v2;
-          v2 = v3tmp;
-          //	  printf("unwarping element\n");
-        }
-#endif
-
-        /* read vertex triplet for trianngle */
-        mesh->EToV[cnt * mesh->Nverts + 0] = v1 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 1] = v2 - 1;
-        mesh->EToV[cnt * mesh->Nverts + 2] = v3 - 1;
-
-        ++cnt;
-      }
-      ++Ntriangles;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  mesh->NboundaryFaces = bcnt;
-
-  /* record number of found triangles */
-  mesh->Nelements = NtrianglesLocal;
-
-  /* collect vertices for each element */
-  mesh->EX = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(mesh->Nverts * mesh->Nelements, sizeof(dfloat));
-  for(int e = 0; e < mesh->Nelements; ++e)
-    for(n = 0; n < mesh->Nverts; ++n) {
-      mesh->EX[e * mesh->Nverts + n] = VX[mesh->EToV[e * mesh->Nverts + n]];
-      mesh->EY[e * mesh->Nverts + n] = VY[mesh->EToV[e * mesh->Nverts + n]];
-      mesh->EZ[e * mesh->Nverts + n] = VZ[mesh->EToV[e * mesh->Nverts + n]];
-    }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
-
-  return mesh;
-}
diff --git a/src/libP/src/meshPhysicalNodesHex3D.c b/src/libP/src/meshPhysicalNodesHex3D.c
deleted file mode 100644
index 72c33a9ee..000000000
--- a/src/libP/src/meshPhysicalNodesHex3D.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-void meshPhysicalNodesHex3D(mesh3D* mesh)
-{
-  mesh->x = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->y = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->z = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-
-  dlong cnt = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    dlong id = e * mesh->Nverts;
-
-    dfloat xe1 = mesh->EX[id + 0]; /* x-coordinates of vertices */
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-    dfloat xe4 = mesh->EX[id + 3];
-    dfloat xe5 = mesh->EX[id + 4];
-    dfloat xe6 = mesh->EX[id + 5];
-    dfloat xe7 = mesh->EX[id + 6];
-    dfloat xe8 = mesh->EX[id + 7];
-
-    dfloat ye1 = mesh->EY[id + 0]; /* y-coordinates of vertices */
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-    dfloat ye4 = mesh->EY[id + 3];
-    dfloat ye5 = mesh->EY[id + 4];
-    dfloat ye6 = mesh->EY[id + 5];
-    dfloat ye7 = mesh->EY[id + 6];
-    dfloat ye8 = mesh->EY[id + 7];
-
-    dfloat ze1 = mesh->EZ[id + 0]; /* z-coordinates of vertices */
-    dfloat ze2 = mesh->EZ[id + 1];
-    dfloat ze3 = mesh->EZ[id + 2];
-    dfloat ze4 = mesh->EZ[id + 3];
-    dfloat ze5 = mesh->EZ[id + 4];
-    dfloat ze6 = mesh->EZ[id + 5];
-    dfloat ze7 = mesh->EZ[id + 6];
-    dfloat ze8 = mesh->EZ[id + 7];
-
-    for(int n = 0; n < mesh->Np; ++n) { /* for each node */
-      /* (r,s,t) coordinates of interpolation nodes*/
-      dfloat rn = mesh->r[n];
-      dfloat sn = mesh->s[n];
-      dfloat tn = mesh->t[n];
-
-      /* physical coordinate of interpolation node */
-      mesh->x[cnt] =
-        +0.125 * (1 - rn) * (1 - sn) * (1 - tn) * xe1
-        + 0.125 * (1 + rn) * (1 - sn) * (1 - tn) * xe2
-        + 0.125 * (1 + rn) * (1 + sn) * (1 - tn) * xe3
-        + 0.125 * (1 - rn) * (1 + sn) * (1 - tn) * xe4
-        + 0.125 * (1 - rn) * (1 - sn) * (1 + tn) * xe5
-        + 0.125 * (1 + rn) * (1 - sn) * (1 + tn) * xe6
-        + 0.125 * (1 + rn) * (1 + sn) * (1 + tn) * xe7
-        + 0.125 * (1 - rn) * (1 + sn) * (1 + tn) * xe8;
-
-      mesh->y[cnt] =
-        +0.125 * (1 - rn) * (1 - sn) * (1 - tn) * ye1
-        + 0.125 * (1 + rn) * (1 - sn) * (1 - tn) * ye2
-        + 0.125 * (1 + rn) * (1 + sn) * (1 - tn) * ye3
-        + 0.125 * (1 - rn) * (1 + sn) * (1 - tn) * ye4
-        + 0.125 * (1 - rn) * (1 - sn) * (1 + tn) * ye5
-        + 0.125 * (1 + rn) * (1 - sn) * (1 + tn) * ye6
-        + 0.125 * (1 + rn) * (1 + sn) * (1 + tn) * ye7
-        + 0.125 * (1 - rn) * (1 + sn) * (1 + tn) * ye8;
-
-      mesh->z[cnt] =
-        +0.125 * (1 - rn) * (1 - sn) * (1 - tn) * ze1
-        + 0.125 * (1 + rn) * (1 - sn) * (1 - tn) * ze2
-        + 0.125 * (1 + rn) * (1 + sn) * (1 - tn) * ze3
-        + 0.125 * (1 - rn) * (1 + sn) * (1 - tn) * ze4
-        + 0.125 * (1 - rn) * (1 - sn) * (1 + tn) * ze5
-        + 0.125 * (1 + rn) * (1 - sn) * (1 + tn) * ze6
-        + 0.125 * (1 + rn) * (1 + sn) * (1 + tn) * ze7
-        + 0.125 * (1 - rn) * (1 + sn) * (1 + tn) * ze8;
-
-      ++cnt;
-    }
-  }
-}
diff --git a/src/libP/src/meshPhysicalNodesQuad2D.c b/src/libP/src/meshPhysicalNodesQuad2D.c
deleted file mode 100644
index 08fdb5526..000000000
--- a/src/libP/src/meshPhysicalNodesQuad2D.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-void meshPhysicalNodesQuad2D(mesh2D* mesh)
-{
-  mesh->x = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->y = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->z = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-
-  dlong cnt = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    dlong id = e * mesh->Nverts;
-
-    dfloat xe1 = mesh->EX[id + 0]; /* x-coordinates of vertices */
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-    dfloat xe4 = mesh->EX[id + 3];
-
-    dfloat ye1 = mesh->EY[id + 0]; /* y-coordinates of vertices */
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-    dfloat ye4 = mesh->EY[id + 3];
-
-    for(int n = 0; n < mesh->Np; ++n) { /* for each node */
-      /* (r,s) coordinates of interpolation nodes*/
-      dfloat rn = mesh->r[n];
-      dfloat sn = mesh->s[n];
-
-      /* physical coordinate of interpolation node */
-      mesh->x[cnt] =
-        +0.25 * (1 - rn) * (1 - sn) * xe1
-        + 0.25 * (1 + rn) * (1 - sn) * xe2
-        + 0.25 * (1 + rn) * (1 + sn) * xe3
-        + 0.25 * (1 - rn) * (1 + sn) * xe4;
-
-      mesh->y[cnt] =
-        +0.25 * (1 - rn) * (1 - sn) * ye1
-        + 0.25 * (1 + rn) * (1 - sn) * ye2
-        + 0.25 * (1 + rn) * (1 + sn) * ye3
-        + 0.25 * (1 - rn) * (1 + sn) * ye4;
-
-      ++cnt;
-    }
-  }
-}
diff --git a/src/libP/src/meshPhysicalNodesQuad3D.c b/src/libP/src/meshPhysicalNodesQuad3D.c
deleted file mode 100644
index 2661b4d70..000000000
--- a/src/libP/src/meshPhysicalNodesQuad3D.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh.h"
-#include "mesh3D.h"
-
-void meshPhysicalNodesQuad3D(mesh_t* mesh)
-{
-  mesh->x = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->y = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->z = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-
-  int cnt = 0;
-  for(int e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    int id = e * mesh->Nverts;
-
-    dfloat xe1 = mesh->EX[id + 0]; /* x-coordinates of vertices */
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-    dfloat xe4 = mesh->EX[id + 3];
-
-    dfloat ye1 = mesh->EY[id + 0]; /* y-coordinates of vertices */
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-    dfloat ye4 = mesh->EY[id + 3];
-
-    dfloat ze1 = mesh->EZ[id + 0]; /* z-coordinates of vertices */
-    dfloat ze2 = mesh->EZ[id + 1];
-    dfloat ze3 = mesh->EZ[id + 2];
-    dfloat ze4 = mesh->EZ[id + 3];
-
-    for(int n = 0; n < mesh->Np; ++n) { /* for each node */
-      /* (r,s) coordinates of interpolation nodes*/
-      dfloat rn = mesh->r[n];
-      dfloat sn = mesh->s[n];
-
-      /* physical coordinate of interpolation node */
-      dfloat xlin =
-        +0.25 * (1 - rn) * (1 - sn) * xe1
-        + 0.25 * (1 + rn) * (1 - sn) * xe2
-        + 0.25 * (1 + rn) * (1 + sn) * xe3
-        + 0.25 * (1 - rn) * (1 + sn) * xe4;
-
-      dfloat ylin =
-        +0.25 * (1 - rn) * (1 - sn) * ye1
-        + 0.25 * (1 + rn) * (1 - sn) * ye2
-        + 0.25 * (1 + rn) * (1 + sn) * ye3
-        + 0.25 * (1 - rn) * (1 + sn) * ye4;
-
-      dfloat zlin =
-        +0.25 * (1 - rn) * (1 - sn) * ze1
-        + 0.25 * (1 + rn) * (1 - sn) * ze2
-        + 0.25 * (1 + rn) * (1 + sn) * ze3
-        + 0.25 * (1 - rn) * (1 + sn) * ze4;
-
-      //      printf("xlin=%g, ylin=%g, zlin=%g\n", xlin, ylin, zlin);
-
-      // project to sphere
-      dfloat rlin = sqrt(xlin * xlin + ylin * ylin + zlin * zlin);
-      mesh->x[cnt] = xlin / rlin;
-      mesh->y[cnt] = ylin / rlin;
-      mesh->z[cnt] = zlin / rlin;
-
-      ++cnt;
-    }
-  }
-}
diff --git a/src/libP/src/meshPhysicalNodesTet3D.c b/src/libP/src/meshPhysicalNodesTet3D.c
deleted file mode 100644
index 5266b5b43..000000000
--- a/src/libP/src/meshPhysicalNodesTet3D.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-void meshPhysicalNodesTet3D(mesh3D* mesh)
-{
-  mesh->x = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->y = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->z = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-
-  dlong cnt = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    dlong id = e * mesh->Nverts;
-
-    dfloat xe1 = mesh->EX[id + 0]; /* x-coordinates of vertices */
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-    dfloat xe4 = mesh->EX[id + 3];
-
-    dfloat ye1 = mesh->EY[id + 0]; /* y-coordinates of vertices */
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-    dfloat ye4 = mesh->EY[id + 3];
-
-    dfloat ze1 = mesh->EZ[id + 0]; /* z-coordinates of vertices */
-    dfloat ze2 = mesh->EZ[id + 1];
-    dfloat ze3 = mesh->EZ[id + 2];
-    dfloat ze4 = mesh->EZ[id + 3];
-
-    for(int n = 0; n < mesh->Np; ++n) { /* for each node */
-      /* (r,s,t) coordinates of interpolation nodes*/
-      dfloat rn = mesh->r[n];
-      dfloat sn = mesh->s[n];
-      dfloat tn = mesh->t[n];
-
-      /* physical coordinate of interpolation node */
-      mesh->x[cnt] = -0.5 * (1 + rn + sn + tn) * xe1 + 0.5 * (1 + rn) * xe2 + 0.5 * (1 + sn) * xe3 +
-                     0.5 * (1 + tn) * xe4;
-      mesh->y[cnt] = -0.5 * (1 + rn + sn + tn) * ye1 + 0.5 * (1 + rn) * ye2 + 0.5 * (1 + sn) * ye3 +
-                     0.5 * (1 + tn) * ye4;
-      mesh->z[cnt] = -0.5 * (1 + rn + sn + tn) * ze1 + 0.5 * (1 + rn) * ze2 + 0.5 * (1 + sn) * ze3 +
-                     0.5 * (1 + tn) * ze4;
-      ++cnt;
-    }
-  }
-}
diff --git a/src/libP/src/meshPhysicalNodesTri2D.c b/src/libP/src/meshPhysicalNodesTri2D.c
deleted file mode 100644
index b9ac22735..000000000
--- a/src/libP/src/meshPhysicalNodesTri2D.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-void meshPhysicalNodesTri2D(mesh2D* mesh)
-{
-  mesh->x = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->y = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->z = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat)); // dummy
-
-  dlong cnt = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    dlong id = e * mesh->Nverts + 0;
-
-    dfloat xe1 = mesh->EX[id + 0]; /* x-coordinates of vertices */
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-
-    dfloat ye1 = mesh->EY[id + 0]; /* y-coordinates of vertices */
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-
-    for(int n = 0; n < mesh->Np; ++n) { /* for each node */
-      /* (r,s) coordinates of interpolation nodes*/
-      dfloat rn = mesh->r[n];
-      dfloat sn = mesh->s[n];
-
-      /* physical coordinate of interpolation node */
-      mesh->x[cnt] = -0.5 * (rn + sn) * xe1 + 0.5 * (1 + rn) * xe2 + 0.5 * (1 + sn) * xe3;
-      mesh->y[cnt] = -0.5 * (rn + sn) * ye1 + 0.5 * (1 + rn) * ye2 + 0.5 * (1 + sn) * ye3;
-      ++cnt;
-    }
-  }
-}
diff --git a/src/libP/src/meshPhysicalNodesTri3D.c b/src/libP/src/meshPhysicalNodesTri3D.c
deleted file mode 100644
index 93e653af0..000000000
--- a/src/libP/src/meshPhysicalNodesTri3D.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-void meshPhysicalNodesTri3D(mesh3D* mesh)
-{
-  mesh->x = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->y = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-  mesh->z = (dfloat*) calloc(mesh->Nelements * mesh->Np,sizeof(dfloat));
-
-  int cnt = 0;
-  for(int e = 0; e < mesh->Nelements; ++e) { /* for each element */
-    int id = e * mesh->Nverts + 0;
-
-    dfloat xe1 = mesh->EX[id + 0]; /* x-coordinates of vertices */
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-
-    dfloat ye1 = mesh->EY[id + 0]; /* y-coordinates of vertices */
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-
-    dfloat ze1 = mesh->EZ[id + 0]; /* z-coordinates of vertices */
-    dfloat ze2 = mesh->EZ[id + 1];
-    dfloat ze3 = mesh->EZ[id + 2];
-
-    for(int n = 0; n < mesh->Np; ++n) { /* for each node */
-      /* (r,s) coordinates of interpolation nodes*/
-      dfloat rn = mesh->r[n];
-      dfloat sn = mesh->s[n];
-
-      /* physical coordinate of interpolation node */
-      dfloat xlin = -0.5 * (rn + sn) * xe1 + 0.5 * (1 + rn) * xe2 + 0.5 * (1 + sn) * xe3;
-      dfloat ylin = -0.5 * (rn + sn) * ye1 + 0.5 * (1 + rn) * ye2 + 0.5 * (1 + sn) * ye3;
-      dfloat zlin = -0.5 * (rn + sn) * ze1 + 0.5 * (1 + rn) * ze2 + 0.5 * (1 + sn) * ze3;
-
-      // project to sphere
-      dfloat rlin = sqrt(xlin * xlin + ylin * ylin + zlin * zlin);
-      mesh->x[cnt] = xlin / rlin;
-      mesh->y[cnt] = ylin / rlin;
-      mesh->z[cnt] = zlin / rlin;
-
-      //      printf("x,y,z,rlin=%g,%g,%g,%g\n", xlin/rlin, ylin/rlin, zlin/rlin, rlin);
-      ++cnt;
-    }
-  }
-}
diff --git a/src/libP/src/meshPlotAdaptiveContour3D.c b/src/libP/src/meshPlotAdaptiveContour3D.c
deleted file mode 100644
index e8ca92651..000000000
--- a/src/libP/src/meshPlotAdaptiveContour3D.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh3D.h"
-
-void meshPlotAdaptiveContour3D(mesh_t* mesh,
-                               char* fname,
-                               dfloat* u,
-                               int Nlevels,
-                               dfloat* levels,
-                               dfloat tol)
-{
-  // function PlotAdaptiveContour3D(u, levels, tol)
-  // Purpose: adaptively refine the mesh to approximately locate isocontours
-
-  // build interpolation matrix (coarse->fine)
-  // assume these are loaded from node file
-  // mesh->contourEToV = [1 5 7 8; 5 2 6 9; 7 6 3 10; 8 9 10 4; 8 5 7 9; 7 5 6 9; 8 9 7 10; 9 6 7 10];
-  // mesh->contourVX   = [-1  1 -1 -1  0  0 -1 -1  0 -1];
-  // mesh->contourVY   = [-1 -1  1 -1 -1  0  0 -1 -1  0];
-  // mesh->contourVZ   = [-1 -1 -1  1 -1 -1 -1  0  0  0];
-  // mesh->contourInterpN
-  // v1 = EToVi(:,1); v2 = EToVi(:,2); v3 = EToVi(:,3); v4 = EToVi(:,4);
-  // ri = 0.5*(-(r+s+t+1)*VXi(v1) + (1+r)*VXi(v2) + (1+s)*VXi(v3) + (1+t)*VXi(v4) );
-  // si = 0.5*(-(r+s+t+1)*VYi(v1) + (1+r)*VYi(v2) + (1+s)*VYi(v3) + (1+t)*VYi(v4) );
-  // ti = 0.5*(-(r+s+t+1)*VZi(v1) + (1+r)*VZi(v2) + (1+s)*VZi(v3) + (1+t)*VZi(v4) );
-  //interp = Vandermonde3D(N, ri(:), si(:), ti(:))*invV;
-
-  // mesh->contourInterp1
-  // ri = [-1;1;-1;-1]; si = [-1;-1;1;-1]; ti = [-1;-1;-1;1]; refNp = length(ri);
-  // interp1 = Vandermonde3D(N, ri(:), si(:), ti(:))*invV;
-  // mesh->contourF
-  //sk = 1;
-  //F = spalloc(Np,Np,1);
-  //for i=0:N % old ordering
-  //  for j=0:N - i
-  //    for k=0:N - i - j
-  //      if(i+j+k<=1), F(sk,sk) = 1.; end;
-  //      sk = sk+1;
-  //    end
-  //  end
-  //end
-
-  // contourFilter:     ufilt = V*F*invV
-
-  int MAXLEVELS = 0;
-
-  int plotNp = 4;
-  int Nelements = mesh->Nelements;
-  int Np = mesh->Np;
-
-  dfloat* refu = (dfloat*) calloc(Nelements * Np, sizeof(dfloat));
-  dfloat* refx = (dfloat*) calloc(Nelements * Np, sizeof(dfloat));
-  dfloat* refy = (dfloat*) calloc(Nelements * Np, sizeof(dfloat));
-  dfloat* refz = (dfloat*) calloc(Nelements * Np, sizeof(dfloat));
-
-  //copy in data
-  for(int n = 0; n < Np * Nelements; ++n) {
-    refu[n] = u[n];
-    refx[n] = mesh->x[n];
-    refy[n] = mesh->y[n];
-    refz[n] = mesh->z[n];
-  }
-
-  dfloat* newu, * newx, * newy, * newz;
-
-  dfloat err = 1;
-  int refLevel = 0;
-  while ((err > tol) && (refLevel < MAXLEVELS)) {
-    int* refineFlag = (int*) calloc(Nelements,sizeof(int));
-    int Nrefine = 0;
-    for(int e = 0; e < Nelements; ++e) {
-      dfloat umin = refu[e * Np + 0];
-      dfloat umax = refu[e * Np + 0];
-
-      for(int n = 1; n < Np; ++n) {
-        umin = mymin(umin, refu[e * Np + n]);
-        umax = mymax(umax, refu[e * Np + n]);
-      }
-
-      for (int lev = 0; lev < Nlevels; lev++)
-        if((umin <= levels[lev]) && (umax >= levels[lev])) {
-          refineFlag[e] = 1;
-          ++Nrefine;
-          break;
-        }
-    }
-
-    int newNelements = 8 * Nrefine;
-
-    newu = (dfloat*) calloc(Np * newNelements, sizeof(dfloat));
-    newx = (dfloat*) calloc(Np * newNelements, sizeof(dfloat));
-    newy = (dfloat*) calloc(Np * newNelements, sizeof(dfloat));
-    newz = (dfloat*) calloc(Np * newNelements, sizeof(dfloat));
-    int cnt = 0;
-    for(int e = 0; e < Nelements; ++e) {
-      if (refineFlag[e] == 0) continue;
-      for(int m = 0; m < 8 * Np; ++m)
-        for(int i = 0; i < Np; ++i) {
-          // note layout
-          newu[8 * Np * cnt + m] += mesh->contourInterp[m * Np + i] * refu[e * Np + i];
-          newx[8 * Np * cnt + m] += mesh->contourInterp[m * Np + i] * refx[e * Np + i];
-          newy[8 * Np * cnt + m] += mesh->contourInterp[m * Np + i] * refy[e * Np + i];
-          newz[8 * Np * cnt + m] += mesh->contourInterp[m * Np + i] * refz[e * Np + i];
-          cnt++;
-        }
-    }
-    free(refineFlag);
-
-    free(refu);
-    free(refx);
-    free(refy);
-    free(refz);
-
-    Nelements = newNelements;
-    refu = newu;
-    refx = newx;
-    refy = newy;
-    refz = newz;
-
-    err = 0;
-    for(int e = 0; e < Nelements; ++e)
-      for(int n = 0; n < Np; ++n) {
-        dfloat errn = -refu[e * Np + n];
-        for(int m = 0; m < Np; ++m)
-          errn += mesh->contourFilter[n * Np + m] * refu[e * Np + m];
-        err = mymax(err, fabs(errn));
-      }
-    refLevel++;
-  }
-
-  int* refineFlag = (int*) calloc(Nelements,sizeof(int));
-  int Nrefine = 0;
-  for(int e = 0; e < Nelements; ++e) {
-    dfloat umin = refu[e * Np + 0];
-    dfloat umax = refu[e * Np + 0];
-
-    for(int n = 1; n < Np; ++n) {
-      umin = mymin(umin, refu[e * Np + n]);
-      umax = mymax(umax, refu[e * Np + n]);
-    }
-
-    for (int lev = 0; lev < Nlevels; lev++)
-      if((umin <= levels[lev]) && (umax >= levels[lev])) {
-        refineFlag[e] = 1;
-        ++Nrefine;
-        break;
-      }
-  }
-
-  FILE* fp = fopen(fname, "w");
-
-  fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-  fprintf(fp, "  <UnstructuredGrid>\n");
-  fprintf(fp, "    <Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n",
-          Nrefine * mesh->plotNp,
-          Nrefine * mesh->plotNelements);
-
-  // write out nodes
-  fprintf(fp, "      <Points>\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-  // compute plot node coordinates on the fly
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (refineFlag[e] == 0) continue;
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotxn = 0, plotyn = 0, plotzn = 0;
-      for(int m = 0; m < mesh->Np; ++m) {
-        plotxn += mesh->plotInterp[n * mesh->Np + m] * refx[m + e * mesh->Np];
-        plotyn += mesh->plotInterp[n * mesh->Np + m] * refy[m + e * mesh->Np];
-        plotzn += mesh->plotInterp[n * mesh->Np + m] * refz[m + e * mesh->Np];
-      }
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", plotxn,plotyn,plotzn);
-    }
-  }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Points>\n");
-
-  fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Vorticity\" Format=\"ascii\">\n");
-
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (refineFlag[e] == 0) continue;
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotpn = 0;
-      for(int m = 0; m < mesh->Np; ++m) {
-        dfloat pm = refu[m + e * mesh->Np];
-        plotpn += mesh->plotInterp[n * mesh->Np + m] * pm;
-      }
-      fprintf(fp, "       ");
-      fprintf(fp, "%g\n", plotpn);
-    }
-  }
-
-  fprintf(fp, "       </DataArray>\n");
-  fprintf(fp, "     </PointData>\n");
-
-  fprintf(fp, "    <Cells>\n");
-  fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-  int cnt = 0;
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (refineFlag[e] == 0) continue;
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      fprintf(fp, "       ");
-      for(int m = 0; m < mesh->plotNverts; ++m)
-        fprintf(fp, "%d ", cnt * mesh->plotNp + mesh->plotEToV[n * mesh->plotNverts + m]);
-      fprintf(fp, "\n");
-    }
-    cnt++;
-  }
-
-  fprintf(fp, "        </DataArray>\n");
-
-  fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-  cnt = 0;
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (refineFlag[e] == 0) continue;
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      cnt += mesh->plotNverts;
-      fprintf(fp, "       ");
-      fprintf(fp, "%d\n", cnt);
-    }
-  }
-  fprintf(fp, "       </DataArray>\n");
-
-  fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (refineFlag[e] == 0) continue;
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      fprintf(fp, "10\n"); // TET code ?
-    }
-  }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Cells>\n");
-  fprintf(fp, "    </Piece>\n");
-  fprintf(fp, "  </UnstructuredGrid>\n");
-  fprintf(fp, "</VTKFile>\n");
-  fclose(fp);
-
-  free(refineFlag);
-
-#if 0
-  dfloat* plotx = (dfloat*) calloc(4 * Nrefine,sizeof(dfloat));
-  dfloat* ploty = (dfloat*) calloc(4 * Nrefine,sizeof(dfloat));
-  dfloat* plotz = (dfloat*) calloc(4 * Nrefine,sizeof(dfloat));
-  dfloat* plotu = (dfloat*) calloc(4 * Nrefine,sizeof(dfloat));
-
-  int cnt = 0;
-  for(int e = 0; e < Nelements; ++e) {
-    if (refineFlag[e] == 0) continue;
-    for(int n = 0; n < plotNp; ++n) {
-      dfloat px = 0, py = 0, pz = 0, pu = 0;
-
-      for(int m = 0; m < Np; ++m) {
-        px += mesh->contourInterp1[n * Np + m] * refx[e * Np + m];
-        py += mesh->contourInterp1[n * Np + m] * refy[e * Np + m];
-        pz += mesh->contourInterp1[n * Np + m] * refz[e * Np + m];
-        pu += mesh->contourInterp1[n * Np + m] * refu[e * Np + m];
-      }
-
-      plotx[cnt * plotNp + n] = px;
-      ploty[cnt * plotNp + n] = py;
-      plotz[cnt * plotNp + n] = pz;
-      plotu[cnt * plotNp + n] = pu;
-      cnt++;
-    }
-  }
-
-  Nelements = Nrefine;
-  int plotNelements = Nelements;
-
-  FILE* fp = fopen(fname, "w");
-
-  fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-  fprintf(fp, "  <UnstructuredGrid>\n");
-  fprintf(fp, "    <Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n",
-          plotNelements * plotNp,
-          plotNelements);
-
-  // write out nodes
-  fprintf(fp, "      <Points>\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-  // compute plot node coordinates on the fly
-  for(int n = 0; n < plotNelements * plotNp; ++n) {
-    fprintf(fp, "       ");
-    fprintf(fp, "%g %g %g\n", plotx[n],ploty[n],plotz[n]);
-  }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Points>\n");
-
-  // write out pressure
-  fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Vorticity\" Format=\"ascii\">\n");
-
-  for(int e = 0; e < plotNelements; ++e)
-    for(int n = 0; n < plotNp; ++n) {
-      fprintf(fp, "       ");
-      fprintf(fp, "%g\n", plotu[e * plotNp + n]);
-    }
-
-  fprintf(fp, "       </DataArray>\n");
-  fprintf(fp, "     </PointData>\n");
-
-  fprintf(fp, "    <Cells>\n");
-  fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-  for(int e = 0; e < plotNelements; ++e) {
-    fprintf(fp, "       ");
-    for(int m = 0; m < mesh->plotNverts; ++m)
-      fprintf(fp, "%d ", e * plotNp + m);
-    fprintf(fp, "\n");
-  }
-
-  fprintf(fp, "        </DataArray>\n");
-
-  fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-  int cnt = 0;
-  for(int e = 0; e < plotNelements; ++e) {
-    cnt += mesh->plotNverts;
-    fprintf(fp, "       ");
-    fprintf(fp, "%d\n", cnt);
-  }
-  fprintf(fp, "       </DataArray>\n");
-
-  fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-  for(int e = 0; e < plotNelements; ++e) {
-    fprintf(fp, "10\n"); // TET code ?
-  }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Cells>\n");
-  fprintf(fp, "    </Piece>\n");
-  fprintf(fp, "  </UnstructuredGrid>\n");
-  fprintf(fp, "</VTKFile>\n");
-  fclose(fp);
-#endif
-}
diff --git a/src/libP/src/meshPlotContour3D.c b/src/libP/src/meshPlotContour3D.c
deleted file mode 100644
index b776f5ffc..000000000
--- a/src/libP/src/meshPlotContour3D.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh3D.h"
-
-void meshPlotContour3D(mesh_t* mesh, char* fname, dfloat* u, int Nlevels, dfloat* levels)
-{
-  int* plotFlag = (int*) calloc(mesh->Nelements,sizeof(int));
-  int* plotSubFlag = (int*) calloc(mesh->Nelements * mesh->plotNelements,sizeof(int));
-  dfloat* plotu = (dfloat*) calloc(mesh->plotNp,sizeof(dfloat));
-
-  int NcontourElements = 0;
-  int plotElements = 0;
-
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      plotu[n] = 0;
-      for(int m = 0; m < mesh->Np; ++m)
-        plotu[n] += mesh->plotInterp[n * mesh->Np + m] * u[m + e * mesh->Np];
-    }
-
-    for (int k = 0; k < mesh->plotNelements; k++) {
-      int id0 = mesh->plotEToV[k * mesh->plotNverts + 0];
-      int id1 = mesh->plotEToV[k * mesh->plotNverts + 1];
-      int id2 = mesh->plotEToV[k * mesh->plotNverts + 2];
-      int id3 = mesh->plotEToV[k * mesh->plotNverts + 3];
-
-      dfloat umin = plotu[id0];
-      dfloat umax = plotu[id0];
-      umin = mymin(umin, plotu[id1]);
-      umax = mymax(umax, plotu[id1]);
-      umin = mymin(umin, plotu[id2]);
-      umax = mymax(umax, plotu[id2]);
-      umin = mymin(umin, plotu[id3]);
-      umax = mymax(umax, plotu[id3]);
-
-      for (int lev = 0; lev < Nlevels; lev++)
-        if((umin <= levels[lev]) && (umax >= levels[lev])) {
-          NcontourElements++;
-          if (plotFlag[e] == 0) plotElements++;
-          plotFlag[e] = 1;
-          plotSubFlag[e * mesh->plotNelements + k] = 1;
-          break;
-        }
-    }
-  }
-  free(plotu);
-
-  FILE* fp = fopen(fname, "w");
-
-  fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-  fprintf(fp, "  <UnstructuredGrid>\n");
-  fprintf(fp, "    <Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n",
-          plotElements * mesh->plotNp,
-          NcontourElements);
-
-  // write out nodes
-  fprintf(fp, "      <Points>\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-  // compute plot node coordinates on the fly
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (plotFlag[e] == 0) continue;
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotxn = 0, plotyn = 0, plotzn = 0;
-      for(int m = 0; m < mesh->Np; ++m) {
-        plotxn += mesh->plotInterp[n * mesh->Np + m] * mesh->x[m + e * mesh->Np];
-        plotyn += mesh->plotInterp[n * mesh->Np + m] * mesh->y[m + e * mesh->Np];
-        plotzn += mesh->plotInterp[n * mesh->Np + m] * mesh->z[m + e * mesh->Np];
-      }
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", plotxn,plotyn,plotzn);
-    }
-  }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Points>\n");
-
-  fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Vorticity\" Format=\"ascii\">\n");
-
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (plotFlag[e] == 0) continue;
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotu = 0;
-      for(int m = 0; m < mesh->Np; ++m)
-        plotu += mesh->plotInterp[n * mesh->Np + m] * u[m + e * mesh->Np];
-      fprintf(fp, "       ");
-      fprintf(fp, "%g\n", plotu);
-    }
-  }
-  fprintf(fp, "       </DataArray>\n");
-  fprintf(fp, "     </PointData>\n");
-
-  fprintf(fp, "    <Cells>\n");
-  fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-  int cnt = 0;
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (plotFlag[e] == 0) continue;
-
-    for(int k = 0; k < mesh->plotNelements; ++k) {
-      if (plotSubFlag[e * mesh->plotNelements + k] == 0) continue;
-      fprintf(fp, "       ");
-      for(int m = 0; m < mesh->plotNverts; ++m)
-        fprintf(fp, "%d ", cnt * mesh->plotNp + mesh->plotEToV[k * mesh->plotNverts + m]);
-      fprintf(fp, "\n");
-    }
-    cnt++;
-  }
-
-  fprintf(fp, "        </DataArray>\n");
-
-  fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-  cnt = 0;
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (plotFlag[e] == 0) continue;
-    for(int k = 0; k < mesh->plotNelements; ++k) {
-      if (plotSubFlag[e * mesh->plotNelements + k] == 0) continue;
-      cnt += mesh->plotNverts;
-      fprintf(fp, "       ");
-      fprintf(fp, "%d\n", cnt);
-    }
-  }
-  fprintf(fp, "       </DataArray>\n");
-
-  fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-  for(int e = 0; e < mesh->Nelements; ++e) {
-    if (plotFlag[e] == 0) continue;
-    for(int k = 0; k < mesh->plotNelements; ++k) {
-      if (plotSubFlag[e * mesh->plotNelements + k] == 0) continue;
-      fprintf(fp, "10\n"); // TET code ?
-    }
-  }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Cells>\n");
-  fprintf(fp, "    </Piece>\n");
-  fprintf(fp, "  </UnstructuredGrid>\n");
-  fprintf(fp, "</VTKFile>\n");
-  fclose(fp);
-
-  free(plotFlag);
-  free(plotSubFlag);
-}
diff --git a/src/libP/src/meshPlotVTU2D.c b/src/libP/src/meshPlotVTU2D.c
deleted file mode 100644
index a3beaa4b3..000000000
--- a/src/libP/src/meshPlotVTU2D.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "mpi.h"
-
-#include "mesh2D.h"
-
-// interpolate data to plot nodes and save to file (one per process
-void meshPlotVTU2D(mesh2D* mesh, char* fileNameBase, int fld)
-{
-  int rank;
-  rank = mesh->rank;
-
-  FILE* fp;
-  char fileName[BUFSIZ];
-  //sprintf(fileName, "%s_%04d.vtu", fileNameBase, rank);
-  strcpy(fileName, fileNameBase);
-
-  fp = fopen(fileName, "w");
-
-  fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-  fprintf(fp, "  <UnstructuredGrid>\n");
-  fprintf(fp, "    <Piece NumberOfPoints=\"" dlongFormat "\" NumberOfCells=\"" dlongFormat "\">\n",
-          mesh->Nelements * mesh->plotNp,
-          mesh->Nelements * mesh->plotNelements);
-
-  // write out nodes
-  fprintf(fp, "      <Points>\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-  // compute plot node coordinates on the fly
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotxn = 0, plotyn = 0;
-      for(int m = 0; m < mesh->Np; ++m) {
-        plotxn += mesh->plotInterp[n * mesh->Np + m] * mesh->x[m + e * mesh->Np];
-        plotyn += mesh->plotInterp[n * mesh->Np + m] * mesh->y[m + e * mesh->Np];
-      }
-
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", plotxn,plotyn,0.);
-    }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Points>\n");
-
-  // write out pressure
-  fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" Name=\"pressure\" Format=\"ascii\">\n");
-
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotpn = 0;
-
-      for(int m = 0; m < mesh->Np; ++m) {
-        dfloat pm = mesh->q[fld + mesh->Nfields * (m + e * mesh->Np)];
-        //dfloat pm = mesh->invTau[m+e*mesh->Np];
-        plotpn += mesh->plotInterp[n * mesh->Np + m] * pm;
-      }
-
-      fprintf(fp, "       ");
-      fprintf(fp, "%g\n", plotpn);
-    }
-
-  fprintf(fp, "       </DataArray>\n");
-  fprintf(fp, "     </PointData>\n");
-
-  fprintf(fp, "    <Cells>\n");
-  fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      fprintf(fp, "       ");
-      for(int m = 0; m < mesh->plotNverts; ++m)
-        fprintf(fp, "%d ", e * mesh->plotNp + mesh->plotEToV[n * mesh->plotNverts + m]);
-      fprintf(fp, "\n");
-    }
-
-  fprintf(fp, "        </DataArray>\n");
-
-  fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-  dlong cnt = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      cnt += mesh->plotNverts;
-      fprintf(fp, "       ");
-      fprintf(fp, dlongFormat "\n", cnt);
-    }
-  fprintf(fp, "       </DataArray>\n");
-
-  fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNelements; ++n)
-      fprintf(fp, "5\n");
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Cells>\n");
-  fprintf(fp, "    </Piece>\n");
-  fprintf(fp, "  </UnstructuredGrid>\n");
-  fprintf(fp, "</VTKFile>\n");
-  fclose(fp);
-}
diff --git a/src/libP/src/meshPlotVTU3D.c b/src/libP/src/meshPlotVTU3D.c
deleted file mode 100644
index 314e9aa1b..000000000
--- a/src/libP/src/meshPlotVTU3D.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "mesh3D.h"
-
-// interpolate data to plot nodes and save to file (one per process
-void meshPlotVTU3D(mesh3D* mesh, char* fileNameBase, int fld)
-{
-  int rank;
-  rank = mesh->rank;
-
-  FILE* fp;
-  char fileName[BUFSIZ];
-  //sprintf(fileName, "%s_%04d.vtu", fileNameBase, rank);
-  strcpy(fileName,fileNameBase);
-
-  fp = fopen(fileName, "w");
-
-  fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-  fprintf(fp, "  <UnstructuredGrid>\n");
-  fprintf(fp, "    <Piece NumberOfPoints=\"" dlongFormat "\" NumberOfCells=\"" dlongFormat "\">\n",
-          mesh->Nelements * mesh->plotNp,
-          mesh->Nelements * mesh->plotNelements);
-
-  // write out nodes
-  fprintf(fp, "      <Points>\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-  // compute plot node coordinates on the fly
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotxn = 0, plotyn = 0, plotzn = 0;
-      for(int m = 0; m < mesh->Np; ++m) {
-        plotxn += mesh->plotInterp[n * mesh->Np + m] * mesh->x[m + e * mesh->Np];
-        plotyn += mesh->plotInterp[n * mesh->Np + m] * mesh->y[m + e * mesh->Np];
-        plotzn += mesh->plotInterp[n * mesh->Np + m] * mesh->z[m + e * mesh->Np];
-      }
-
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", plotxn,plotyn,plotzn);
-    }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Points>\n");
-
-  // write out pressure
-  fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  fprintf(fp, "        <DataArray type=\"Float32\" Name=\"pressure\" Format=\"ascii\">\n");
-
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNp; ++n) {
-      dfloat plotpn = 0;
-      for(int m = 0; m < mesh->Np; ++m) {
-        dfloat pm = mesh->q[fld + mesh->Nfields * (m + e * mesh->Np)];
-        plotpn += mesh->plotInterp[n * mesh->Np + m] * pm;
-      }
-      fprintf(fp, "       ");
-      fprintf(fp, "%g\n", plotpn);
-    }
-
-  fprintf(fp, "       </DataArray>\n");
-  fprintf(fp, "     </PointData>\n");
-
-  fprintf(fp, "    <Cells>\n");
-  fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      fprintf(fp, "       ");
-      for(int m = 0; m < mesh->plotNverts; ++m)
-        fprintf(fp, dlongFormat " ", e * mesh->plotNp + mesh->plotEToV[n * mesh->plotNverts + m]);
-      fprintf(fp, "\n");
-    }
-
-  fprintf(fp, "        </DataArray>\n");
-
-  fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-  dlong cnt = 0;
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      cnt += mesh->plotNverts;
-      fprintf(fp, "       ");
-      fprintf(fp, dlongFormat "\n", cnt);
-    }
-  fprintf(fp, "       </DataArray>\n");
-
-  fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->plotNelements; ++n) {
-      if(mesh->NfaceVertices == 2)
-        fprintf(fp, "5\n");
-      else
-        fprintf(fp, "10\n");
-    }
-  fprintf(fp, "        </DataArray>\n");
-  fprintf(fp, "      </Cells>\n");
-  fprintf(fp, "    </Piece>\n");
-  fprintf(fp, "  </UnstructuredGrid>\n");
-  fprintf(fp, "</VTKFile>\n");
-  fclose(fp);
-}
diff --git a/src/libP/src/meshPrint2D.c b/src/libP/src/meshPrint2D.c
deleted file mode 100644
index 0d573281c..000000000
--- a/src/libP/src/meshPrint2D.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "mesh2D.h"
-
-void meshPrint2D(mesh2D* mesh)
-{
-  printf("EToV:\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    printf(hlongFormat hlongFormat hlongFormat "\n",
-           mesh->EToV[e * mesh->Nverts + 0],
-           mesh->EToV[e * mesh->Nverts + 1],
-           mesh->EToV[e * mesh->Nverts + 2]);
-
-  printf("EToE:\n");
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    printf(dlongFormat dlongFormat dlongFormat "\n",
-           mesh->EToE[e * mesh->Nfaces + 0],
-           mesh->EToE[e * mesh->Nfaces + 1],
-           mesh->EToE[e * mesh->Nfaces + 2]);
-}
diff --git a/src/libP/src/meshProbeSetup2D.c b/src/libP/src/meshProbeSetup2D.c
deleted file mode 100644
index 3152d858f..000000000
--- a/src/libP/src/meshProbeSetup2D.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mpi.h"
-#include "mesh2D.h"
-
-void meshProbeSetup2D(mesh2D* mesh, dfloat* pX, dfloat* pY)
-{
-  dfloat mindist = -1.e-12; // Set minum distance
-  //
-
-  mesh->probeN = 0;
-
-#if 1
-
-  dfloat* probeR           = (dfloat*) calloc(10 * mesh->probeNTotal,sizeof(dfloat));
-  dfloat* probeS           = (dfloat*) calloc(10 * mesh->probeNTotal,sizeof(dfloat));
-  mesh->probeElementIds    = (int*)   calloc(10 * mesh->probeNTotal,sizeof(int));
-  mesh->probeIds           = (int*)   calloc(10 * mesh->probeNTotal,sizeof(int));
-
-  double* A        = (double*) calloc((mesh->dim + 1) * mesh->Nverts,sizeof(double));
-  int* IPIV     = (int*)   calloc(mesh->Nverts,sizeof(int));
-  int* IPIV2    = (int*)   calloc((mesh->Np + 1),sizeof(int));
-
-  dfloat* b       = (dfloat*) calloc((mesh->dim + 1),sizeof(dfloat));
-  double* q       = (double*) calloc(mesh->Nverts,sizeof(double));
-
-  for(int n = 0; n < mesh->Nverts; n++)
-    IPIV[n] = 1;
-
-  for(int n = 0; n < (mesh->Np + 1); n++)
-    IPIV2[n] = 1;
-
-  int N    = (mesh->dim + 1); // A->Nrwos
-  int NRHS =  1; // B->Ncolumns
-  int LDA  = N;
-  int LDB  = (mesh->dim + 1); // B->Nrows
-  int INFO;
-
-  // for(int n=0; n<mesh->mesh->probeNTotal; n++){
-  // // Coordinates of probe
-  // printf("Probe %d  pX: %g pY:%g \n",n, pX[n], pY[n]);
-  // }
-
-// Assumes each probe is in one element, may change later
-  for(int n = 0; n < mesh->probeNTotal; n++) {
-    // Coordinates of probe
-    b[0] = 1.0;
-    b[1] = pX[n];
-    b[2] = pY[n];
-
-    for (int e = 0; e < mesh->Nelements; e++) {
-      // Create A[1 vx vy]
-      for (int v = 0; v < mesh->Nverts; v++) {
-        dfloat vx = mesh->EX[e * mesh->Nverts + v];
-        dfloat vy = mesh->EY[e * mesh->Nverts + v];
-        //
-        A[v * mesh->Nverts + 0] = 1.0;
-        A[v * mesh->Nverts + 1] = vx;
-        A[v * mesh->Nverts + 2] = vy;
-      }
-
-      for(int l = 0; l < (mesh->dim + 1); l++)
-        q[l] = b[l];
-
-      dgesv_(&N,&NRHS,(double*) A,&LDA,IPIV, (double*) q,&LDB,&INFO);
-
-      if(INFO)
-        printf("DGSEV error: %d \n", INFO);
-
-      // Check all non-negative barycentric coordinates
-      // Assumes a probe can be represented by single element!!!!
-
-      dfloat qmin = q[0];
-      for(int i = 1; i < (mesh->dim + 1); i++)
-        qmin = mymin(qmin, q[i]);
-
-      // Catch the element
-      if(qmin > mindist) {
-        // Increase number of probes
-        mesh->probeIds[mesh->probeN] = n + 1;
-        // hold element ids
-        mesh->probeElementIds[mesh->probeN] = e;
-        // hold local r,s coordinates
-        dfloat l1 =  q[2];
-        dfloat l2 =  q[0];
-        dfloat l3 =  q[1];
-
-        probeR[mesh->probeN] = 2. * l3 - 1.;
-        probeS[mesh->probeN] = 2. * l1 - 1.;
-
-        printf("element: %d probe Id: %d qmin:%.5e R: %.5e S:%.5e\n", e, n, qmin,
-               probeR[mesh->probeN],probeS[mesh->probeN]);
-        //
-        mesh->probeN++;
-      }
-    }
-  }
-
-#else  /* if 1 */
-
-  mesh->probeN = 0;
-
-  dfloat* probeR           = (dfloat*) calloc(mesh->probeNTotal,sizeof(dfloat));
-  dfloat* probeS           = (dfloat*) calloc(mesh->probeNTotal,sizeof(dfloat));
-  mesh->probeElementIds    = (int*)   calloc(mesh->probeNTotal,sizeof(int));
-
-  double* A        = (double*) calloc((mesh->dim + 1) * mesh->Nverts,sizeof(double));
-  int* IPIV     = (int*)   calloc(mesh->Nverts,sizeof(int));
-  int* IPIV2    = (int*)   calloc((mesh->Np + 1),sizeof(int));
-
-  dfloat* b       = (dfloat*) calloc((mesh->dim + 1) * mesh->probeNTotal,sizeof(dfloat));
-  double* q       = (double*) calloc(mesh->Nverts * mesh->probeNTotal,sizeof(double));
-
-  for(int n = 0; n < mesh->Nverts; n++)
-    IPIV[n] = 1;
-
-  for(int n = 0; n < (mesh->Np + 1); n++)
-    IPIV2[n] = 1;
-
-  int N    = (mesh->dim + 1); // A->Nrwos
-  int NRHS =  mesh->probeNTotal; // B->Ncolumns
-  int LDA  = N;
-  int LDB  = (mesh->dim + 1); // B->Nrows
-  int INFO;
-
-  // Assumes each probe is in one element, may change later
-  for(int n = 0; n < mesh->probeNTotal; n++) {
-    // Coordinates of probe
-    b[n * mesh->probeNTotal + 0] = 1.0;
-    b[n * mesh->probeNTotal + 1] = pX[n];
-    b[n * mesh->probeNTotal + 2] = pY[n];
-  }
-
-  //
-  for (int e = 0; e < mesh->Nelements; e++) {
-    // Create A[1 vx vy]
-    for (int v = 0; v < mesh->Nverts; v++) {
-      dfloat vx = mesh->EX[e * mesh->Nverts + v];
-      dfloat vy = mesh->EY[e * mesh->Nverts + v];
-      //
-      A[v * mesh->Nverts + 0] = 1.0;
-      A[v * mesh->Nverts + 1] = vx;
-      A[v * mesh->Nverts + 2] = vy;
-    }
-
-    for(int l = 0; l < mesh->probeNTotal * (mesh->dim + 1); l++)
-      q[l] = b[l];
-
-    // Find barycentric coordinates
-    // q = A^-1*b
-
-    dgesv_(&N,&NRHS,(double*) A,&LDA,IPIV, (double*) q,&LDB,&INFO);
-
-    if(INFO)
-      printf("DGSEV error: %d \n", INFO);
-
-    // Check all non-negative barycentric coordinates
-    // Assumes a probe can be represented by single element!!!!
-    for(int n = 0; n < mesh->probeNTotal; n++) {
-      dfloat qmin = q[n * mesh->probeNTotal + 0];
-      for(int i = 1; i < (mesh->dim + 1); i++)
-        qmin = mymin(qmin, q[n * mesh->probeNTotal + i]);
-
-      // Catch the element
-      if(qmin > mindist) {
-        mesh->probeN++;
-        // hold element ids
-        mesh->probeElementIds[n] = e;
-        // hold local r,s coordinates
-        dfloat l1 =  q[n * mesh->probeNTotal + 2];
-        dfloat l2 =  q[n * mesh->probeNTotal + 0];
-        dfloat l3 =  q[n * mesh->probeNTotal + 1];
-
-        probeR[n] = 2. * l3 - 1.;
-        probeS[n] = 2. * l1 - 1.;
-
-        printf("element: %d probe %d qmin:%.5e R: %.5e S:%.5e\n", e, n, qmin, probeR[n],probeS[n]);
-      }
-    }
-  }
-
-#endif
-
-  printf("probe Number: %d \n", mesh->probeN);
-
-  if(mesh->probeN) {
-    //Reallocate ProbeIds and Element Ids, Now take cares of  cares
-    mesh->probeIds        = (int*)   realloc(mesh->probeIds,        mesh->probeN * sizeof(int));
-    mesh->probeElementIds = (int*)   realloc(mesh->probeElementIds, mesh->probeN * sizeof(int));
-    probeR                = (dfloat*) realloc(probeR, mesh->probeN * sizeof(dfloat));
-    probeS                = (dfloat*) realloc(probeS, mesh->probeN * sizeof(dfloat));
-
-    // Compute Vandermonde Matrix and invert  it
-    dfloat* V = (dfloat*) calloc(mesh->Np * (mesh->N + 1) * (mesh->N + 2) / 2, sizeof(dfloat));
-    meshVandermonde2D(mesh->N, mesh->Np, mesh->r, mesh->s, V);
-
-    double* dV = (double*) calloc(mesh->Np * (mesh->N + 1) * (mesh->N + 2) / 2, sizeof(double));
-    for(int n = 0; n < mesh->Np * (mesh->N + 1) * (mesh->N + 2) / 2; ++n)
-      dV[n] = V[n];
-
-    //
-    N    = mesh->Np;
-    int LWORK = mesh->Np * mesh->Np;
-    double* WORK = (double*) calloc(LWORK, sizeof(double));
-
-    dgetrf_(&N,&N,(double*)dV,&N,IPIV2,&INFO);
-    dgetri_(&N,(double*)dV,&N,IPIV2,(double*)WORK,&LWORK,&INFO);
-
-    if(INFO)
-      printf("DGE_TRI/TRF error: %d \n", INFO);
-
-    // Compute Vandermonde matrix of probes
-    dfloat* Vprobe = (dfloat*) calloc(mesh->probeN * mesh->Np,sizeof(dfloat));
-    meshVandermonde2D(mesh->N, mesh->probeN, probeR, probeS, Vprobe);
-
-    mesh->probeI = (dfloat*) calloc(mesh->probeN * mesh->Np, sizeof(dfloat));
-
-    for(int r = 0; r < mesh->probeN; r++)
-      for(int c = 0; c < mesh->Np; c++) {
-        dfloat s = 0;
-        for(int i = 0; i < mesh->Np; i++)
-          s += Vprobe[r * mesh->Np + i] * dV[i * mesh->Np + c];
-        mesh->probeI[r * mesh->Np + c] = s;
-      }
-
-
-    free(V);
-    free(dV);
-    free(WORK);
-  }
-
-  free(IPIV);
-  free(IPIV2);
-  free(probeR);
-  free(probeS);
-  free(A);
-  free(b);
-  free(q);
-}
-
-void meshVandermonde2D(int N, int Npoints, dfloat* r, dfloat* s, dfloat* V)
-{
-  // First convert to ab coordinates
-  dfloat* a = (dfloat*) calloc(Npoints, sizeof(dfloat));
-  dfloat* b = (dfloat*) calloc(Npoints, sizeof(dfloat));
-  for(int n = 0; n < Npoints; n++) {
-    if(fabs(s[n] - 1.0) > 1e-8)
-      a[n] = 2.0 * (1. + r[n]) / (1.0 - s[n]) - 1.0;
-    else
-      a[n] = -1.0;
-
-    b[n] = s[n];
-  }
-
-  int sk = 0;
-
-  int Np = (N + 1) * (N + 2) / 2;
-
-  for(int i = 0; i <= N; i++)
-    for(int j = 0; j <= N - i; j++) {
-      for(int n = 0; n < Npoints; n++)
-        V[n * Np + sk] = meshSimplex2D(a[n], b[n], i, j);
-      sk++;
-    }
-
-
-  free(a);
-  free(b);
-}
-
-dfloat meshSimplex2D(dfloat a, dfloat b, int i, int j)
-{
-  //
-  dfloat p1 = meshJacobiP(a,0,0,i);
-  dfloat p2 = meshJacobiP(b,2 * i + 1,0,j);
-  dfloat P = sqrt(2.0) * p1 * p2 * pow(1 - b,i);
-
-  return P;
-}
-
-dfloat meshJacobiP(dfloat a, dfloat alpha, dfloat beta, int N)
-{
-  dfloat ax = a;
-
-  dfloat* P = (dfloat*) calloc((N + 1), sizeof(dfloat));
-
-  // Zero order
-  dfloat gamma0 =
-    pow(2,
-        (alpha + beta + 1)) / (alpha + beta + 1) * meshFactorial(alpha) * meshFactorial(beta) /
-    meshFactorial(alpha + beta);
-  dfloat p0     = 1.0 / sqrt(gamma0);
-
-  if (N == 0) {
-    free(P);
-    return p0;
-  }
-  P[0] = p0;
-
-  // first order
-  dfloat gamma1 = (alpha + 1) * (beta + 1) / (alpha + beta + 3) * gamma0;
-  dfloat p1     = ((alpha + beta + 2) * ax / 2 + (alpha - beta) / 2) / sqrt(gamma1);
-  if (N == 1) {
-    free(P);
-    return p1;
-  }
-
-  P[1] = p1;
-
-  /// Repeat value in recurrence.
-  dfloat aold = 2 / (2 + alpha + beta) * sqrt((alpha + 1.) * (beta + 1.) / (alpha + beta + 3.));
-  /// Forward recurrence using the symmetry of the recurrence.
-  for(int i = 1; i <= N - 1; ++i) {
-    dfloat h1 = 2. * i + alpha + beta;
-    dfloat anew = 2. / (h1 + 2.) *
-                  sqrt(
-      (i + 1.) * (i + 1. + alpha + beta) * (i + 1 + alpha) * (i + 1 + beta) / (h1 + 1) / (h1 + 3));
-    dfloat bnew = -(alpha * alpha - beta * beta) / h1 / (h1 + 2);
-    P[i + 1] = 1. / anew * ( -aold * P[i - 1] + (ax - bnew) * P[i]);
-    aold = anew;
-  }
-
-  dfloat pN = P[N];
-  free(P);
-  return pN;
-}
-
-dfloat meshFactorial(int n)
-{
-  if(n == 0)
-    return 1;
-  else
-    return n * meshFactorial(n - 1);
-}
diff --git a/src/libP/src/meshRecursiveSpectralBisectionPartition.c b/src/libP/src/meshRecursiveSpectralBisectionPartition.c
deleted file mode 100644
index 891f631b7..000000000
--- a/src/libP/src/meshRecursiveSpectralBisectionPartition.c
+++ /dev/null
@@ -1,237 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include "mpi.h"
-#include "mesh.h"
-
-#if 0
-#include "parmetis.h"
-#include "defs.h"
-#endif
-
-void meshRecursiveSpectralBisectionPartition(mesh_t* mesh)
-{
-  printf("meshRecursiveSpectralBisectionPartition is disabled, exiting\n");
-  exit(-1);
-
-#if 0
-
-  int rank = mesh->rank;
-  int size = mesh->size;
-  int Nverts = mesh->Nverts;
-
-  int* allNelements = (int*) calloc(size, sizeof(int));
-
-  /* local number of elements */
-  int Nelements = mesh->Nelements;
-
-  /* find number of elements on all processors */
-  MPI_Allgather(&Nelements, 1, MPI_INT, allNelements, 1, MPI_INT, mesh->comm);
-
-  /* element distribution -- cumulative element count on processes */
-  idx_t* elmdist = (idx_t*) calloc(size + 1, sizeof(idx_t)); // element starts
-
-  int e,r,n;
-
-  elmdist[0] = 0;
-  for(r = 0; r < size; ++r)
-    elmdist[r + 1] = elmdist[r] + allNelements[r];
-
-  /* list of element starts */
-  idx_t* eptr = (idx_t*) calloc(Nelements + 1, sizeof(idx_t)); // element starts
-
-  eptr[0] = 0;
-  for(e = 0; e < Nelements; ++e)
-    eptr[e + 1] = eptr[e] + Nverts;
-
-  /* local element to vertex */
-  idx_t* eind = (idx_t*) calloc(Nverts * Nelements, sizeof(idx_t)); // element starts
-
-  for(e = 0; e < Nelements; ++e)
-    for(n = 0; n < Nverts; ++n)
-      eind[e * Nverts + n] = mesh->EToV[e * Nverts + n];
-
-  /* weight per element */
-  idx_t* elmwgt = (idx_t*) calloc(Nelements, sizeof(idx_t)); // element starts
-
-  for(e = 0; e < Nelements; ++e)
-    elmwgt[e] = 1.;
-
-  /* weight flag */
-  int wgtflag = 0;
-
-  /* number flag (1=fortran, 0=c) */
-  int numflag = 0;
-
-  /* ncon = 1 */
-  int ncon = 1;
-
-  /* nodes on element face */
-  int ncommonnodes = mesh->NfaceVertices;
-
-  /* number of partitions */
-  int nparts = size;
-
-  /* tpwgts */
-  float* tpwgts = (float*) calloc(nparts, sizeof(float));
-
-  for(e = 0; e < nparts; ++e)
-    tpwgts[e] = 1. / (float)nparts;
-
-#define MAXNCON 32
-  float ubvec[MAXNCON];
-
-  for (n = 0; n < ncon; ++n)
-    ubvec[n] = UNBALANCE_FRACTION;
-
-  int options[10];
-
-  options[0] = 1;
-  options[PMV3_OPTION_DBGLVL] = 7;
-
-  options[PMV3_OPTION_SEED] = 0;
-
-  int edgecut;
-
-  idx_t* part = (idx_t*) calloc(Nelements, sizeof(idx_t)); // element starts
-
-  MPI_Comm comm;
-  MPI_Comm_dup(MPI_COMM_WORLD, &comm);
-
-  ParMETIS_V3_PartMeshKway
-    (elmdist,
-    eptr,
-    eind,
-    elmwgt,
-    &wgtflag,
-    &numflag,
-    &ncon,
-    &ncommonnodes,
-    &nparts,
-    tpwgts,
-    ubvec,
-    options,
-    &edgecut,
-    part,
-    &comm);
-
-  /* now repartition EToV */
-
-  /* add up how many ints need to be sent to each process (element count in each partition) */
-  int* outNdata = (int*) calloc(size, sizeof(int));
-  for(e = 0; e < Nelements; ++e)
-    outNdata[part[e]] += Nverts;
-
-  /* get count of incoming elements from each process */
-  int* inNdata = (int*) calloc(size, sizeof(int));
-  MPI_Alltoall(outNdata, 1, MPI_INT,
-               inNdata,  1, MPI_INT,
-               MPI_COMM_WORLD);
-
-  /* find offsets into outgoing array for each rank data */
-  int* outStarts = (int*) calloc(size, sizeof(int));
-  for(r = 1; r < size; ++r)
-    outStarts[r] = outStarts[r - 1] + outNdata[r - 1];
-
-  /* find offsets into incoming array for each rank's data */
-  int* inStarts = (int*) calloc(size, sizeof(int));
-  for(r = 1; r < size; ++r)
-    inStarts[r] = inStarts[r - 1] + inNdata[r - 1];
-
-  /* create array for outgoing data */
-  int* outEToV = (int*) calloc(Nelements * Nverts, sizeof(int));
-  int* outElementInfo = (int*) calloc(Nelements * Nverts, sizeof(int));
-
-  int* outCnt  = (int*) calloc(size, sizeof(int));
-  dfloat* outEX = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-  dfloat* outEY = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-  dfloat* outEZ = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-
-  for(r = 0; r < size; ++r)
-    outCnt[r] = outStarts[r];
-
-  for(e = 0; e < Nelements; ++e)
-    for(n = 0; n < Nverts; ++n) {
-      outEToV[outCnt[part[e]]] = mesh->EToV[e * Nverts + n];
-      outEX[outCnt[part[e]]] = mesh->EX[e * Nverts + n];
-      outEY[outCnt[part[e]]] = mesh->EY[e * Nverts + n];
-      outEZ[outCnt[part[e]]] = mesh->EZ[e * Nverts + n];
-      outElementInfo[outCnt[part[e]]] = mesh->elementInfo[e]; // yes this is lazy
-      ++outCnt[part[e]];
-    }
-
-  // reset number of elements
-  Nelements = 0;
-  for(r = 0; r < size; ++r)
-    //   printf("rank %d gets %d new elements from rank %d \n", rank, inNdata[r]/Nverts, r);
-    Nelements += inNdata[r] / Nverts;
-
-  /* send elements to their new rank */
-  hlong* inEToV = (hlong*) calloc(Nelements * Nverts, sizeof(hlong));
-  int* inElementInfo = (int*) calloc(Nelements * Nverts, sizeof(int));
-  dfloat* inEX = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-  dfloat* inEY = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-  dfloat* inEZ = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-
-  MPI_Alltoallv(outEToV, outNdata, outStarts, MPI_HLONG,
-                inEToV,   inNdata,  inStarts, MPI_HLONG,
-                mesh->comm);
-
-  MPI_Alltoallv(outElementInfo, outNdata, outStarts, MPI_INT,
-                inElementInfo,   inNdata,  inStarts, MPI_INT,
-                mesh->comm);
-
-  MPI_Alltoallv(outEX, outNdata, outStarts, MPI_DFLOAT,
-                inEX,   inNdata,  inStarts, MPI_DFLOAT,
-                mesh->comm);
-
-  MPI_Alltoallv(outEY, outNdata, outStarts, MPI_DFLOAT,
-                inEY,   inNdata,  inStarts, MPI_DFLOAT,
-                mesh->comm);
-
-  MPI_Alltoallv(outEZ, outNdata, outStarts, MPI_DFLOAT,
-                inEZ,   inNdata,  inStarts, MPI_DFLOAT,
-                mesh->comm);
-
-  free(mesh->EToV);
-  free(mesh->EX);
-  free(mesh->EY);
-  free(mesh->EZ);
-  free(mesh->elementInfo);
-
-  // scrape EToV from inEToV (may be different type hlong to EToV)
-  mesh->EToV = (hlong*) calloc(Nelements * Nverts, sizeof(hlong));
-  mesh->elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
-  mesh->EX   = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-  mesh->EY   = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-  mesh->EZ   = (dfloat*) calloc(Nelements * Nverts, sizeof(dfloat));
-  mesh->elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
-  for(e = 0; e < Nelements; ++e)
-    for(n = 0; n < Nverts; ++n) {
-      mesh->EToV[e * Nverts + n] = inEToV[e * Nverts + n];
-      mesh->elementInfo[e] = inElementInfo[e * Nverts]; // lazy
-      mesh->EX[e * Nverts + n] = inEX[e * Nverts + n];
-      mesh->EY[e * Nverts + n] = inEY[e * Nverts + n];
-      mesh->EZ[e * Nverts + n] = inEZ[e * Nverts + n];
-    }
-
-  // reset element count
-  mesh->Nelements = Nelements;
-
-  // free temporaries
-  free(allNelements);
-  free(tpwgts);
-  free(outNdata);
-  free(outStarts);
-  free(outEToV);
-  free(outCnt);
-  free(inNdata);
-  free(inStarts);
-  free(inEToV);
-
-  free(elmdist);
-  free(eptr);
-  free(eind);
-  free(elmwgt);
-  free(part);
-#endif
-}
diff --git a/src/libP/src/meshReport3D.c b/src/libP/src/meshReport3D.c
deleted file mode 100644
index 304f0da87..000000000
--- a/src/libP/src/meshReport3D.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-void meshReport3D(const char* mess, mesh3D* mesh)
-{
-  printf("%s: (Nfields=%d,Np=%d,Nfaces=%d,Nfp=%d,Nvgeo=%d)\n",
-         mess, mesh->Nfields, mesh->Np, mesh->Nfaces, mesh->Nfp, mesh->Nvgeo);
-
-  dfloat maxq = 0, minq = 1e9;
-  dfloat maxrhsq = 0, minrhsq = 1e9;
-
-  for(int n = 0; n < mesh->Np * mesh->Nelements * mesh->Nfields; ++n) {
-    maxq = mymax(maxq, mesh->q[n]);
-    minq = mymin(minq, mesh->q[n]);
-    maxrhsq = mymax(maxrhsq, mesh->rhsq[n]);
-    minrhsq = mymin(minrhsq, mesh->rhsq[n]);
-
-    printf("%g ", mesh->rhsq[n]);
-    if((n % mesh->Nfields) == mesh->Nfields - 1)
-      printf("\n");
-  }
-  printf("q in %g,%g\n", minq, maxq);
-  printf("rhsq in %g,%g\n", minrhsq, maxrhsq);
-}
diff --git a/src/libP/src/meshSetup.c b/src/libP/src/meshSetup.c
deleted file mode 100644
index db673a7e6..000000000
--- a/src/libP/src/meshSetup.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh2D.h"
-#include "mesh3D.h"
-
-mesh_t* meshSetup(char* filename, int N, setupAide &options)
-{
-  int dim, elementType;
-
-  options.getArgs("ELEMENT TYPE", elementType);
-  options.getArgs("MESH DIMENSION", dim);
-
-  mesh_t* mesh;
-  switch(elementType) {
-  case TRIANGLES:
-    mesh = meshSetupTri2D(filename, N);
-    break;
-  case QUADRILATERALS: {
-    if(dim == 2) {
-      mesh = meshSetupQuad2D(filename, N);
-    }else  {
-      dfloat radius = 1;
-      options.getArgs("SPHERE RADIUS", radius);
-      mesh = meshSetupQuad3D(filename, N, radius);
-    }
-    break;
-  }
-  case TETRAHEDRA:
-    mesh = meshSetupTet3D(filename, N);
-    break;
-  case HEXAHEDRA:
-    mesh = meshSetupHex3D(filename, N);
-    break;
-  }
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupBoxHex3D.c b/src/libP/src/meshSetupBoxHex3D.c
deleted file mode 100644
index 89507baef..000000000
--- a/src/libP/src/meshSetupBoxHex3D.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh3D.h"
-
-mesh3D* meshSetupBoxHex3D(int N, setupAide &options)
-{
-  mesh_t* mesh = new mesh_t();
-
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  mesh->Nfields = 1;
-  mesh->dim = 3;
-  mesh->Nverts = 8; // number of vertices per element
-  mesh->Nfaces = 6;
-  mesh->NfaceVertices = 4;
-
-  // vertices on each face
-  int faceVertices[6][4] =
-  {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}};
-
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-
-  memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  // build an NX x NY x NZ periodic box grid
-
-  hlong NX = 10, NY = 10, NZ = 10; // defaults
-
-  options.getArgs("BOX NX", NX);
-  options.getArgs("BOX NY", NY);
-  options.getArgs("BOX NZ", NZ);
-
-  dfloat XMIN = -1, XMAX = +1; // default bi-unit cube
-  dfloat YMIN = -1, YMAX = +1;
-  dfloat ZMIN = -1, ZMAX = +1;
-
-  options.getArgs("BOX XMIN", XMIN);
-  options.getArgs("BOX YMIN", YMIN);
-  options.getArgs("BOX ZMIN", ZMIN);
-
-  options.getArgs("BOX XMAX", XMAX);
-  options.getArgs("BOX YMAX", YMAX);
-  options.getArgs("BOX ZMAX", ZMAX);
-
-  hlong allNelements = NX * NY * NZ;
-
-  hlong chunkNelements = allNelements / size;
-
-  hlong start = chunkNelements * rank;
-  hlong end   = chunkNelements * (rank + 1);
-
-  if(mesh->rank == (size - 1))
-    end = allNelements;
-
-  mesh->Nnodes = NX * NY * NZ; // assume periodic and global number of nodes
-  mesh->Nelements = end - start;
-  mesh->NboundaryFaces = 0;
-
-  printf("Rank %d initially has %d elements\n", mesh->rank, mesh->Nelements);
-
-  mesh->EToV = (hlong*) calloc(mesh->Nelements * mesh->Nverts, sizeof(hlong));
-
-  mesh->EX = (dfloat*) calloc(mesh->Nelements * mesh->Nverts, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nelements * mesh->Nverts, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(mesh->Nelements * mesh->Nverts, sizeof(dfloat));
-
-  mesh->elementInfo = (hlong*) calloc(mesh->Nelements, sizeof(hlong));
-
-  // [0,NX]
-  dfloat dx = (XMAX - XMIN) / NX; // xmin+0*dx, xmin + NX*(XMAX-XMIN)/NX
-  dfloat dy = (YMAX - YMIN) / NY;
-  dfloat dz = (ZMAX - ZMIN) / NZ;
-  for(hlong n = start; n < end; ++n) {
-    int i = n % NX;      // [0, NX)
-    int j = (n / NX) % NY; // [0, NY)
-    int k = n / (NX * NY); // [0, NZ)
-
-    hlong e = n - start;
-
-    int ip = (i + 1) % NX;
-    int jp = (j + 1) % NY;
-    int kp = (k + 1) % NZ;
-
-    // do not use for coordinates
-    mesh->EToV[e * mesh->Nverts + 0] = i  +  j * NX + k * NX * NY;
-    mesh->EToV[e * mesh->Nverts + 1] = ip +  j * NX + k * NX * NY;
-    mesh->EToV[e * mesh->Nverts + 2] = ip + jp * NX + k * NX * NY;
-    mesh->EToV[e * mesh->Nverts + 3] = i  + jp * NX + k * NX * NY;
-
-    mesh->EToV[e * mesh->Nverts + 4] = i  +  j * NX + kp * NX * NY;
-    mesh->EToV[e * mesh->Nverts + 5] = ip +  j * NX + kp * NX * NY;
-    mesh->EToV[e * mesh->Nverts + 6] = ip + jp * NX + kp * NX * NY;
-    mesh->EToV[e * mesh->Nverts + 7] = i  + jp * NX + kp * NX * NY;
-
-    dfloat xo = XMIN + dx * i;
-    dfloat yo = YMIN + dy * j;
-    dfloat zo = ZMIN + dz * k;
-
-    dfloat* ex = mesh->EX + e * mesh->Nverts;
-    dfloat* ey = mesh->EY + e * mesh->Nverts;
-    dfloat* ez = mesh->EZ + e * mesh->Nverts;
-
-    ex[0] = xo;
-    ey[0] = yo;
-    ez[0] = zo;
-    ex[1] = xo + dx;
-    ey[1] = yo;
-    ez[1] = zo;
-    ex[2] = xo + dx;
-    ey[2] = yo + dy;
-    ez[2] = zo;
-    ex[3] = xo;
-    ey[3] = yo + dy;
-    ez[3] = zo;
-
-    ex[4] = xo;
-    ey[4] = yo;
-    ez[4] = zo + dz;
-    ex[5] = xo + dx;
-    ey[5] = yo;
-    ez[5] = zo + dz;
-    ex[6] = xo + dx;
-    ey[6] = yo + dy;
-    ez[6] = zo + dz;
-    ex[7] = xo;
-    ey[7] = yo + dy;
-    ez[7] = zo + dz;
-
-    mesh->elementInfo[e] = 1; // ?
-  }
-
-#if 0
-  char fileName[BUFSIZ];
-  sprintf(fileName, "box%04d.dat", mesh->rank);
-
-  FILE* fp = fopen(fileName, "w");
-
-  fprintf(fp, "EToV = [\n");
-  for(hlong e = 0; e < mesh->Nelements; ++e) {
-    for(int v = 0; v < mesh->Nverts; ++v)
-      fprintf(fp, "%d ", mesh->EToV[e * mesh->Nverts + v]);
-    fprintf(fp, "\n");
-  }
-
-  fclose(fp);
-
-  MPI_Finalize();
-  exit(0);
-#endif
-
-  // partition elements using Morton ordering & parallel sort
-  meshGeometricPartition3D(mesh);
-  //  meshRecursiveSpectralBisectionPartition(mesh);
-
-  mesh->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int));
-
-  mesh->boundaryInfo = NULL; // no boundaries
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // load reference (r,s,t) element nodes
-  meshLoadReferenceNodesHex3D(mesh, N);
-
-  // compute physical (x,y) locations of the element nodes
-  meshPhysicalNodesHex3D(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsHex3D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectPeriodicFaceNodes3D(mesh,XMAX - XMIN,YMAX - YMIN,ZMAX - ZMIN); // needs to fix this !
-
-  // connect elements to boundary faces
-  //  meshConnectBoundary(mesh);
-#if 0
-  // diagnostic
-  for(hlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Nfaces * mesh->Nfp; ++n) {
-      hlong idM = mesh->vmapM[e * mesh->Nfaces * mesh->Nfp + n];
-      hlong idP = mesh->vmapP[e * mesh->Nfaces * mesh->Nfp + n];
-
-      dfloat dx = mesh->x[idP] - mesh->x[idM];
-      dfloat dy = mesh->y[idP] - mesh->y[idM];
-      dfloat dz = mesh->z[idP] - mesh->z[idM];
-
-      dfloat d = sqrt(dx * dx + dy * dy + dz * dz);
-      printf("%d,%d |d|=|%lf,%lf,%lf|=%lf\n", idM, idP, dx, dy, dz, d);
-    }
-
-#endif
-
-  // compute surface geofacs (including halo)
-  meshSurfaceGeometricFactorsHex3D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  dfloat rkc[6] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0,
-                   1.};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, (Nrk + 1) * sizeof(dfloat));
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupBoxQuad2D.c b/src/libP/src/meshSetupBoxQuad2D.c
deleted file mode 100644
index 2a63cca63..000000000
--- a/src/libP/src/meshSetupBoxQuad2D.c
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh2D.h"
-
-mesh2D* meshSetupBoxQuad2D(int N, setupAide &options)
-{
-  //  mesh_t *mesh = new mesh_t[1];
-  mesh_t* mesh = (mesh_t*) calloc(1, sizeof(mesh_t));
-
-  int rank, size;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &mesh->comm);
-
-  mesh->rank = rank;
-  mesh->size = size;
-
-  mesh->Nfields = 1;
-  mesh->dim = 2;
-  mesh->Nverts = 4; // number of vertices per element
-  mesh->Nfaces = 4;
-  mesh->NfaceVertices = 2;
-
-  // vertices on each face
-  int faceVertices[4][2] = {{0,1},{1,2},{2,3},{3,0}};
-
-  mesh->faceVertices =
-    (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int));
-
-  memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int));
-
-  // build an NX x NY x NZ periodic box grid
-
-  hlong NX = 10, NY = 10; // defaults
-
-  options.getArgs("BOX NX", NX);
-  options.getArgs("BOX NY", NY);
-
-  dfloat XMIN = -1, XMAX = +1; // default bi-unit cube
-  dfloat YMIN = -1, YMAX = +1;
-
-  options.getArgs("BOX XMIN", XMIN);
-  options.getArgs("BOX YMIN", YMIN);
-
-  options.getArgs("BOX XMAX", XMAX);
-  options.getArgs("BOX YMAX", YMAX);
-
-  hlong allNelements = NX * NY;
-
-  hlong chunkNelements = allNelements / size;
-
-  hlong start = chunkNelements * rank;
-  hlong end   = chunkNelements * (rank + 1);
-
-  if(mesh->rank == (size - 1))
-    end = allNelements;
-
-  mesh->Nnodes = NX * NY; // assume periodic and global number of nodes
-  mesh->Nelements = end - start;
-  mesh->NboundaryFaces = 0;
-
-  printf("Rank %d initially has %d elements\n", mesh->rank, mesh->Nelements);
-
-  mesh->EToV = (hlong*) calloc(mesh->Nelements * mesh->Nverts, sizeof(hlong));
-
-  mesh->EX = (dfloat*) calloc(mesh->Nelements * mesh->Nverts, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nelements * mesh->Nverts, sizeof(dfloat));
-  mesh->EZ = (dfloat*) calloc(mesh->Nelements * mesh->Nverts, sizeof(dfloat));
-
-  mesh->elementInfo = (hlong*) calloc(mesh->Nelements, sizeof(hlong));
-
-  // [0,NX]
-  dfloat dx = (XMAX - XMIN) / NX; // xmin+0*dx, xmin + NX*(XMAX-XMIN)/NX
-  dfloat dy = (YMAX - YMIN) / NY;
-
-  for(hlong n = start; n < end; ++n) {
-    int i = n % NX;      // [0, NX)
-    int j = (n / NY); // [0, NY)
-    hlong e = n - start;
-
-    int ip = (i + 1) % NX;
-    int jp = (j + 1) % NY;
-
-    // do not use for coordinates
-    mesh->EToV[e * mesh->Nverts + 0] = i  +  j * NX;
-    mesh->EToV[e * mesh->Nverts + 1] = ip +  j * NX;
-    mesh->EToV[e * mesh->Nverts + 2] = ip + jp * NX;
-    mesh->EToV[e * mesh->Nverts + 3] = i  + jp * NX;
-
-    dfloat xo = XMIN + dx * i;
-    dfloat yo = YMIN + dy * j;
-
-    dfloat* ex = mesh->EX + e * mesh->Nverts;
-    dfloat* ey = mesh->EY + e * mesh->Nverts;
-
-    ex[0] = xo;
-    ey[0] = yo;
-    ex[1] = xo + dx;
-    ey[1] = yo;
-    ex[2] = xo + dx;
-    ey[2] = yo + dy;
-    ex[3] = xo;
-    ey[3] = yo + dy;
-
-    mesh->elementInfo[e] = 1; // ?
-  }
-
-#if 0
-  char fileName[BUFSIZ];
-  sprintf(fileName, "box%04d.dat", mesh->rank);
-
-  FILE* fp = fopen(fileName, "w");
-
-  fprintf(fp, "EToV = [\n");
-  for(hlong e = 0; e < mesh->Nelements; ++e) {
-    for(int v = 0; v < mesh->Nverts; ++v)
-      fprintf(fp, "%d ", mesh->EToV[e * mesh->Nverts + v]);
-    fprintf(fp, "\n");
-  }
-
-  fclose(fp);
-
-  MPI_Finalize();
-  exit(0);
-#endif
-
-  // partition elements using Morton ordering & parallel sort
-  meshGeometricPartition2D(mesh);
-
-  //meshRecursiveSpectralBisectionPartition(mesh);
-
-  mesh->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int));
-  mesh->boundaryInfo = NULL; // no boundaries
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // load reference (r,s,t) element nodes
-  meshLoadReferenceNodesQuad2D(mesh, N);
-
-  // compute physical (x,y) locations of the element nodes
-  meshPhysicalNodesQuad2D(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsQuad2D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectPeriodicFaceNodes2D(mesh,XMAX - XMIN,YMAX - YMIN);
-
-#if 0
-  // diagnostic
-  for(hlong e = 0; e < mesh->Nelements; ++e)
-    for(int n = 0; n < mesh->Nfaces * mesh->Nfp; ++n) {
-      hlong idM = mesh->vmapM[e * mesh->Nfaces * mesh->Nfp + n];
-      hlong idP = mesh->vmapP[e * mesh->Nfaces * mesh->Nfp + n];
-
-      dfloat dx = mesh->x[idP] - mesh->x[idM];
-      dfloat dy = mesh->y[idP] - mesh->y[idM];
-      dfloat dz = mesh->z[idP] - mesh->z[idM];
-
-      dfloat d = sqrt(dx * dx + dy * dy + dz * dz);
-      printf("%d,%d |d|=|%lf,%lf,%lf|=%lf\n", idM, idP, dx, dy, dz, d);
-    }
-
-#endif
-
-  // compute surface geofacs (including halo)
-  meshSurfaceGeometricFactorsQuad2D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  dfloat rkc[6] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0,
-                   1.};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, (Nrk + 1) * sizeof(dfloat));
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupHex3D.c b/src/libP/src/meshSetupHex3D.c
deleted file mode 100644
index efa183d24..000000000
--- a/src/libP/src/meshSetupHex3D.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh3D.h"
-
-mesh3D* meshSetupHex3D(char* filename, int N)
-{
-  // read chunk of elements
-  mesh3D* mesh = meshParallelReaderHex3D(filename);
-
-  mesh->Nfields = 1; // TW: note this is a temporary patch (halo exchange depends on nfields)
-
-  // partition elements using Morton ordering & parallel sort
-  meshGeometricPartition3D(mesh);
-  //meshRecursiveSpectralBisectionPartition(mesh);
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-  // load reference (r,s,t) element nodes
-  meshLoadReferenceNodesHex3D(mesh, N);
-
-  // compute physical (x,y) locations of the element nodes
-  meshPhysicalNodesHex3D(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsHex3D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectFaceNodes3D(mesh);
-
-  // compute surface geofacs (including halo)
-  meshSurfaceGeometricFactorsHex3D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  dfloat rkc[6] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0,
-                   1.};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, (Nrk + 1) * sizeof(dfloat));
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupQuad2D.c b/src/libP/src/meshSetupQuad2D.c
deleted file mode 100644
index 13b69cb28..000000000
--- a/src/libP/src/meshSetupQuad2D.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh2D.h"
-
-mesh2D* meshSetupQuad2D(char* filename, int N)
-{
-  // read chunk of elements
-  mesh2D* mesh = meshParallelReaderQuad2D(filename);
-
-  // partition elements using Morton ordering & parallel sort
-  meshGeometricPartition2D(mesh);
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-  // load reference (r,s) element nodes
-  meshLoadReferenceNodesQuad2D(mesh, N);
-
-  // compute physical (x,y) locations of the element nodes
-  meshPhysicalNodesQuad2D(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsQuad2D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectFaceNodes2D(mesh);
-
-  // compute surface geofacs
-  meshSurfaceGeometricFactorsQuad2D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  dfloat rkc[6] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0,
-                   1.};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, (Nrk + 1) * sizeof(dfloat));
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupQuad3D.c b/src/libP/src/meshSetupQuad3D.c
deleted file mode 100644
index e7fff4886..000000000
--- a/src/libP/src/meshSetupQuad3D.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh2D.h"
-#include "mesh3D.h"
-
-mesh_t* meshSetupQuad3D(char* filename, int N, dfloat sphereRadius)
-{
-  // read chunk of elements
-  mesh_t* mesh = meshParallelReaderQuad3D(filename);
-
-  // set sphere radius (will be used later in building physical nodes)
-  mesh->sphereRadius = sphereRadius;
-
-  // partition elements using Morton ordering & parallel sort
-  meshGeometricPartition3D(mesh); // need to double check this
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-#if 1
-  for(int e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      if(mesh->EToB[e * mesh->Nfaces + f] != -1)
-        printf("YIKES %d %d %d ", e, f, mesh->EToB[e * mesh->Nfaces + f]);
-      if(mesh->EToE[e * mesh->Nfaces + f] == e ||
-         mesh->EToE[e * mesh->Nfaces + f] == -1)
-        printf("YUCKS %d %d %d ", e, f, mesh->EToB[e * mesh->Nfaces + f]);
-    }
-
-#endif
-
-  // load reference (r,s) element nodes
-  meshLoadReferenceNodesQuad2D(mesh, N);
-
-  // compute physical (x,y,z) locations of the element nodes
-  meshPhysicalNodesQuad3D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsQuad3D(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectFaceNodes3D(mesh);
-
-  for(int n = 0; n < mesh->Nfp * mesh->Nelements * mesh->Nfaces; ++n)
-    if(mesh->vmapM[n] == mesh->vmapP[n])
-      printf("node %d matches self \n", n);
-
-
-  // compute surface geofacs
-  meshSurfaceGeometricFactorsQuad3D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  dfloat rkc[6] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0,
-                   1.};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, (Nrk + 1) * sizeof(dfloat));
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupTet3D.c b/src/libP/src/meshSetupTet3D.c
deleted file mode 100644
index cae1f3e01..000000000
--- a/src/libP/src/meshSetupTet3D.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh3D.h"
-
-mesh3D* meshSetupTet3D(char* filename, int N)
-{
-  // read chunk of elements
-  mesh3D* mesh = meshParallelReaderTet3D(filename);
-
-  // partition elements using Morton ordering & parallel sort
-  meshGeometricPartition3D(mesh);
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-  // load reference (r,s,t) element nodes
-  meshLoadReferenceNodesTet3D(mesh, N);
-
-  // compute physical (x,y,z) locations of the element nodes
-  meshPhysicalNodesTet3D(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsTet3D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectFaceNodes3D(mesh);
-
-  // compute surface geofacs
-  meshSurfaceGeometricFactorsTet3D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  dfloat rkc[5] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, Nrk * sizeof(dfloat));
-
-#if 0
-  for(int eM = 0; eM < mesh->Nelements; ++eM)
-    for(int fM = 0; fM < mesh->Nfaces; ++fM) {
-      int eP = mesh->EToE[eM * mesh->Nfaces + fM];
-      int fP = mesh->EToF[eM * mesh->Nfaces + fM];
-
-      if(eP < mesh->Nelements) {
-        if(eP < 0) {
-          eP = eM;
-          fP = fM;
-        }
-
-        dfloat sc = 1;
-        if(eM == eP) sc = -1;
-
-        int baseM = (eM * mesh->Nfaces + fM) * mesh->Nsgeo;
-        int baseP = (eP * mesh->Nfaces + fP) * mesh->Nsgeo;
-
-        dfloat avenx = mesh->sgeo[baseM + NXID] + sc * mesh->sgeo[baseP + NXID];
-        dfloat aveny = mesh->sgeo[baseM + NYID] + sc * mesh->sgeo[baseP + NYID];
-        dfloat avenz = mesh->sgeo[baseM + NZID] + sc * mesh->sgeo[baseP + NZID];
-        dfloat dsJ   = mesh->sgeo[baseM + SJID] - mesh->sgeo[baseP + SJID];
-        dfloat aven  = norm3(avenx,aveny,avenz);
-
-        if(aven > 1e-4)
-          printf("(%d,%d=>%d,%d) aven = %g,%g,%g", eM,fM, eP,fP, avenx,aveny,avenz);
-
-        if(fabs(dsJ) > 1e-4)
-          printf("sJ mismatch %g\n", dsJ);
-
-        for(int n = 0; n < mesh->Nfp; ++n) {
-          int id = n + fM * mesh->Nfp + eM * mesh->Nfp * mesh->Nfaces;
-          int idM = mesh->vmapM[id];
-          int idP = mesh->vmapP[id];
-
-          dfloat d = norm3(mesh->x[idM] - mesh->x[idP],
-                           mesh->y[idM] - mesh->y[idP],
-                           mesh->z[idM] - mesh->z[idP]);
-
-          if(d > 1e-4)
-            printf(", %g\n", d);
-        }
-      }
-    }
-
-#endif
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupTri2D.c b/src/libP/src/meshSetupTri2D.c
deleted file mode 100644
index ad287c419..000000000
--- a/src/libP/src/meshSetupTri2D.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh2D.h"
-
-mesh2D* meshSetupTri2D(char* filename, int N)
-{
-  // read chunk of elements
-  mesh2D* mesh = meshParallelReaderTri2D(filename);
-
-  // partition elements using Morton ordering & parallel sort
-
-  meshGeometricPartition2D(mesh);
-
-  //printf("Space-filling is off\n");
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-  // load reference (r,s) element nodes
-  meshLoadReferenceNodesTri2D(mesh, N);
-
-  // compute physical (x,y) locations of the element nodes
-  meshPhysicalNodesTri2D(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsTri2D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectFaceNodes2D(mesh);
-
-  // compute surface geofacs
-  meshSurfaceGeometricFactorsTri2D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  // added one more for advanced time step
-  dfloat rkc[6] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0,
-                   1.0};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, (Nrk + 1) * sizeof(dfloat));
-
-  //Adam-Bashforth
-  mesh->mrab[0] = 23. / 12.;   // deprecated
-  mesh->mrab[1] = -4. / 3.;
-  mesh->mrab[2] =  5. / 12.;
-
-  //AB half step
-  mesh->mrabb[0] = 17. / 24.;  // deprecated
-  mesh->mrabb[1] = -7. / 24.;
-  mesh->mrabb[2] =  2. / 24.;
-
-  // Clasical Adams-Bashforth Coefficients
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSetupTri3D.c b/src/libP/src/meshSetupTri3D.c
deleted file mode 100644
index a92c966a8..000000000
--- a/src/libP/src/meshSetupTri3D.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "mesh2D.h"
-#include "mesh3D.h"
-
-mesh3D* meshSetupTri3D(char* filename, int N, double sphereRadius)
-{
-  // read chunk of elements
-  mesh3D* mesh = meshParallelReaderTri3D(filename);
-
-  // set sphere radius (will be used later in building physical nodes)
-  mesh->sphereRadius = sphereRadius;
-
-  // partition elements using Morton ordering & parallel sort
-  meshGeometricPartition3D(mesh);
-
-  // connect elements using parallel sort
-  meshParallelConnect(mesh);
-
-  // print out connectivity statistics
-  meshPartitionStatistics(mesh);
-
-  // connect elements to boundary faces
-  meshConnectBoundary(mesh);
-
-  // load reference (r,s) element nodes
-  meshLoadReferenceNodesTri2D(mesh, N);
-
-  // compute physical (x,y) locations of the element nodes
-  meshPhysicalNodesTri3D(mesh);
-
-  // compute geometric factors
-  meshGeometricFactorsTri3D(mesh);
-
-  // set up halo exchange info for MPI (do before connect face nodes)
-  meshHaloSetup(mesh);
-
-  // connect face nodes (find trace indices)
-  meshConnectFaceNodes3D(mesh);
-
-  // compute surface geofacs
-  meshSurfaceGeometricFactorsTri3D(mesh);
-
-  // global nodes
-  meshParallelConnectNodes(mesh);
-
-  // initialize LSERK4 time stepping coefficients
-  int Nrk = 5;
-
-  dfloat rka[5] = {0.0,
-                   -567301805773.0 / 1357537059087.0,
-                   -2404267990393.0 / 2016746695238.0,
-                   -3550918686646.0 / 2091501179385.0,
-                   -1275806237668.0 / 842570457699.0};
-  dfloat rkb[5] = { 1432997174477.0 / 9575080441755.0,
-                    5161836677717.0 / 13612068292357.0,
-                    1720146321549.0 / 2090206949498.0,
-                    3134564353537.0 / 4481467310338.0,
-                    2277821191437.0 / 14882151754819.0};
-  // added one more for advanced time step
-  dfloat rkc[6] = {0.0,
-                   1432997174477.0 / 9575080441755.0,
-                   2526269341429.0 / 6820363962896.0,
-                   2006345519317.0 / 3224310063776.0,
-                   2802321613138.0 / 2924317926251.0,
-                   1.0};
-
-  mesh->Nrk = Nrk;
-  memcpy(mesh->rka, rka, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkb, rkb, Nrk * sizeof(dfloat));
-  memcpy(mesh->rkc, rkc, (Nrk + 1) * sizeof(dfloat));
-
-  //Adam-Bashforth
-  mesh->mrab[0] = 23. / 12.;   // deprecated
-  mesh->mrab[1] = -4. / 3.;
-  mesh->mrab[2] =  5. / 12.;
-
-  //AB half step
-  mesh->mrabb[0] = 17. / 24.;  // deprecated
-  mesh->mrabb[1] = -7. / 24.;
-  mesh->mrabb[2] =  2. / 24.;
-
-  // Clasical Adams-Bashforth Coefficients
-
-  return mesh;
-}
diff --git a/src/libP/src/meshSurfaceGeometricFactorsQuad2D.c b/src/libP/src/meshSurfaceGeometricFactorsQuad2D.c
deleted file mode 100644
index 561a4a347..000000000
--- a/src/libP/src/meshSurfaceGeometricFactorsQuad2D.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-/* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */
-void meshSurfaceGeometricFactorsQuad2D(mesh2D* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nsgeo = 7;
-  mesh->sgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                mesh->Nsgeo * mesh->Nfp * mesh->Nfaces,
-                                sizeof(dfloat));
-
-  mesh->cubsgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                   mesh->Nsgeo * mesh->cubNq * mesh->Nfaces,
-                                   sizeof(dfloat));
-
-  for(dlong e = 0; e < mesh->Nelements + mesh->totalHaloPairs; ++e) { /* for each element */
-    /* find vertex indices and physical coordinates */
-    dlong id = e * mesh->Nverts;
-
-    dfloat* xe = mesh->EX + id;
-    dfloat* ye = mesh->EY + id;
-
-    for(int f = 0; f < mesh->Nfaces; ++f) { // for each face
-      for(int i = 0; i < mesh->Nfp; ++i) { // for each node on face
-        /* volume index of face node */
-        int n = mesh->faceNodes[f * mesh->Nfp + i];
-
-        /* local node coordinates */
-        dfloat rn = mesh->r[n];
-        dfloat sn = mesh->s[n];
-
-        /* Jacobian matrix */
-        dfloat xr = 0.25 * ( (1 - sn) * (xe[1] - xe[0]) + (1 + sn) * (xe[2] - xe[3]) );
-        dfloat xs = 0.25 * ( (1 - rn) * (xe[3] - xe[0]) + (1 + rn) * (xe[2] - xe[1]) );
-        dfloat yr = 0.25 * ( (1 - sn) * (ye[1] - ye[0]) + (1 + sn) * (ye[2] - ye[3]) );
-        dfloat ys = 0.25 * ( (1 - rn) * (ye[3] - ye[0]) + (1 + rn) * (ye[2] - ye[1]) );
-
-        /* compute geometric factors for affine coordinate transform*/
-        dfloat J = xr * ys - xs * yr;
-
-        /* face f normal and length */
-        dfloat nx =   ye[(f + 1) % mesh->Nverts] - ye[f];
-        dfloat ny = -(xe[(f + 1) % mesh->Nverts] - xe[f]);
-        dfloat d = norm2(nx,ny);
-
-        /* output index */
-        dlong base = mesh->Nsgeo * (mesh->Nfaces * mesh->Nfp * e + mesh->Nfp * f + i);
-
-        /* store normal, surface Jacobian, and reciprocal of volume Jacobian */
-        mesh->sgeo[base + NXID] = nx / d;
-        mesh->sgeo[base + NYID] = ny / d;
-        mesh->sgeo[base + SJID] = d / 2.;
-        mesh->sgeo[base + IJID] = 1. / J;
-
-        mesh->sgeo[base + WIJID] = 1. / (J * mesh->gllw[0]);
-        mesh->sgeo[base + WSJID] = (d / 2.) * mesh->gllw[i];
-      }
-
-      //geometric data for quadrature
-      for(int i = 0; i < mesh->cubNq; ++i) { // for each quadrature node on face
-        dfloat rn = 0., sn = 0.;
-
-        /* interpolate local node coordinates */
-        for (int j = 0; j < mesh->Nfp; j++) {
-          /* volume index of face node */
-          int n = mesh->faceNodes[f * mesh->Nfp + j];
-
-          rn += mesh->cubInterp[i * mesh->Nfp + j] * mesh->r[n];
-          sn += mesh->cubInterp[i * mesh->Nfp + j] * mesh->s[n];
-        }
-
-        /* Jacobian matrix */
-        dfloat xr = 0.25 * ( (1 - sn) * (xe[1] - xe[0]) + (1 + sn) * (xe[2] - xe[3]) );
-        dfloat xs = 0.25 * ( (1 - rn) * (xe[3] - xe[0]) + (1 + rn) * (xe[2] - xe[1]) );
-        dfloat yr = 0.25 * ( (1 - sn) * (ye[1] - ye[0]) + (1 + sn) * (ye[2] - ye[3]) );
-        dfloat ys = 0.25 * ( (1 - rn) * (ye[3] - ye[0]) + (1 + rn) * (ye[2] - ye[1]) );
-
-        /* compute geometric factors for affine coordinate transform*/
-        dfloat J = xr * ys - xs * yr;
-
-        /* face f normal and length */
-        dfloat nx =   ye[(f + 1) % mesh->Nverts] - ye[f];
-        dfloat ny = -(xe[(f + 1) % mesh->Nverts] - xe[f]);
-        dfloat d = norm2(nx,ny);
-
-        /* output index */
-        dlong base = mesh->Nsgeo * (mesh->Nfaces * mesh->cubNq * e + mesh->cubNq * f + i);
-
-        /* store normal, surface Jacobian, and reciprocal of volume Jacobian */
-        mesh->cubsgeo[base + NXID] = nx / d;
-        mesh->cubsgeo[base + NYID] = ny / d;
-        mesh->cubsgeo[base + SJID] = d / 2.;
-        mesh->cubsgeo[base + IJID] = 1. / J;
-
-        mesh->cubsgeo[base + WIJID] = 1. / (J * mesh->cubw[0]);
-        mesh->cubsgeo[base + WSJID] = (d / 2.) * mesh->cubw[i];
-      }
-    }
-  }
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) /* for each non-halo element */
-    for(int n = 0; n < mesh->Nfp * mesh->Nfaces; ++n) {
-      dlong baseM = e * mesh->Nfp * mesh->Nfaces + n;
-      dlong baseP = mesh->mapP[baseM];
-      if(baseP < 0) baseP = baseM;
-
-      // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)
-      dfloat hinvM = mesh->sgeo[baseM * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseM * mesh->Nsgeo + IJID];
-      dfloat hinvP = mesh->sgeo[baseP * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseP * mesh->Nsgeo + IJID];
-
-      mesh->sgeo[baseM * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-      mesh->sgeo[baseP * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-    }
-}
diff --git a/src/libP/src/meshSurfaceGeometricFactorsQuad3D.c b/src/libP/src/meshSurfaceGeometricFactorsQuad3D.c
deleted file mode 100644
index 5671b37ec..000000000
--- a/src/libP/src/meshSurfaceGeometricFactorsQuad3D.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-/* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */
-void meshSurfaceGeometricFactorsQuad3D(mesh_t* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nsgeo = 14; // fix later
-  mesh->sgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                mesh->Nsgeo * mesh->Nfp * mesh->Nfaces,
-                                sizeof(dfloat));
-
-  mesh->cubsgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                   mesh->Nsgeo * mesh->cubNq * mesh->Nfaces,
-                                   sizeof(dfloat));
-
-  dfloat* cubx = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                  mesh->cubNq * mesh->Nfaces, sizeof(dfloat));
-
-  dfloat* cuby = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                  mesh->cubNq * mesh->Nfaces, sizeof(dfloat));
-
-  dfloat* cubz = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                  mesh->cubNq * mesh->Nfaces, sizeof(dfloat));
-
-  dfloat* xr = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* yr = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* zr = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-
-  dfloat* xs = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* ys = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* zs = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-
-  dfloat* J  = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-
-  for(int e = 0; e < mesh->Nelements + mesh->totalHaloPairs; ++e) { /* for each element */
-    for(int j = 0; j < mesh->Nq; ++j)
-      for(int i = 0; i < mesh->Nq; ++i) {
-        dfloat x = mesh->x[i + j * mesh->Nq + e * mesh->Np];
-        dfloat y = mesh->y[i + j * mesh->Nq + e * mesh->Np];
-        dfloat z = mesh->z[i + j * mesh->Nq + e * mesh->Np];
-
-        dfloat xrij = 0, yrij = 0, zrij = 0;
-        dfloat xsij = 0, ysij = 0, zsij = 0;
-
-        for(int n = 0; n < mesh->Nq; ++n) {
-          dfloat Din = mesh->D[i * mesh->Nq + n];
-          dfloat Djn = mesh->D[j * mesh->Nq + n];
-
-          xrij += Din * mesh->x[n + j * mesh->Nq + e * mesh->Np];
-          yrij += Din * mesh->y[n + j * mesh->Nq + e * mesh->Np];
-          zrij += Din * mesh->z[n + j * mesh->Nq + e * mesh->Np];
-
-          xsij += Djn * mesh->x[i + n * mesh->Nq + e * mesh->Np];
-          ysij += Djn * mesh->y[i + n * mesh->Nq + e * mesh->Np];
-          zsij += Djn * mesh->z[i + n * mesh->Nq + e * mesh->Np];
-        }
-
-        dfloat txij = yrij * zsij - zrij * ysij;
-        dfloat tyij = zrij * xsij - xrij * zsij;
-        dfloat tzij = xrij * ysij - yrij * xsij;
-
-        dfloat Gx = txij, Gy = tyij, Gz = tzij;
-
-        dfloat Jij = x * txij + y * tyij + z * tzij;
-
-        xr[i + j * mesh->Nq] = xrij;
-        yr[i + j * mesh->Nq] = yrij;
-        zr[i + j * mesh->Nq] = zrij;
-
-        xs[i + j * mesh->Nq] = xsij;
-        ys[i + j * mesh->Nq] = ysij;
-        zs[i + j * mesh->Nq] = zsij;
-
-        J[i + j * mesh->Nq] = sqrt(Gx * Gx + Gy * Gy + Gz * Gz);
-      }
-
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      for(int n = 0; n < mesh->Nq; ++n) {
-        int id = mesh->faceNodes[n + f * mesh->Nq];
-
-        dfloat xid = mesh->x[id + e * mesh->Np];
-        dfloat yid = mesh->y[id + e * mesh->Np];
-        dfloat zid = mesh->z[id + e * mesh->Np];
-        dfloat Jid = J[id];
-
-        dfloat nx, ny, nz;
-
-        if(f == 0) {
-          nx = yr[id] * zid - zr[id] * yid;
-          ny = zr[id] * xid - xr[id] * zid;
-          nz = xr[id] * yid - yr[id] * xid;
-        }
-
-        if(f == 1) {
-          nx = ys[id] * zid - zs[id] * yid;
-          ny = zs[id] * xid - xs[id] * zid;
-          nz = xs[id] * yid - ys[id] * xid;
-        }
-
-        if(f == 2) {
-          nx = -yr[id] * zid + zr[id] * yid;
-          ny = -zr[id] * xid + xr[id] * zid;
-          nz = -xr[id] * yid + yr[id] * xid;
-        }
-
-        if(f == 3) {
-          nx = -ys[id] * zid + zs[id] * yid;
-          ny = -zs[id] * xid + xs[id] * zid;
-          nz = -xs[id] * yid + ys[id] * xid;
-        }
-
-        dfloat R = sqrt(xid * xid + yid * yid + zid * zid);
-
-        nx /= R;
-        ny /= R;
-        nz /= R;
-
-        dfloat sJ = sqrt(nx * nx + ny * ny + nz * nz);
-
-        nx /= sJ;
-        ny /= sJ;
-        nz /= sJ;
-
-        if(sJ < 1e-8) {
-          printf("Negative or small surface Jacobian: %g\n", sJ);
-          exit(-1);
-        }
-
-        int base = mesh->Nsgeo * (e * mesh->Nq * mesh->Nfaces + n + f * mesh->Nq);
-
-        mesh->sgeo[base + NXID] = nx;
-        mesh->sgeo[base + NYID] = ny;
-        mesh->sgeo[base + NZID] = nz;
-        mesh->sgeo[base + SJID] = sJ;
-
-        mesh->sgeo[base + IJID] = 1. / Jid;
-
-        mesh->sgeo[base + WIJID] = 1. / (Jid * mesh->gllw[0]);
-        mesh->sgeo[base + WSJID] = sJ * mesh->gllw[n];
-      }
-
-    // interpolate geofacs to surface quadrature
-    for(int f = 0; f < mesh->Nfaces; ++f)
-
-      for(int n = 0; n < mesh->cubNq; ++n) {
-        dfloat cxr = 0, cxs = 0, cx = 0;
-        dfloat cyr = 0, cys = 0, cy = 0;
-        dfloat czr = 0, czs = 0, cz = 0;
-
-        for(int i = 0; i < mesh->Nq; ++i) {
-          int id = mesh->faceNodes[i + f * mesh->Nq];
-          dfloat cIni = mesh->cubInterp[n * mesh->Nq + i];
-          cxr += cIni * xr[id];
-          cxs += cIni * xs[id];
-          cyr += cIni * yr[id];
-          cys += cIni * ys[id];
-          czr += cIni * zr[id];
-          czs += cIni * zs[id];
-          cx  += cIni * mesh->x[id + e * mesh->Np];
-          cy  += cIni * mesh->y[id + e * mesh->Np];
-          cz  += cIni * mesh->z[id + e * mesh->Np];
-        }
-
-        cubx[e * mesh->cubNq * mesh->Nfaces + f * mesh->cubNq + n] = cx;
-        cuby[e * mesh->cubNq * mesh->Nfaces + f * mesh->cubNq + n] = cy;
-        cubz[e * mesh->cubNq * mesh->Nfaces + f * mesh->cubNq + n] = cz;
-
-        dfloat Gx = cyr * czs - czr * cys;
-        dfloat Gy = czr * cxs - cxr * czs;
-        dfloat Gz = cxr * cys - cyr * cxs;
-        dfloat cJ = sqrt(Gx * Gx + Gy * Gy + Gz * Gz);
-        dfloat volJ = cx * Gx + cy * Gy + cz * Gz; // xij*tx + yij*ty + zij*tz;
-        dfloat nx, ny, nz;
-
-        if(f == 0) {
-          nx = cyr * cz - czr * cy;
-          ny = czr * cx - cxr * cz;
-          nz = cxr * cy - cyr * cx;
-        }
-
-        if(f == 1) {
-          nx = cys * cz - czs * cy;
-          ny = czs * cx - cxs * cz;
-          nz = cxs * cy - cys * cx;
-        }
-
-        if(f == 2) {
-          nx = -cyr * cz + czr * cy;
-          ny = -czr * cx + cxr * cz;
-          nz = -cxr * cy + cyr * cx;
-        }
-
-        if(f == 3) {
-          nx = -cys * cz + czs * cy;
-          ny = -czs * cx + cxs * cz;
-          nz = -cxs * cy + cys * cx;
-        }
-
-        dfloat R = sqrt(cx * cx + cy * cy + cz * cz);
-
-        nx /= R;
-        ny /= R;
-        nz /= R;
-
-        dfloat sJ = sqrt(nx * nx + ny * ny + nz * nz);
-
-        nx /= sJ;
-        ny /= sJ;
-        nz /= sJ;
-
-        if(sJ < 1e-8) {
-          printf("Negative or small surface Jacobian: %g\n", sJ);
-          exit(-1);
-        }
-
-        int base = mesh->Nsgeo * (e * mesh->cubNq * mesh->Nfaces + n + f * mesh->cubNq);
-
-        mesh->cubsgeo[base + NXID] = nx;
-        mesh->cubsgeo[base + NYID] = ny;
-        mesh->cubsgeo[base + NZID] = nz;
-        mesh->cubsgeo[base + SJID] = sJ;
-        mesh->cubsgeo[base + IHID] = sJ / volJ;
-        //	mesh->cubsgeo[base+WSJID] = sJ*mesh->cubw[n];
-      }
-  }
-
-#if 0
-  for(int e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      for(int n = 0; n < mesh->Nq; ++n) {
-        int idM = n + f * mesh->Nq + e * mesh->Nfaces * mesh->Nq;
-        int idP = mesh->mapP[idM];
-        int eP = idP / (mesh->Nq * mesh->Nfaces);
-        int fP = (idP % (mesh->Nq * mesh->Nfaces)) / mesh->Nq;
-        int nP = (idP % mesh->Nq);
-        int baseM = e * mesh->Nq * mesh->Nfaces * mesh->Nsgeo + f * mesh->Nq * mesh->Nsgeo + n;
-        int baseP = eP * mesh->Nq * mesh->Nfaces * mesh->Nsgeo + fP * mesh->Nq * mesh->Nsgeo + nP;
-        printf("e,f,n=(%d,%d,%d)-(%d,%d,%d): xP-xM=(%g,%g,%g) : norP+norM=%g,%g,%g\n",
-               e,f,n,eP,fP,nP,
-               mesh->x[mesh->vmapP[idM]] - mesh->x[mesh->vmapM[idM]],
-               mesh->y[mesh->vmapP[idM]] - mesh->y[mesh->vmapM[idM]],
-               mesh->z[mesh->vmapP[idM]] - mesh->z[mesh->vmapM[idM]],
-               mesh->sgeo[baseM + NXID * mesh->Nq] + mesh->sgeo[baseP + NXID * mesh->Nq],
-               mesh->sgeo[baseM + NYID * mesh->Nq] + mesh->sgeo[baseP + NYID * mesh->Nq],
-               mesh->sgeo[baseM + NZID * mesh->Nq] + mesh->sgeo[baseP + NZID * mesh->Nq]);
-      }
-
-#endif
-  // TW: omit 1/min(h) calculation
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) /* for each non-halo element */
-    for(int n = 0; n < mesh->Nfp * mesh->Nfaces; ++n) {
-      dlong baseM = e * mesh->Nfp * mesh->Nfaces + n;
-      dlong baseP = mesh->mapP[baseM];
-      if(baseP < 0) baseP = baseM;
-
-      // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)
-      dfloat hinvM = mesh->sgeo[baseM * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseM * mesh->Nsgeo + IJID];
-      dfloat hinvP = mesh->sgeo[baseP * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseP * mesh->Nsgeo + IJID];
-
-      //      printf("hinvM/P = %g,%g\n", hinvM, hinvP);
-
-      mesh->sgeo[baseM * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-      mesh->sgeo[baseP * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-    }
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) /* for each non-halo element */
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      dlong eP = mesh->EToE[e * mesh->Nfaces + f];
-      dlong fP = mesh->EToF[e * mesh->Nfaces + f];
-
-      dfloat maxhinv  = 0;
-      for(int n = 0; n < mesh->cubNq; ++n) {
-        dlong idM = e * mesh->cubNq * mesh->Nfaces + f * mesh->cubNq + n;
-        dfloat cxM = cubx[idM];
-        dfloat cyM = cuby[idM];
-        dfloat czM = cubz[idM];
-
-        dfloat mindist2;
-        int minidP = 0;
-        // jump through hoops to find neighbor cubature node
-        // [ not needed elsewhere since we interpolate consistently ]
-        dlong idP;
-        for(int m = 0; m < mesh->cubNq; ++m) {
-          idP = eP * mesh->cubNq * mesh->Nfaces + fP * mesh->cubNq + m;
-
-          dfloat cxP = cubx[idP];
-          dfloat cyP = cuby[idP];
-          dfloat czP = cubz[idP];
-
-          dfloat dist2 = pow(cxP - cxM,2) + pow(cyP - cyM,2) + pow(czP - czM,2);
-
-          if(m == 0 || dist2 < mindist2) {
-            mindist2 = dist2;
-            minidP = m;
-          }
-        }
-
-        if(mindist2 > 1e-12)
-          printf("mindist2 = %g\n", mindist2);
-
-        idM = mesh->Nsgeo * ( e * mesh->cubNq * mesh->Nfaces + f * mesh->cubNq + n) + IHID;
-        idP = mesh->Nsgeo * (eP * mesh->cubNq * mesh->Nfaces + fP * mesh->cubNq + minidP) + IHID;
-
-        dfloat hinv = mymax(mesh->cubsgeo[idM],mesh->cubsgeo[idP]);
-        mesh->cubsgeo[idM] = hinv;
-        mesh->cubsgeo[idP] = hinv;
-      }
-    }
-
-}
diff --git a/src/libP/src/meshSurfaceGeometricFactorsTet3D.c b/src/libP/src/meshSurfaceGeometricFactorsTet3D.c
deleted file mode 100644
index 6df0ee819..000000000
--- a/src/libP/src/meshSurfaceGeometricFactorsTet3D.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-void computeFrameTet3D(dfloat nx, dfloat ny, dfloat nz,
-                       dfloat &tanx, dfloat &tany, dfloat &tanz,
-                       dfloat &binx, dfloat &biny, dfloat &binz)
-{
-  dfloat ranx = drand48();
-  dfloat rany = drand48();
-  dfloat ranz = drand48();
-
-  dfloat magran = sqrt(ranx * ranx + rany * rany + ranz * ranz);
-
-  ranx /= magran;
-  rany /= magran;
-  ranz /= magran;
-
-  tanx = ny * ranz - nz * rany;
-  tany = nz * ranx - nx * ranz;
-  tanz = nx * rany - ny * ranx;
-
-  dfloat magtan = sqrt(tanx * tanx + tany * tany + tanz * tanz);
-
-  tanx /= magtan;
-  tany /= magtan;
-  tanz /= magtan;
-
-  binx = ny * tanz - nz * tany;
-  biny = nz * tanx - nx * tanz;
-  binz = nx * tany - ny * tanx;
-
-  dfloat magbin = sqrt(binx * binx + biny * biny + binz * binz);
-
-  binx /= magbin;
-  biny /= magbin;
-  binz /= magbin;
-
-  //  printf("nor = %g,%g,%g; tan = %g,%g,%g; bin = %g,%g,%g\n", nx, ny, nz, tanx, tany, tanz, binx, biny, binz);
-}
-
-void meshSurfaceGeometricFactorsTet3D(mesh3D* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nsgeo = 14;
-  mesh->sgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                mesh->Nsgeo * mesh->Nfaces, sizeof(dfloat));
-
-  for(dlong e = 0; e < mesh->Nelements + mesh->totalHaloPairs; ++e) { /* for each element */
-    /* find vertex indices and physical coordinates */
-    dlong id = e * mesh->Nverts;
-    dfloat xe1 = mesh->EX[id + 0], ye1 = mesh->EY[id + 0], ze1 = mesh->EZ[id + 0];
-    dfloat xe2 = mesh->EX[id + 1], ye2 = mesh->EY[id + 1], ze2 = mesh->EZ[id + 1];
-    dfloat xe3 = mesh->EX[id + 2], ye3 = mesh->EY[id + 2], ze3 = mesh->EZ[id + 2];
-    dfloat xe4 = mesh->EX[id + 3], ye4 = mesh->EY[id + 3], ze4 = mesh->EZ[id + 3];
-
-    /* Jacobian matrix */
-    dfloat xr = 0.5 * (xe2 - xe1), xs = 0.5 * (xe3 - xe1), xt = 0.5 * (xe4 - xe1);
-    dfloat yr = 0.5 * (ye2 - ye1), ys = 0.5 * (ye3 - ye1), yt = 0.5 * (ye4 - ye1);
-    dfloat zr = 0.5 * (ze2 - ze1), zs = 0.5 * (ze3 - ze1), zt = 0.5 * (ze4 - ze1);
-
-    /* compute geometric factors for affine coordinate transform*/
-    dfloat J = xr * (ys * zt - zs * yt) - yr * (xs * zt - zs * xt) + zr * (xs * yt - ys * xt);
-    dfloat rx =  (ys * zt - zs * yt) / J, ry = -(xs * zt - zs * xt) / J,
-           rz =  (xs * yt - ys * xt) / J;
-    dfloat sx = -(yr * zt - zr * yt) / J, sy =  (xr * zt - zr * xt) / J,
-           sz = -(xr * yt - yr * xt) / J;
-    dfloat tx =  (yr * zs - zr * ys) / J, ty = -(xr * zs - zr * xs) / J,
-           tz =  (xr * ys - yr * xs) / J;
-
-    if(J < 0) printf("bugger: got negative geofac\n");
-
-    /* face 1 */
-    dlong base = mesh->Nsgeo * mesh->Nfaces * e;
-    dfloat nx1 = -tx;
-    dfloat ny1 = -ty;
-    dfloat nz1 = -tz;
-    dfloat sJ1 = norm3(nx1,ny1,nz1);
-
-    mesh->sgeo[base + NXID] = nx1 / sJ1;
-    mesh->sgeo[base + NYID] = ny1 / sJ1;
-    mesh->sgeo[base + NZID] = nz1 / sJ1;
-    mesh->sgeo[base + SJID] = sJ1 * J;
-    mesh->sgeo[base + IJID] = 1. / J;
-
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx1 / sJ1, ny1 / sJ1, nz1 / sJ1,
-                      mesh->sgeo[base + STXID], mesh->sgeo[base + STYID], mesh->sgeo[base + STZID],
-                      mesh->sgeo[base + SBXID], mesh->sgeo[base + SBYID], mesh->sgeo[base + SBZID]);
-
-    /* face 2 */
-    base += mesh->Nsgeo;
-    dfloat nx2 = -sx;
-    dfloat ny2 = -sy;
-    dfloat nz2 = -sz;
-    dfloat sJ2 = norm3(nx2,ny2,nz2);
-
-    mesh->sgeo[base + NXID] = nx2 / sJ2;
-    mesh->sgeo[base + NYID] = ny2 / sJ2;
-    mesh->sgeo[base + NZID] = nz2 / sJ2;
-    mesh->sgeo[base + SJID] = sJ2 * J;
-    mesh->sgeo[base + IJID] = 1. / J;
-
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx2 / sJ2, ny2 / sJ2, nz2 / sJ2,
-                      mesh->sgeo[base + STXID], mesh->sgeo[base + STYID], mesh->sgeo[base + STZID],
-                      mesh->sgeo[base + SBXID], mesh->sgeo[base + SBYID], mesh->sgeo[base + SBZID]);
-
-    /* face 3 */
-    base += mesh->Nsgeo;
-    dfloat nx3 = rx + sx + tx;
-    dfloat ny3 = ry + sy + ty;
-    dfloat nz3 = rz + sz + tz;
-    dfloat sJ3 = norm3(nx3,ny3,nz3);
-
-    mesh->sgeo[base + NXID] = nx3 / sJ3;
-    mesh->sgeo[base + NYID] = ny3 / sJ3;
-    mesh->sgeo[base + NZID] = nz3 / sJ3;
-    mesh->sgeo[base + SJID] = sJ3 * J;
-    mesh->sgeo[base + IJID] = 1. / J;
-
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx3 / sJ3, ny3 / sJ3, nz3 / sJ3,
-                      mesh->sgeo[base + STXID], mesh->sgeo[base + STYID], mesh->sgeo[base + STZID],
-                      mesh->sgeo[base + SBXID], mesh->sgeo[base + SBYID], mesh->sgeo[base + SBZID]);
-
-    /* face 4 */
-    base += mesh->Nsgeo;
-    dfloat nx4 = -rx;
-    dfloat ny4 = -ry;
-    dfloat nz4 = -rz;
-    dfloat sJ4 = norm3(nx4,ny4,nz4);
-
-    mesh->sgeo[base + NXID] = nx4 / sJ4;
-    mesh->sgeo[base + NYID] = ny4 / sJ4;
-    mesh->sgeo[base + NZID] = nz4 / sJ4;
-    mesh->sgeo[base + SJID] = sJ4 * J;
-    mesh->sgeo[base + IJID] = 1. / J;
-
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx4 / sJ4, ny4 / sJ4, nz4 / sJ4,
-                      mesh->sgeo[base + STXID], mesh->sgeo[base + STYID], mesh->sgeo[base + STZID],
-                      mesh->sgeo[base + SBXID], mesh->sgeo[base + SBYID], mesh->sgeo[base + SBZID]);
-
-#if 0
-    printf("N1=(%g,%g,%g),sJ1=%g\n", nx1 / sJ1,ny1 / sJ1,nz1 / sJ1,sJ1 * J);
-    printf("N2=(%g,%g,%g),sJ2=%g\n", nx2 / sJ2,ny2 / sJ2,nz2 / sJ2,sJ2 * J);
-    printf("N3=(%g,%g,%g),sJ3=%g\n", nx3 / sJ3,ny3 / sJ3,nz3 / sJ3,sJ3 * J);
-    printf("N4=(%g,%g,%g),sJ4=%g\n", nx4 / sJ4,ny4 / sJ4,nz4 / sJ4,sJ4 * J);
-#endif
-  }
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) /* for each non-halo element */
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      dlong baseM = e * mesh->Nfaces + f;
-
-      // awkward: (need to find eP,fP relative to bulk+halo)
-      dlong idP = mesh->vmapP[e * mesh->Nfp * mesh->Nfaces + f * mesh->Nfp + 0];
-      dlong eP = (idP >= 0) ? (idP / mesh->Np):e;
-
-      int fP = mesh->EToF[baseM];
-      fP = (fP == -1) ? f:fP;
-
-      dlong baseP = eP * mesh->Nfaces + fP;
-
-      // rescaling,  V = A*h/3 => (J*4/3) = (sJ*2)*h/3 => h  = 0.5*J/sJ
-      dfloat hinvM = 0.5 * mesh->sgeo[baseM * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseM * mesh->Nsgeo + IJID];
-      dfloat hinvP = 0.5 * mesh->sgeo[baseP * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseP * mesh->Nsgeo + IJID];
-
-      mesh->sgeo[baseM * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-      mesh->sgeo[baseP * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-    }
-}
diff --git a/src/libP/src/meshSurfaceGeometricFactorsTri2D.c b/src/libP/src/meshSurfaceGeometricFactorsTri2D.c
deleted file mode 100644
index de3a374db..000000000
--- a/src/libP/src/meshSurfaceGeometricFactorsTri2D.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh2D.h"
-
-void meshSurfaceGeometricFactorsTri2D(mesh2D* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nsgeo = 6;
-  mesh->sgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                mesh->Nsgeo * mesh->Nfaces,
-                                sizeof(dfloat));
-
-  for(dlong e = 0; e < mesh->Nelements + mesh->totalHaloPairs; ++e) { /* for each element */
-    /* find vertex indices and physical coordinates */
-    dlong id = e * mesh->Nverts;
-
-    dfloat xe1 = mesh->EX[id + 0];
-    dfloat xe2 = mesh->EX[id + 1];
-    dfloat xe3 = mesh->EX[id + 2];
-
-    dfloat ye1 = mesh->EY[id + 0];
-    dfloat ye2 = mesh->EY[id + 1];
-    dfloat ye3 = mesh->EY[id + 2];
-
-    /* compute geometric factors for affine coordinate transform*/
-    dfloat J = 0.25 * ((xe2 - xe1) * (ye3 - ye1) - (xe3 - xe1) * (ye2 - ye1));
-    if(J < 0) printf("bugger: got negative geofac\n");
-
-    /* face 1 */
-    dlong base = mesh->Nsgeo * mesh->Nfaces * e;
-    dfloat nx1 = ye2 - ye1;
-    dfloat ny1 = -(xe2 - xe1);
-    dfloat d1 = norm2(nx1,ny1);
-
-    mesh->sgeo[base + NXID] = nx1 / d1;
-    mesh->sgeo[base + NYID] = ny1 / d1;
-    mesh->sgeo[base + SJID] = d1 / 2.;
-    mesh->sgeo[base + IJID] = 1. / J;
-
-    /* face 2 */
-    base += mesh->Nsgeo;
-    dfloat nx2 = ye3 - ye2;
-    dfloat ny2 = -(xe3 - xe2);
-    dfloat d2 = norm2(nx2,ny2);
-
-    mesh->sgeo[base + NXID] = nx2 / d2;
-    mesh->sgeo[base + NYID] = ny2 / d2;
-    mesh->sgeo[base + SJID] = d2 / 2.; // TW fixed bug d1=>d2
-    mesh->sgeo[base + IJID] = 1. / J;
-
-    /* face 3 */
-    base += mesh->Nsgeo;
-    dfloat nx3 = ye1 - ye3;
-    dfloat ny3 = -(xe1 - xe3);
-    dfloat d3 = norm2(nx3,ny3);
-
-    mesh->sgeo[base + NXID] = nx3 / d3;
-    mesh->sgeo[base + NYID] = ny3 / d3;
-    mesh->sgeo[base + SJID] = d3 / 2.;
-    mesh->sgeo[base + IJID] = 1. / J;
-  }
-
-  dfloat href = 0.;
-  dfloat tol  = 1.;
-  for(dlong e = 0; e < mesh->Nelements; ++e) /* for each non-halo element */
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      dlong baseM = e * mesh->Nfaces + f;
-
-      // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)  A = L*h/2 => (J*2) = (sJ*2)*h/2 => h  = 2*J/sJ
-      dfloat hinvM = mesh->sgeo[baseM * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseM * mesh->Nsgeo + IJID];
-
-      href = mymax(hinvM,href);
-    }
-
-  for(dlong e = 0; e < mesh->Nelements; ++e) { /* for each non-halo element */
-    for(int f = 0; f < mesh->Nfaces; ++f) {
-      dlong baseM = e * mesh->Nfaces + f;
-
-      // awkward: (need to find eP,fP relative to bulk+halo)
-      dlong idP = mesh->vmapP[e * mesh->Nfp * mesh->Nfaces + f * mesh->Nfp + 0];
-      dlong eP = (idP >= 0) ? (idP / mesh->Np):e;
-
-      int fP = mesh->EToF[baseM];
-      fP = (fP == -1) ? f:fP;
-
-      dlong baseP = eP * mesh->Nfaces + fP;
-
-      // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)  A = L*h/2 => (J*2) = (sJ*2)*h/2 => h  = 2*J/sJ
-      dfloat hinvM = mesh->sgeo[baseM * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseM * mesh->Nsgeo + IJID];
-      dfloat hinvP = mesh->sgeo[baseP * mesh->Nsgeo + SJID] *
-                     mesh->sgeo[baseP * mesh->Nsgeo + IJID];
-
-      mesh->sgeo[baseM * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-      mesh->sgeo[baseP * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
-
-      if (mesh->EToB[f + e * mesh->Nfaces] > 0) { //enforce a stronger penalty on boundaries
-        mesh->sgeo[baseM * mesh->Nsgeo + IHID] = mymax(mesh->sgeo[baseM * mesh->Nsgeo + IHID],
-                                                       tol * href);
-        mesh->sgeo[baseP * mesh->Nsgeo + IHID] = mymax(mesh->sgeo[baseP * mesh->Nsgeo + IHID],
-                                                       tol * href);
-      }
-#if 0
-      printf("e=%d f=%d (eP=%d,fP=%d) nx=%5.4f, ny=%5.4f, sJ=%5.4f, invJ=%5.4f, hinv=%f\n"
-             ,e,f,eP,fP,
-             mesh->sgeo[baseM * mesh->Nsgeo + NXID],
-             mesh->sgeo[baseM * mesh->Nsgeo + NYID],
-             mesh->sgeo[baseM * mesh->Nsgeo + SJID],
-             mesh->sgeo[baseM * mesh->Nsgeo + IJID],
-             mesh->sgeo[baseM * mesh->Nsgeo + IHID]);
-#endif
-    }
-  }
-}
diff --git a/src/libP/src/meshSurfaceGeometricFactorsTri3D.c b/src/libP/src/meshSurfaceGeometricFactorsTri3D.c
deleted file mode 100644
index 8aa0dd63d..000000000
--- a/src/libP/src/meshSurfaceGeometricFactorsTri3D.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh3D.h"
-
-/* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */
-void meshSurfaceGeometricFactorsTri3D(mesh_t* mesh)
-{
-  /* unified storage array for geometric factors */
-  mesh->Nsgeo = 14;
-  mesh->sgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) *
-                                mesh->Nsgeo * mesh->Nfp * mesh->Nfaces,
-                                sizeof(dfloat));
-
-  dfloat* xr = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* yr = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* zr = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-
-  dfloat* xs = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* ys = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-  dfloat* zs = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-
-  dfloat* J  = (dfloat*) calloc(mesh->Np, sizeof(dfloat));
-
-  for(int e = 0; e < mesh->Nelements + mesh->totalHaloPairs; ++e) { /* for each element */
-    for(int n = 0; n < mesh->Np; ++n) {
-      dfloat x = mesh->x[n + e * mesh->Np];
-      dfloat y = mesh->y[n + e * mesh->Np];
-      dfloat z = mesh->z[n + e * mesh->Np];
-
-      dfloat xrn = 0, yrn = 0, zrn = 0;
-      dfloat xsn = 0, ysn = 0, zsn = 0;
-
-      for(int m = 0; m < mesh->Np; ++m) {
-        dfloat Dr = mesh->Dr[n * mesh->Np + m];
-        dfloat Ds = mesh->Ds[n * mesh->Np + m];
-
-        xrn += Dr * mesh->x[m + e * mesh->Np];
-        yrn += Dr * mesh->y[m + e * mesh->Np];
-        zrn += Dr * mesh->z[m + e * mesh->Np];
-
-        xsn += Ds * mesh->x[m + e * mesh->Np];
-        ysn += Ds * mesh->y[m + e * mesh->Np];
-        zsn += Ds * mesh->z[m + e * mesh->Np];
-      }
-
-      dfloat txn = yrn * zsn - zrn * ysn;
-      dfloat tyn = zrn * xsn - xrn * zsn;
-      dfloat tzn = xrn * ysn - yrn * xsn;
-
-      dfloat Gx = txn, Gy = tyn, Gz = tzn;
-
-      dfloat Jn = x * txn + y * tyn + z * tzn;
-
-      xr[n] = xrn;
-      yr[n] = yrn;
-      zr[n] = zrn;
-
-      xs[n] = xsn;
-      ys[n] = ysn;
-      zs[n] = zsn;
-
-      J[n] = sqrt(Gx * Gx + Gy * Gy + Gz * Gz);
-    }
-
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      for(int n = 0; n < mesh->Nfp; ++n) {
-        int id = mesh->faceNodes[n + f * mesh->Nfp];
-
-        dfloat xid = mesh->x[id + e * mesh->Np];
-        dfloat yid = mesh->y[id + e * mesh->Np];
-        dfloat zid = mesh->z[id + e * mesh->Np];
-        dfloat Jid = J[id];
-
-        dfloat nx, ny, nz;
-
-        if(f == 0) {
-          nx = yr[id] * zid - zr[id] * yid;
-          ny = zr[id] * xid - xr[id] * zid;
-          nz = xr[id] * yid - yr[id] * xid;
-        }
-
-        if(f == 1) {
-          nx = (ys[id] - yr[id]) * zid - (zs[id] - zr[id]) * yid;
-          ny = (zs[id] - zr[id]) * xid - (xs[id] - xr[id]) * zid;
-          nz = (xs[id] - xr[id]) * yid - (ys[id] - yr[id]) * xid;
-        }
-
-        if(f == 2) {
-          nx = -ys[id] * zid + zs[id] * yid;
-          ny = -zs[id] * xid + xs[id] * zid;
-          nz = -xs[id] * yid + ys[id] * xid;
-        }
-
-        dfloat R = sqrt(xid * xid + yid * yid + zid * zid);
-
-        nx /= R;
-        ny /= R;
-        nz /= R;
-
-        dfloat sJ = sqrt(nx * nx + ny * ny + nz * nz);
-
-        nx /= sJ;
-        ny /= sJ;
-        nz /= sJ;
-
-        if(sJ < 1e-8) {
-          printf("Negative or small surface Jacobian: %g\n", sJ);
-          exit(-1);
-        }
-
-        int base = e * mesh->Nfp * mesh->Nfaces * mesh->Nsgeo + n + f * mesh->Nfp;
-
-        mesh->sgeo[base + mesh->Nfp * mesh->Nfaces * NXID] = nx;
-        mesh->sgeo[base + mesh->Nfp * mesh->Nfaces * NYID] = ny;
-        mesh->sgeo[base + mesh->Nfp * mesh->Nfaces * NZID] = nz;
-        mesh->sgeo[base + mesh->Nfp * mesh->Nfaces * SJID] = sJ;
-
-        mesh->sgeo[base + mesh->Nfp * mesh->Nfaces * IJID] = 1. / Jid;
-      }
-  }
-
-#if 0
-  for(int e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      for(int n = 0; n < mesh->Nfp; ++n) {
-        int idM = n + f * mesh->Nfp + e * mesh->Nfaces * mesh->Nfp;
-        int idP = mesh->mapP[idM];
-        int eP = idP / (mesh->Nfp * mesh->Nfaces);
-        int fP = (idP % (mesh->Nfp * mesh->Nfaces)) / mesh->Nfp;
-        int nP = (idP % mesh->Nfp);
-        int baseM = e * mesh->Nfp * mesh->Nfaces * mesh->Nsgeo + f * mesh->Nfp + n;
-        int baseP = eP * mesh->Nfp * mesh->Nfaces * mesh->Nsgeo + fP * mesh->Nfp + nP;
-        printf("e,f,n=(%d,%d,%d)-(%d,%d,%d): xP-xM=(%g,%g,%g) : norP+norM=%g,%g,%g\n",
-               e,f,n,eP,fP,nP,
-               mesh->x[mesh->vmapP[idM]] - mesh->x[mesh->vmapM[idM]],
-               mesh->y[mesh->vmapP[idM]] - mesh->y[mesh->vmapM[idM]],
-               mesh->z[mesh->vmapP[idM]] - mesh->z[mesh->vmapM[idM]],
-               mesh->sgeo[baseM + NXID * mesh->Nfp * mesh->Nfaces] +
-               mesh->sgeo[baseP + NXID * mesh->Nfp * mesh->Nfaces],
-               mesh->sgeo[baseM + NYID * mesh->Nfp * mesh->Nfaces] +
-               mesh->sgeo[baseP + NYID * mesh->Nfp * mesh->Nfaces],
-               mesh->sgeo[baseM + NZID * mesh->Nfp * mesh->Nfaces] +
-               mesh->sgeo[baseP + NZID * mesh->Nfp * mesh->Nfaces]);
-      }
-
-#endif
-  // TW: omit 1/min(h) calculation
-}
diff --git a/src/libP/src/meshVTU2D.c b/src/libP/src/meshVTU2D.c
deleted file mode 100644
index e26922ce0..000000000
--- a/src/libP/src/meshVTU2D.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "mpi.h"
-
-#include "mesh2D.h"
-
-void meshVTU2D(mesh2D* mesh, char* fileName)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  FILE* fp;
-  int* allNelements = (int*) calloc(size, sizeof(int));
-  int totalNelements = 0, maxNelements = 0;
-  dfloat* tmpEX, * tmpEY;
-
-  if(rank == 0) {
-    fp = fopen(fileName, "w");
-    printf("fp=%p\n for fileName=%s\n",fp, fileName);
-  }
-
-  // gather element counts to root
-  MPI_Allgather(&(mesh->Nelements), 1, MPI_INT,
-                allNelements, 1, MPI_INT,
-                mesh->comm);
-
-  if(rank == 0) {
-    for(int r = 0; r < size; ++r) {
-      totalNelements += allNelements[r];
-      maxNelements = mymax(maxNelements, allNelements[r]);
-    }
-
-    tmpEX = (dfloat*) calloc(maxNelements * mesh->Nverts, sizeof(dfloat));
-    tmpEY = (dfloat*) calloc(maxNelements * mesh->Nverts, sizeof(dfloat));
-
-    fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-    fprintf(fp, "  <UnstructuredGrid>\n");
-    fprintf(fp,
-            "    <Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n",
-            totalNelements * mesh->Nverts,
-            totalNelements);
-
-    // write out nodes
-    fprintf(fp, "      <Points>\n");
-    fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-    if(rank == 0) { // root writes out its coordinates
-      printf("printing local verts \n");
-      for(int e = 0; e < mesh->Nelements; ++e) {
-        fprintf(fp, "        ");
-        for(int n = 0; n < mesh->Nverts; ++n)
-          fprintf(fp, "%g %g 0.\n", mesh->EX[e * mesh->Nverts + n], mesh->EY[e * mesh->Nverts + n]);
-      }
-    }
-  }
-
-  for(int r = 1; r < size; ++r) {
-    if(rank == r) {
-      MPI_Send(mesh->EX, mesh->Nelements * mesh->Nverts,
-               MPI_DFLOAT, 0, 666, mesh->comm);
-      MPI_Send(mesh->EY, mesh->Nelements * mesh->Nverts,
-               MPI_DFLOAT, 0, 666, mesh->comm);
-    }
-
-    if(rank == 0) {
-      MPI_Status status;
-      MPI_Recv(tmpEX, allNelements[r] * mesh->Nverts,
-               MPI_DFLOAT, r, 666, mesh->comm, &status);
-      MPI_Recv(tmpEY, allNelements[r] * mesh->Nverts,
-               MPI_DFLOAT, r, 666, mesh->comm, &status);
-
-      for(int e = 0; e < allNelements[r]; ++e) {
-        fprintf(fp, "        ");
-        for(int n = 0; n < mesh->Nverts; ++n)
-          fprintf(fp, "%g %g 0\n", tmpEX[e * mesh->Nverts + n], tmpEY[e * mesh->Nverts + n]);
-      }
-    }
-  }
-  if(rank == 0) {
-    free(tmpEX);
-    free(tmpEY);
-
-    fprintf(fp, "        </DataArray>\n");
-    fprintf(fp, "      </Points>\n");
-
-    // write out rank
-    fprintf(fp, "      <CellData Scalars=\"scalars\">\n");
-    fprintf(fp, "        <DataArray type=\"Int32\" Name=\"element_rank\" Format=\"ascii\">\n");
-
-    for(int r = 0; r < size; ++r)
-      for(int e = 0; e < allNelements[r]; ++e)
-        fprintf(fp, "         %d\n", r);
-
-    fprintf(fp, "       </DataArray>\n");
-    fprintf(fp, "     </CellData>\n");
-
-    fprintf(fp, "    <Cells>\n");
-    fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-    int cnt = 0;
-    for(int r = 0; r < size; ++r)
-      for(int e = 0; e < allNelements[r]; ++e) {
-        for(int n = 0; n < mesh->Nverts; ++n) {
-          fprintf(fp, "%d ", cnt);
-          ++cnt;
-        }
-        fprintf(fp, "\n");
-      }
-
-    fprintf(fp, "        </DataArray>\n");
-
-    fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-    for(int e = 0; e < totalNelements; ++e) {
-      if(e % 10 == 0) fprintf(fp, "        ");
-      fprintf(fp, "%d ", (e + 1) * mesh->Nverts);
-      if(((e + 1) % 10 == 0) || (e == totalNelements - 1))
-        fprintf(fp, "\n");
-    }
-    fprintf(fp, "       </DataArray>\n");
-
-    fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-    for(int e = 0; e < totalNelements; ++e) {
-      if(e % 10 == 0) fprintf(fp, "        ");
-      fprintf(fp, "5 ");
-      if(((e + 1) % 10 == 0) || e == (totalNelements - 1))
-        fprintf(fp, "\n");
-    }
-    fprintf(fp, "        </DataArray>\n");
-    fprintf(fp, "      </Cells>\n");
-    fprintf(fp, "    </Piece>\n");
-    fprintf(fp, "  </UnstructuredGrid>\n");
-    fprintf(fp, "</VTKFile>\n");
-    fclose(fp);
-  }
-
-  MPI_Barrier(mesh->comm);
-}
diff --git a/src/libP/src/meshVTU3D.c b/src/libP/src/meshVTU3D.c
deleted file mode 100644
index 0221ce18f..000000000
--- a/src/libP/src/meshVTU3D.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "mesh3D.h"
-
-void meshVTU3D(mesh3D* mesh, char* fileName)
-{
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
-
-  FILE* fp;
-  int* allNelements = (int*) calloc(size, sizeof(int));
-  int totalNelements = 0, maxNelements = 0;
-  dfloat* tmpEX, * tmpEY, * tmpEZ;
-
-  if(rank == 0) {
-    fp = fopen(fileName, "w");
-    printf("fp=%p\n for fileName=%s\n",fp, fileName);
-  }
-
-  // gather element counts to root
-  MPI_Allgather(&(mesh->Nelements), 1, MPI_INT,
-                allNelements, 1, MPI_INT,
-                mesh->comm);
-
-  if(rank == 0) {
-    for(int r = 0; r < size; ++r) {
-      totalNelements += allNelements[r];
-      maxNelements = mymax(maxNelements, allNelements[r]);
-    }
-
-    tmpEX = (dfloat*) calloc(maxNelements * mesh->Nverts, sizeof(dfloat));
-    tmpEY = (dfloat*) calloc(maxNelements * mesh->Nverts, sizeof(dfloat));
-    tmpEZ = (dfloat*) calloc(maxNelements * mesh->Nverts, sizeof(dfloat));
-
-    fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
-    fprintf(fp, "  <UnstructuredGrid>\n");
-    fprintf(fp,
-            "    <Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n",
-            totalNelements * mesh->Nverts,
-            totalNelements);
-
-    // write out nodes
-    fprintf(fp, "      <Points>\n");
-    fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
-
-    if(rank == 0) { // root writes out its coordinates
-      printf("printing local verts \n");
-      for(int e = 0; e < mesh->Nelements; ++e) {
-        fprintf(fp, "        ");
-        for(int n = 0; n < mesh->Nverts; ++n)
-          fprintf(fp, "%g %g %g\n",
-                  mesh->EX[e * mesh->Nverts + n],
-                  mesh->EY[e * mesh->Nverts + n],
-                  mesh->EZ[e * mesh->Nverts + n]);
-      }
-    }
-  }
-
-  for(int r = 1; r < size; ++r) {
-    if(rank == r) {
-      MPI_Send(mesh->EX, mesh->Nelements * mesh->Nverts,
-               MPI_DFLOAT, 0, 666, mesh->comm);
-      MPI_Send(mesh->EY, mesh->Nelements * mesh->Nverts,
-               MPI_DFLOAT, 0, 666, mesh->comm);
-      MPI_Send(mesh->EZ, mesh->Nelements * mesh->Nverts,
-               MPI_DFLOAT, 0, 666, mesh->comm);
-    }
-
-    if(rank == 0) {
-      MPI_Status status;
-      MPI_Recv(tmpEX, allNelements[r] * mesh->Nverts,
-               MPI_DFLOAT, r, 666, mesh->comm, &status);
-      MPI_Recv(tmpEY, allNelements[r] * mesh->Nverts,
-               MPI_DFLOAT, r, 666, mesh->comm, &status);
-      MPI_Recv(tmpEZ, allNelements[r] * mesh->Nverts,
-               MPI_DFLOAT, r, 666, mesh->comm, &status);
-
-      for(int e = 0; e < allNelements[r]; ++e) {
-        fprintf(fp, "        ");
-        for(int n = 0; n < mesh->Nverts; ++n)
-          fprintf(fp, "%g %g %g\n",
-                  tmpEX[e * mesh->Nverts + n],
-                  tmpEY[e * mesh->Nverts + n],
-                  tmpEZ[e * mesh->Nverts + n]);
-      }
-    }
-  }
-  if(rank == 0) {
-    free(tmpEX);
-    free(tmpEY);
-    free(tmpEZ);
-
-    fprintf(fp, "        </DataArray>\n");
-    fprintf(fp, "      </Points>\n");
-
-    // write out rank
-    fprintf(fp, "      <CellData Scalars=\"scalars\">\n");
-    fprintf(fp, "        <DataArray type=\"Int32\" Name=\"element_rank\" Format=\"ascii\">\n");
-
-    for(int r = 0; r < size; ++r)
-      for(int e = 0; e < allNelements[r]; ++e)
-        fprintf(fp, "         %d\n", r);
-
-    fprintf(fp, "       </DataArray>\n");
-    fprintf(fp, "     </CellData>\n");
-
-    fprintf(fp, "    <Cells>\n");
-    fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
-
-    int cnt = 0;
-    for(int r = 0; r < size; ++r)
-      for(int e = 0; e < allNelements[r]; ++e) {
-        for(int n = 0; n < mesh->Nverts; ++n) {
-          fprintf(fp, "%d ", cnt);
-          ++cnt;
-        }
-        fprintf(fp, "\n");
-      }
-
-    fprintf(fp, "        </DataArray>\n");
-
-    fprintf(fp, "        <DataArray type=\"Int32\" Name=\"offsets\" Format=\"ascii\">\n");
-    for(int e = 0; e < totalNelements; ++e) {
-      if(e % 10 == 0) fprintf(fp, "        ");
-      fprintf(fp, "%d ", (e + 1) * mesh->Nverts);
-      if(((e + 1) % 10 == 0) || (e == totalNelements - 1))
-        fprintf(fp, "\n");
-    }
-    fprintf(fp, "       </DataArray>\n");
-
-    fprintf(fp, "       <DataArray type=\"Int32\" Name=\"types\" Format=\"ascii\">\n");
-    for(int e = 0; e < totalNelements; ++e) {
-      if(e % 10 == 0) fprintf(fp, "        ");
-      fprintf(fp, "10 "); // need to choose type here !
-      if(((e + 1) % 10 == 0) || e == (totalNelements - 1))
-        fprintf(fp, "\n");
-    }
-    fprintf(fp, "        </DataArray>\n");
-    fprintf(fp, "      </Cells>\n");
-    fprintf(fp, "    </Piece>\n");
-    fprintf(fp, "  </UnstructuredGrid>\n");
-    fprintf(fp, "</VTKFile>\n");
-    fclose(fp);
-  }
-
-  MPI_Barrier(mesh->comm);
-}
diff --git a/src/libP/src/readArray.c b/src/libP/src/readArray.c
deleted file mode 100644
index a9fedbf6d..000000000
--- a/src/libP/src/readArray.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "mesh.h"
-
-void readDfloatArray(FILE* fp, const char* label, dfloat** A, int* Nrows, int* Ncols)
-{
-  char buf[BUFSIZ];
-  rewind(fp); // rewind to beginning
-
-  int flag = 0;
-  int status;
-  char* stat;
-
-  //search for label in file
-  while(fgets(buf, BUFSIZ, fp)) {
-    if (strstr(buf, label)) {
-      flag = 1;
-      break;
-    }
-  }
-  if (flag == 0) {
-    printf("ERROR: Unable to find label: '%s' in node file.\n", label);
-    exit(-1);
-  }
-
-  //if found read in number of rows and columns
-  status = fscanf(fp, "%d %d",  Nrows, Ncols);
-  stat = fgets(buf, BUFSIZ, fp); //read to end of line
-
-  *A = (dfloat*) calloc((*Nrows) * (*Ncols), sizeof(dfloat)); //allocate space
-  for(int n = 0; n < (*Nrows) * (*Ncols); ++n) //read matrix data
-    status = fscanf(fp, dfloatFormat, (*A) + n);
-}
-
-void readIntArray(FILE* fp, const char* label, int** A, int* Nrows, int* Ncols)
-{
-  char buf[BUFSIZ];
-  rewind(fp); // rewind to beginning
-
-  int flag = 0;
-  int status;
-  char* stat;
-
-  //search for label in file
-  while(fgets(buf, BUFSIZ, fp)) {
-    if (strstr(buf, label)) {
-      flag = 1;
-      break;
-    }
-  }
-  if (flag == 0) {
-    printf("ERROR: Unable to find label: '%s' in node file.\n", label);
-    exit(-1);
-  }
-
-  //if found read in number of rows and columns
-  status = fscanf(fp, "%d %d",  Nrows, Ncols);
-  stat = fgets(buf, BUFSIZ, fp); //read to end of line
-
-  *A = (int*) calloc((*Nrows) * (*Ncols), sizeof(int)); //allocate space
-  for(int n = 0; n < (*Nrows) * (*Ncols); ++n) //read matrix data
-    status = fscanf(fp, "%d", (*A) + n);
-}
diff --git a/src/libP/src/spaceFillingOrderings.cpp b/src/libP/src/spaceFillingOrderings.cpp
deleted file mode 100644
index 29120b2d5..000000000
--- a/src/libP/src/spaceFillingOrderings.cpp
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <stdlib.h>
-
-// taken from: http://and-what-happened.blogspot.com/2011/08/fast-2d-and-3d-hilbert-curves-and.html
-
-extern "C"
-{
-unsigned int MortonToHilbert2D( const unsigned int morton, const unsigned int bits )
-{
-  unsigned int hilbert = 0;
-  unsigned int remap = 0xb4;
-  unsigned int block = ( bits << 1 );
-  while( block ) {
-    block -= 2;
-    unsigned int mcode = ( ( morton >> block ) & 3 );
-    unsigned int hcode = ( ( remap >> ( mcode << 1 ) ) & 3 );
-    remap ^= ( 0x82000028 >> ( hcode << 3 ) );
-    hilbert = ( ( hilbert << 2 ) + hcode );
-  }
-  return hilbert;
-}
-unsigned int HilbertToMorton2D( const unsigned int hilbert, const unsigned int bits )
-{
-  unsigned int morton = 0;
-  unsigned int remap = 0xb4;
-  unsigned int block = ( bits << 1 );
-  while( block ) {
-    block -= 2;
-    unsigned int hcode = ( ( hilbert >> block ) & 3 );
-    unsigned int mcode = ( ( remap >> ( hcode << 1 ) ) & 3 );
-    remap ^= ( 0x330000cc >> ( hcode << 3 ) );
-    morton = ( ( morton << 2 ) + mcode );
-  }
-  return morton;
-}
-unsigned int MortonToHilbert3D( const unsigned int morton, const unsigned int bits )
-{
-  unsigned int hilbert = morton;
-  if( bits > 1 ) {
-    unsigned int block = ( ( bits * 3 ) - 3 );
-    unsigned int hcode = ( ( hilbert >> block ) & 7 );
-    unsigned int mcode, shift, signs;
-    shift = signs = 0;
-    while( block ) {
-      block -= 3;
-      hcode <<= 2;
-      mcode = ( ( 0x20212021 >> hcode ) & 3 );
-      shift = ( ( 0x48 >> ( 7 - shift - mcode ) ) & 3 );
-      signs = ( ( signs | ( signs << 3 ) ) >> mcode );
-      signs = ( ( signs ^ ( 0x53560300 >> hcode ) ) & 7 );
-      mcode = ( ( hilbert >> block ) & 7 );
-      hcode = mcode;
-      hcode = ( ( ( hcode | ( hcode << 3 ) ) >> shift ) & 7 );
-      hcode ^= signs;
-      hilbert ^= ( ( mcode ^ hcode ) << block );
-    }
-  }
-  hilbert ^= ( ( hilbert >> 1 ) & 0x92492492 );
-  hilbert ^= ( ( hilbert & 0x92492492 ) >> 1 );
-  return hilbert;
-}
-unsigned int HilbertToMorton3D( const unsigned int hilbert, const unsigned int bits )
-{
-  unsigned int morton = hilbert;
-  morton ^= ( ( morton & 0x92492492 ) >> 1 );
-  morton ^= ( ( morton >> 1 ) & 0x92492492 );
-  if( bits > 1 ) {
-    unsigned int block = ( ( bits * 3 ) - 3 );
-    unsigned int hcode = ( ( morton >> block ) & 7 );
-    unsigned int mcode, shift, signs;
-    shift = signs = 0;
-    while( block ) {
-      block -= 3;
-      hcode <<= 2;
-      mcode = ( ( 0x20212021 >> hcode ) & 3 );
-      shift = ( ( 0x48 >> ( 4 - shift + mcode ) ) & 3 );
-      signs = ( ( signs | ( signs << 3 ) ) >> mcode );
-      signs = ( ( signs ^ ( 0x53560300 >> hcode ) ) & 7 );
-      hcode = ( ( morton >> block ) & 7 );
-      mcode = hcode;
-      mcode ^= signs;
-      mcode = ( ( ( mcode | ( mcode << 3 ) ) >> shift ) & 7 );
-      morton ^= ( ( hcode ^ mcode ) << block );
-    }
-  }
-  return morton;
-}
-unsigned int Morton_2D_Encode_5bit( unsigned int index1, unsigned int index2 )
-{ // pack 2 5-bit indices into a 10-bit Morton code
-  index1 &= 0x0000001f;
-  index2 &= 0x0000001f;
-  index1 *= 0x01041041;
-  index2 *= 0x01041041;
-  index1 &= 0x10204081;
-  index2 &= 0x10204081;
-  index1 *= 0x00108421;
-  index2 *= 0x00108421;
-  index1 &= 0x15500000;
-  index2 &= 0x15500000;
-  return ( index1 >> 20 ) | ( index2 >> 19 );
-}
-void Morton_2D_Decode_5bit( const unsigned int morton, unsigned int& index1, unsigned int& index2 )
-{ // unpack 2 5-bit indices from a 10-bit Morton code
-  unsigned int value1 = morton;
-  unsigned int value2 = ( value1 >> 1 );
-  value1 &= 0x00000155;
-  value2 &= 0x00000155;
-  value1 |= ( value1 >> 1 );
-  value2 |= ( value2 >> 1 );
-  value1 &= 0x00000133;
-  value2 &= 0x00000133;
-  value1 |= ( value1 >> 2 );
-  value2 |= ( value2 >> 2 );
-  value1 &= 0x0000010f;
-  value2 &= 0x0000010f;
-  value1 |= ( value1 >> 4 );
-  value2 |= ( value2 >> 4 );
-  value1 &= 0x0000001f;
-  value2 &= 0x0000001f;
-  index1 = value1;
-  index2 = value2;
-}
-unsigned int Morton_2D_Encode_16bit( unsigned int index1, unsigned int index2 )
-{ // pack 2 16-bit indices into a 32-bit Morton code
-  index1 &= 0x0000ffff;
-  index2 &= 0x0000ffff;
-  index1 |= ( index1 << 8 );
-  index2 |= ( index2 << 8 );
-  index1 &= 0x00ff00ff;
-  index2 &= 0x00ff00ff;
-  index1 |= ( index1 << 4 );
-  index2 |= ( index2 << 4 );
-  index1 &= 0x0f0f0f0f;
-  index2 &= 0x0f0f0f0f;
-  index1 |= ( index1 << 2 );
-  index2 |= ( index2 << 2 );
-  index1 &= 0x33333333;
-  index2 &= 0x33333333;
-  index1 |= ( index1 << 1 );
-  index2 |= ( index2 << 1 );
-  index1 &= 0x55555555;
-  index2 &= 0x55555555;
-  return index1 | ( index2 << 1 );
-}
-void Morton_2D_Decode_16bit( const unsigned int morton, unsigned int& index1, unsigned int& index2 )
-{ // unpack 2 16-bit indices from a 32-bit Morton code
-  unsigned int value1 = morton;
-  unsigned int value2 = ( value1 >> 1 );
-  value1 &= 0x55555555;
-  value2 &= 0x55555555;
-  value1 |= ( value1 >> 1 );
-  value2 |= ( value2 >> 1 );
-  value1 &= 0x33333333;
-  value2 &= 0x33333333;
-  value1 |= ( value1 >> 2 );
-  value2 |= ( value2 >> 2 );
-  value1 &= 0x0f0f0f0f;
-  value2 &= 0x0f0f0f0f;
-  value1 |= ( value1 >> 4 );
-  value2 |= ( value2 >> 4 );
-  value1 &= 0x00ff00ff;
-  value2 &= 0x00ff00ff;
-  value1 |= ( value1 >> 8 );
-  value2 |= ( value2 >> 8 );
-  value1 &= 0x0000ffff;
-  value2 &= 0x0000ffff;
-  index1 = value1;
-  index2 = value2;
-}
-unsigned int Morton_3D_Encode_5bit( unsigned int index1, unsigned int index2, unsigned int index3 )
-{ // pack 3 5-bit indices into a 15-bit Morton code
-  index1 &= 0x0000001f;
-  index2 &= 0x0000001f;
-  index3 &= 0x0000001f;
-  index1 *= 0x01041041;
-  index2 *= 0x01041041;
-  index3 *= 0x01041041;
-  index1 &= 0x10204081;
-  index2 &= 0x10204081;
-  index3 &= 0x10204081;
-  index1 *= 0x00011111;
-  index2 *= 0x00011111;
-  index3 *= 0x00011111;
-  index1 &= 0x12490000;
-  index2 &= 0x12490000;
-  index3 &= 0x12490000;
-  return ( index1 >> 16 ) | ( index2 >> 15 ) | ( index3 >> 14 );
-}
-void Morton_3D_Decode_5bit( const unsigned int morton,
-                            unsigned int& index1, unsigned int& index2, unsigned int& index3 )
-{ // unpack 3 5-bit indices from a 15-bit Morton code
-  unsigned int value1 = morton;
-  unsigned int value2 = ( value1 >> 1 );
-  unsigned int value3 = ( value1 >> 2 );
-  value1 &= 0x00001249;
-  value2 &= 0x00001249;
-  value3 &= 0x00001249;
-  value1 |= ( value1 >> 2 );
-  value2 |= ( value2 >> 2 );
-  value3 |= ( value3 >> 2 );
-  value1 &= 0x000010c3;
-  value2 &= 0x000010c3;
-  value3 &= 0x000010c3;
-  value1 |= ( value1 >> 4 );
-  value2 |= ( value2 >> 4 );
-  value3 |= ( value3 >> 4 );
-  value1 &= 0x0000100f;
-  value2 &= 0x0000100f;
-  value3 &= 0x0000100f;
-  value1 |= ( value1 >> 8 );
-  value2 |= ( value2 >> 8 );
-  value3 |= ( value3 >> 8 );
-  value1 &= 0x0000001f;
-  value2 &= 0x0000001f;
-  value3 &= 0x0000001f;
-  index1 = value1;
-  index2 = value2;
-  index3 = value3;
-}
-unsigned int Morton_3D_Encode_10bit( unsigned int index1, unsigned int index2, unsigned int index3 )
-{ // pack 3 10-bit indices into a 30-bit Morton code
-  index1 &= 0x000003ff;
-  index2 &= 0x000003ff;
-  index3 &= 0x000003ff;
-  index1 |= ( index1 << 16 );
-  index2 |= ( index2 << 16 );
-  index3 |= ( index3 << 16 );
-  index1 &= 0x030000ff;
-  index2 &= 0x030000ff;
-  index3 &= 0x030000ff;
-  index1 |= ( index1 << 8 );
-  index2 |= ( index2 << 8 );
-  index3 |= ( index3 << 8 );
-  index1 &= 0x0300f00f;
-  index2 &= 0x0300f00f;
-  index3 &= 0x0300f00f;
-  index1 |= ( index1 << 4 );
-  index2 |= ( index2 << 4 );
-  index3 |= ( index3 << 4 );
-  index1 &= 0x030c30c3;
-  index2 &= 0x030c30c3;
-  index3 &= 0x030c30c3;
-  index1 |= ( index1 << 2 );
-  index2 |= ( index2 << 2 );
-  index3 |= ( index3 << 2 );
-  index1 &= 0x09249249;
-  index2 &= 0x09249249;
-  index3 &= 0x09249249;
-  return index1 | ( index2 << 1 ) | ( index3 << 2 );
-}
-void Morton_3D_Decode_10bit( const unsigned int morton,
-                             unsigned int& index1, unsigned int& index2, unsigned int& index3 )
-{ // unpack 3 10-bit indices from a 30-bit Morton code
-  unsigned int value1 = morton;
-  unsigned int value2 = ( value1 >> 1 );
-  unsigned int value3 = ( value1 >> 2 );
-  value1 &= 0x09249249;
-  value2 &= 0x09249249;
-  value3 &= 0x09249249;
-  value1 |= ( value1 >> 2 );
-  value2 |= ( value2 >> 2 );
-  value3 |= ( value3 >> 2 );
-  value1 &= 0x030c30c3;
-  value2 &= 0x030c30c3;
-  value3 &= 0x030c30c3;
-  value1 |= ( value1 >> 4 );
-  value2 |= ( value2 >> 4 );
-  value3 |= ( value3 >> 4 );
-  value1 &= 0x0300f00f;
-  value2 &= 0x0300f00f;
-  value3 &= 0x0300f00f;
-  value1 |= ( value1 >> 8 );
-  value2 |= ( value2 >> 8 );
-  value3 |= ( value3 >> 8 );
-  value1 &= 0x030000ff;
-  value2 &= 0x030000ff;
-  value3 &= 0x030000ff;
-  value1 |= ( value1 >> 16 );
-  value2 |= ( value2 >> 16 );
-  value3 |= ( value3 >> 16 );
-  value1 &= 0x000003ff;
-  value2 &= 0x000003ff;
-  value3 &= 0x000003ff;
-  index1 = value1;
-  index2 = value2;
-  index3 = value3;
-}
-
-unsigned int hilbert2D(unsigned int index1, unsigned int index2)
-{
-  unsigned int morton = Morton_2D_Encode_16bit(index1,index2);
-
-  return MortonToHilbert2D(morton, 16);
-}
-
-unsigned int morton2D(unsigned int index1, unsigned int index2)
-{
-  return Morton_2D_Encode_16bit(index1,index2);
-}
-}
diff --git a/src/libP/src/timer.c b/src/libP/src/timer.c
deleted file mode 100644
index 65e926d64..000000000
--- a/src/libP/src/timer.c
+++ /dev/null
@@ -1,535 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include "timer.h"
-
-namespace occa {
-  timerTraits::timerTraits()
-  {
-    timeTaken      = 0.0;
-    selfTime       = 0.0;
-    numCalls       = 0;
-    flopCount      = 0.0;
-    bandWidthCount = 0.0;
-    treeDepth      = 0;
-  }
-
-  timer::timer()
-  {
-    profileKernels     = false;
-    deviceInitialized  = false;
-    profileApplication = false;
-
-    /*
-       std::string profilerOn       = occa::env::var("OCCA_PROFILE");
-       std::string kernelProfilerOn = occa::env::var("OCCA_KERNEL_PROFILE");
-     */
-    std::string profilerOn = "1";
-    std::string kernelProfilerOn = "1";
-
-    if(profilerOn == "1")
-      profileApplication = true;
-
-    if(kernelProfilerOn == "1") {
-      profileKernels     = true;
-      profileApplication = true;
-    }
-  }
-
-  void timer::initTimer(const occa::device &deviceHandle)
-  {
-    deviceInitialized = true;
-
-    occaHandle = deviceHandle;
-  }
-
-  void timer::tic(std::string key)
-  {
-    if(profileApplication) {
-      keyStack.push(key);
-
-      int treeDepth = keyStack.size() - 1;
-      times[keyStack].treeDepth = treeDepth;
-
-      double currentTime = occa::currentTime();
-
-      timeStack.push(currentTime);
-
-      if(treeDepth) {
-        keyStack.pop();
-
-        // see if it was already in the child list
-        std::vector < std::string > ::iterator iter;
-        std::vector < std::string > *childs = &(times[keyStack].childs);
-        iter = std::find(childs->begin(), childs->end(), key);
-
-        if(iter == childs->end())
-          childs->push_back(key);
-
-        keyStack.push(key);
-      }
-    }
-  }
-
-  double timer::toc(std::string key)
-  {
-    double elapsedTime = 0.;
-
-    if(profileApplication) {
-      assert(key == keyStack.top());
-
-      double currentTime = occa::currentTime();
-      elapsedTime = (currentTime - timeStack.top());
-
-      times[keyStack].timeTaken += elapsedTime;
-      times[keyStack].numCalls++;
-
-      keyStack.pop();
-      timeStack.pop();
-    }
-
-    return elapsedTime;
-  }
-
-  double timer::toc(std::string key, occa::kernel &kernel)
-  {
-    double elapsedTime = 0.;
-
-    if(profileApplication) {
-      assert(key == keyStack.top());
-
-      //      OCCA_CHECK(key == keyStack.top(),
-      //                 "Error in timer " << key << '\n');
-
-      if(profileKernels) {
-        if(deviceInitialized)
-          occaHandle.finish();
-
-        double currentTime = occa::currentTime();
-        elapsedTime = (currentTime - timeStack.top());
-        // times[keyStack].timeTaken += kernel.timeTaken();
-        times[keyStack].timeTaken += elapsedTime;
-        times[keyStack].numCalls++;
-      }
-
-      keyStack.pop();
-      timeStack.pop();
-    }
-
-    return elapsedTime;
-  }
-
-  double timer::toc(std::string key, double flops)
-  {
-    double elapsedTime = 0.;
-
-    if(profileApplication) {
-      assert(key == keyStack.top());
-
-      double currentTime = occa::currentTime();
-      elapsedTime = (currentTime - timeStack.top());
-
-      times[keyStack].timeTaken += elapsedTime;
-      times[keyStack].numCalls++;
-      times[keyStack].flopCount += flops;
-
-      keyStack.pop();
-      timeStack.pop();
-    }
-
-    return elapsedTime;
-  }
-
-  double timer::toc(std::string key, occa::kernel &kernel, double flops)
-  {
-    double elapsedTime = 0.;
-
-    if(profileApplication) {
-      assert(key == keyStack.top());
-
-      if(profileKernels) {
-        occaHandle.finish();
-        double currentTime = occa::currentTime();
-        elapsedTime = (currentTime - timeStack.top());
-        // times[keyStack].timeTaken += kernel.timeTaken();
-        times[keyStack].timeTaken += elapsedTime;
-        times[keyStack].numCalls++;
-        times[keyStack].flopCount += flops;
-      }
-
-      keyStack.pop();
-      timeStack.pop();
-    }
-
-    return elapsedTime;
-  }
-
-  double timer::toc(std::string key, double flops, double bw)
-  {
-    double elapsedTime = 0.;
-
-    if(profileApplication) {
-      assert(key == keyStack.top());
-
-      double currentTime = occa::currentTime();
-      elapsedTime = (currentTime - timeStack.top());
-
-      times[keyStack].timeTaken += elapsedTime;
-      times[keyStack].numCalls++;
-      times[keyStack].flopCount += flops;
-      times[keyStack].bandWidthCount += bw;
-
-      dataTransferred += bw;
-
-      keyStack.pop();
-      timeStack.pop();
-    }
-
-    return elapsedTime;
-  }
-
-  double timer::toc(std::string key, occa::kernel &kernel,
-                    double flops, double bw)
-  {
-    double elapsedTime = 0.;
-
-    if(profileApplication) {
-      assert(key == keyStack.top());
-
-      if(profileKernels) {
-        occaHandle.finish();
-        double currentTime = occa::currentTime();
-        elapsedTime = (currentTime - timeStack.top());
-        // times[keyStack].timeTaken += kernel.timeTaken();
-        times[keyStack].timeTaken += elapsedTime;
-        times[keyStack].numCalls++;
-        times[keyStack].flopCount += flops;
-        times[keyStack].bandWidthCount += bw;
-      }
-
-      dataTransferred += bw;
-
-      keyStack.pop();
-      timeStack.pop();
-    }
-
-    return elapsedTime;
-  }
-
-  double timer::print_recursively(std::vector < std::string > &childs,
-                                  double parentTime,
-                                  double overallTime)
-  {
-    double sumChildrenTime = 0.0;
-
-    for(size_t i = 0; i < childs.size(); ++i) {
-      keyStack.push(childs[i]);
-
-      timerTraits* traits = &(times[keyStack]);
-
-      std::string stringName = "  ";
-      for(int j = 0; j < traits->treeDepth; j++) stringName.append(" ");
-
-      stringName.append("*");
-      stringName.append(keyStack.top());
-
-      double timeTaken = traits->timeTaken;
-
-      sumChildrenTime += timeTaken;
-
-      double invTimeTaken = (timeTaken > 1e-10) ? 1.0 / timeTaken : 0.;
-
-      std::cout << std::left << std::setw(30) << stringName
-                << std::right << std::setw(10) << std::setprecision(3) << timeTaken
-                << std::right << std::setw(10) << traits->numCalls
-                << std::right << std::setw(10) << std::setprecision(3) << 100. * timeTaken /
-      parentTime
-                << std::right << std::setw(10) << std::setprecision(3) << 100. * timeTaken /
-      overallTime
-                << std::right << std::setw(10) << std::setprecision(3) << traits->flopCount *
-      invTimeTaken / 1e9
-                << std::right << std::setw(10) << std::setprecision(3) << traits->bandWidthCount *
-      invTimeTaken / 1e9
-                << std::endl;
-
-      traits->selfTime -= print_recursively(traits->childs, timeTaken, overallTime);
-
-      keyStack.pop();
-    }
-
-    return sumChildrenTime;
-  }
-
-  static bool compareSelfTimes(std::pair < std::string, timerTraits > a,
-                               std::pair < std::string, timerTraits > b)
-  {
-    return a.second.selfTime > b.second.selfTime;
-  }
-
-  void timer::printTimer()
-  {
-    if(profileApplication) {
-      std::map < std::stack < std::string >, timerTraits > ::iterator iter;
-
-      // compute overall time
-      double overallTime = 0.;
-      for(iter = times.begin(); iter != times.end(); iter++) {
-        iter->second.selfTime = iter->second.timeTaken;
-        if(iter->second.treeDepth == 0)
-          overallTime += iter->second.timeTaken;
-      }
-
-      std::cout << "********************************************************"
-                << "**********************************" << std::endl;
-      std::cout << "Profiling info: " << std::endl;
-      std::cout << std::left << std::setw(30) << "Name"
-                << std::right << std::setw(10) << "time spent"
-                << std::right << std::setw(10) << "# calls"
-                << std::right << std::setw(10) << "% time"
-                << std::right << std::setw(10) << "% total"
-                << std::right << std::setw(10) << "gflops "
-                << std::right << std::setw(10) << "bwidth"
-                << std::endl;
-
-      std::cout << "--------------------------------------------------------"
-                << "----------------------------------" << std::endl;
-
-      for(iter = times.begin(); iter != times.end(); iter++)
-        if(iter->second.treeDepth == 0) {
-          keyStack = iter->first;
-
-          timerTraits* traits = &(iter->second);
-
-          std::string stringName = " *";
-          stringName.append(keyStack.top());
-
-          double timeTaken = traits->timeTaken;
-
-          double invTimeTaken = (timeTaken > 1e-10) ? 1.0 / timeTaken : 0.;
-
-          std::cout << std::left << std::setw(30) << stringName
-                    << std::right << std::setw(10) << std::setprecision(3) << timeTaken
-                    << std::right << std::setw(10) << traits->numCalls
-                    << std::right << std::setw(10) << std::setprecision(3) << 100.0
-                    << std::right << std::setw(10) << std::setprecision(3) << 100 * timeTaken /
-          overallTime
-                    << std::right << std::setw(10) << std::setprecision(3) << traits->flopCount *
-          invTimeTaken / 1e9
-                    << std::right << std::setw(10) << std::setprecision(3) <<
-          traits->bandWidthCount * invTimeTaken / 1e9
-                    << std::endl;
-
-          traits->selfTime -= print_recursively(iter->second.childs, timeTaken, overallTime);
-        }
-
-
-      std::map < std::string, timerTraits > flat;
-
-      // flat profile
-      for(iter = times.begin(); iter != times.end(); iter++) {
-        std::string key = iter->first.top();
-
-        timerTraits* traits = &(iter->second);
-
-        timerTraits* targetTraits = &(flat[key]);
-
-        targetTraits->timeTaken += traits->timeTaken;
-        targetTraits->selfTime += traits->selfTime;
-        targetTraits->numCalls += traits->numCalls;
-        targetTraits->flopCount += traits->flopCount;
-        targetTraits->bandWidthCount += traits->bandWidthCount;
-      }
-
-      std::vector < std::pair < std::string, timerTraits >> flatVec(flat.size());
-
-      std::map < std::string, timerTraits > ::iterator iter2 = flat.begin();
-
-      for(size_t i = 0; i < flat.size(); ++i) {
-        flatVec[i].first = iter2->first;
-
-        flatVec[i].second.timeTaken = iter2->second.timeTaken;
-        flatVec[i].second.numCalls = iter2->second.numCalls;
-        flatVec[i].second.selfTime = iter2->second.selfTime;
-        flatVec[i].second.flopCount = iter2->second.flopCount;
-        flatVec[i].second.bandWidthCount = iter2->second.bandWidthCount;
-
-        iter2++;
-      }
-
-      // sort
-      std::sort(flatVec.begin(), flatVec.end(), compareSelfTimes);
-
-      // write the flat profiling info
-      std::cout << "********************************************************"
-                << "**********************************" << std::endl;
-
-      std::cout << "Profiling summary: " << std::endl;
-
-      std::cout << std::left << std::setw(30) << "Name"
-                << std::right << std::setw(10) << "time"
-                << std::right << std::setw(10) << "self time"
-                << std::right << std::setw(10) << "# calls"
-                << std::right << std::setw(10) << "% time"
-                << std::right << std::setw(10) << "gflops "
-                << std::right << std::setw(10) << "bwidth"
-                << std::endl;
-
-      std::cout << "--------------------------------------------------------"
-                << "----------------------------------" << std::endl;
-
-      std::vector < std::pair < std::string, timerTraits >> ::iterator iter1;
-      for(iter1 = flatVec.begin(); iter1 != flatVec.end(); iter1++) {
-        timerTraits* traits = &(iter1->second);
-        double timeTaken = traits->timeTaken;
-        double invTimeTaken = (timeTaken > 1e-10) ? 1.0 / timeTaken : 0.;
-        std::cout << std::left << std::setw(30) << iter1->first
-                  << std::right << std::setw(10) << std::setprecision(3) << traits->timeTaken
-                  << std::right << std::setw(10) << std::setprecision(3) << traits->selfTime
-                  << std::right << std::setw(10) << traits->numCalls
-                  << std::right << std::setw(10) << std::setprecision(3) << 100 * traits->selfTime /
-        overallTime
-                  << std::right << std::setw(10) << std::setprecision(3) << traits->flopCount *
-        invTimeTaken / 1e9
-                  << std::right << std::setw(10) << std::setprecision(3) << traits->bandWidthCount *
-        invTimeTaken / 1e9
-                  << std::endl;
-      }
-
-      std::cout << "********************************************************"
-                << "**********************************" << std::endl;
-    }
-  }
-
-  timer globalTimer;
-
-  double dataTransferred = 0.;
-
-  void initTimer(const occa::device &deviceHandle)
-  {
-    globalTimer.initTimer(deviceHandle);
-  }
-
-  void tic(std::string key)
-  {
-    globalTimer.tic(key);
-  }
-
-  double toc(std::string key)
-  {
-    return globalTimer.toc(key);
-  }
-
-  double toc(std::string key, occa::kernel &kernel)
-  {
-    return globalTimer.toc(key, kernel);
-  }
-
-  double toc(std::string key, double fp)
-  {
-    return globalTimer.toc(key, fp);
-  }
-
-  double toc(std::string key, occa::kernel &kernel, double fp)
-  {
-    return globalTimer.toc(key, kernel, fp);
-  }
-
-  double toc(std::string key, double fp, double bw)
-  {
-    return globalTimer.toc(key, fp, bw);
-  }
-
-  double toc(std::string key, occa::kernel &kernel, double fp, double bw)
-  {
-    return globalTimer.toc(key, kernel, fp, bw);
-  }
-
-  void printTimer()
-  {
-    globalTimer.printTimer();
-  }
-
-  double currentTime()
-  {
-#if (OCCA_OS & LINUX_OS)
-
-    timespec ct;
-    clock_gettime(CLOCK_MONOTONIC, &ct);
-
-    return (double) (ct.tv_sec + (1.0e-9 * ct.tv_nsec));
-
-#elif (OCCA_OS == OSX_OS)
-#  ifdef __clang__
-    uint64_t ct;
-    ct = mach_absolute_time();
-
-    const Nanoseconds ct2 = AbsoluteToNanoseconds(*(AbsoluteTime*) &ct);
-
-    return ((double) 1.0e-9) * ((double) ( *((uint64_t*) &ct2) ));
-#  else
-    clock_serv_t cclock;
-    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
-
-    mach_timespec_t ct;
-    clock_get_time(cclock, &ct);
-
-    mach_port_deallocate(mach_task_self(), cclock);
-
-    return (double) (ct.tv_sec + (1.0e-9 * ct.tv_nsec));
-#  endif
-#elif (OCCA_OS == WINDOWS_OS)
-    static LARGE_INTEGER freq;
-    static bool haveFreq = false;
-
-    if (!haveFreq) {
-      QueryPerformanceFrequency(&freq);
-      haveFreq = true;
-    }
-
-    LARGE_INTEGER ct;
-
-    QueryPerformanceCounter(&ct);
-
-    return ((double) (ct.QuadPart)) / ((double) (freq.QuadPart));
-#endif
-  }
-}
-
-void occaTimerTic(occa::device device,std::string name)
-{
-#if 0
-  device.finish();
-  occa::tic(name);
-#endif
-}
-
-void occaTimerToc(occa::device device,std::string name)
-{
-#if 0
-  device.finish();
-  occa::toc(name);
-#endif
-}
diff --git a/src/libP/src/trace.c b/src/libP/src/trace.c
deleted file mode 100644
index 1f7b7bb5f..000000000
--- a/src/libP/src/trace.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
-
-   The MIT License (MIT)
-
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
- */
-
-#include <trace.hpp>
-
-using std::string;
-// GNU specialization. Not portable.
-
-ostream &trace(ostream &stream, int stack_size)
-{
-  void* array[stack_size];
-
-  int length = backtrace (array, stack_size);
-  char** trace = backtrace_symbols (array, length);
-
-  //  for( size_t i=1; i<length; ++i ){
-  //    cout << trace[i] << endl;
-  //  }
-  //  return stream;
-
-  string binary;
-
-  // start from 1, since the first entry is the traced_exception ctor
-  for( size_t i = 1; i < length; ++i ) {
-    char* line = trace[i];
-    char* lb = strchr(line, '(');
-    char* plus = 0;
-    char* rb = 0;
-    char* demangled = 0;
-    if( lb != 0 ) {
-      if( binary != string( line, lb - line ) ) {
-        binary = string( line, lb - line );
-        stream << "in " << binary << ":\n";
-      }
-      *lb = '\0';
-      plus = strchr(lb + 1,'+');
-      if( plus != 0 ) {
-        *plus = '\0';
-        int status;
-        demangled = abi::__cxa_demangle(lb + 1, 0, 0, &status);
-        rb = strchr(plus + 1, ')');
-      }else{
-        rb = strchr(lb + 1, ')');
-      }
-      if( rb != 0 )
-        *rb = '\0';
-    }
-    if( lb == 0 ) {
-      stream << line;
-    }else{
-      if( plus != 0 )
-        stream << "+" << (plus + 1);
-      if( demangled != 0 ) {
-        stream << '\t' << demangled;
-        free(demangled);
-      }else if( lb != 0 && rb > lb ) {
-        stream << '\t' << (lb + 1) << "()";
-      }
-    }
-    stream << '\n';
-  }
-
-  return stream;
-}
diff --git a/src/linAlg/linAlg.cpp b/src/linAlg/linAlg.cpp
index f00ae6fa7..e83e4d4d4 100644
--- a/src/linAlg/linAlg.cpp
+++ b/src/linAlg/linAlg.cpp
@@ -32,16 +32,16 @@ void linAlg_t::setup() {
   MPI_Comm_rank(comm, &rank);
 
   //add defines
-  kernelInfo["defines/" "p_blockSize"] = (int)blocksize;
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
 
   //pinned scratch buffer
   {
     occa::properties props = kernelInfo;
     props["mapped"] = true;
-    h_scratch = device.malloc(blocksize*sizeof(dfloat), props);
+    h_scratch = device.malloc(BLOCKSIZE*sizeof(dfloat), props);
     scratch = (dfloat*) h_scratch.ptr(props);
   }
-  o_scratch = device.malloc(blocksize*sizeof(dfloat));
+  o_scratch = device.malloc(BLOCKSIZE*sizeof(dfloat));
 
   string oklDir;
   oklDir.assign(getenv("NEKRS_INSTALL_DIR"));
@@ -210,8 +210,8 @@ void linAlg_t::axdyz(const dlong N, const dfloat alpha,
 
 // \sum o_a
 dfloat linAlg_t::sum(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
-  int Nblock = (N+blocksize-1)/blocksize;
-  Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
+  int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
+  Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
 
   sumKernel(Nblock, N, o_a, o_scratch);
 
@@ -230,8 +230,8 @@ dfloat linAlg_t::sum(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
 
 // \min o_a
 dfloat linAlg_t::min(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
-  int Nblock = (N+blocksize-1)/blocksize;
-  Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
+  int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
+  Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
 
   minKernel(Nblock, N, o_a, o_scratch);
 
@@ -249,8 +249,8 @@ dfloat linAlg_t::min(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
 
 // \max o_a
 dfloat linAlg_t::max(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
-  int Nblock = (N+blocksize-1)/blocksize;
-  Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
+  int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
+  Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
 
   maxKernel(Nblock, N, o_a, o_scratch);
 
@@ -270,8 +270,8 @@ dfloat linAlg_t::max(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
 // ||o_a||_2
 /*
 dfloat linAlg_t::norm2(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
-  int Nblock = (N+blocksize-1)/blocksize;
-  Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
+  int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
+  Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
 
   norm2Kernel(Nblock, N, o_a, o_scratch);
 
@@ -292,8 +292,8 @@ dfloat linAlg_t::norm2(const dlong N, occa::memory& o_a, MPI_Comm _comm) {
 // o_x.o_y
 dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
                            MPI_Comm _comm) {
-  int Nblock = (N+blocksize-1)/blocksize;
-  Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
+  int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
+  Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
 
   innerProdKernel(Nblock, N, o_x, o_y, o_scratch);
 
@@ -314,8 +314,8 @@ dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
 dfloat linAlg_t::weightedInnerProd(const dlong N, occa::memory& o_w,
                                    occa::memory& o_x, occa::memory& o_y,
                                    MPI_Comm _comm) {
-  int Nblock = (N+blocksize-1)/blocksize;
-  Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
+  int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
+  Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
 
   weightedInnerProdKernel(Nblock, N, o_w, o_x, o_y, o_scratch);
 
@@ -335,8 +335,8 @@ dfloat linAlg_t::weightedInnerProd(const dlong N, occa::memory& o_w,
 // ||o_a||_w2
 dfloat linAlg_t::weightedNorm2(const dlong N, occa::memory& o_w,
                                occa::memory& o_a, MPI_Comm _comm) {
-  int Nblock = (N+blocksize-1)/blocksize;
-  Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
+  int Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
+  Nblock = (Nblock>BLOCKSIZE) ? BLOCKSIZE : Nblock; //limit to BLOCKSIZE entries
 
   weightedNorm2Kernel(Nblock, N, o_w, o_a, o_scratch);
 
diff --git a/src/linAlg/linAlg.hpp b/src/linAlg/linAlg.hpp
index 192de323d..d48abf994 100644
--- a/src/linAlg/linAlg.hpp
+++ b/src/linAlg/linAlg.hpp
@@ -27,9 +27,7 @@ SOFTWARE.
 #ifndef LINALG_HPP
 #define LINALG_HPP
 
-#include "mpi.h"
-#include "occa.hpp"
-#include "types.h"
+#include "nrssys.hpp"
 
 using std::string;
 
@@ -48,7 +46,7 @@ class linAlg_t {
   void setup();
 public:
   linAlg_t(occa::device& _device, occa::properties*& _kernelInfo, MPI_Comm& _comm) {
-    blocksize = 256; 
+    blocksize = BLOCKSIZE;
     device = _device;
     kernelInfo = *(_kernelInfo);
     comm = _comm;
diff --git a/src/libP/src/matrix.c b/src/linAlg/matrix.cpp
similarity index 97%
rename from src/libP/src/matrix.c
rename to src/linAlg/matrix.cpp
index 3183863e0..e81ca319f 100644
--- a/src/libP/src/matrix.c
+++ b/src/linAlg/matrix.cpp
@@ -27,7 +27,8 @@
 #include "matrix.hpp"
 
 template < >
-void matrix < float > ::symeig(matrix < float > &W, matrix < float > &V){
+void matrix < float > ::symeig(matrix < float > &W, matrix < float > &V)
+{
   V = *this;
   W.resize(Nrows,1);
 
@@ -46,7 +47,8 @@ void matrix < float > ::symeig(matrix < float > &W, matrix < float > &V){
 }
 
 template < >
-void matrix < double > ::symeig(matrix < double > &W, matrix < double > &V){
+void matrix < double > ::symeig(matrix < double > &W, matrix < double > &V)
+{
   V = *this;
   W.resize(Nrows,1);
 
@@ -66,7 +68,8 @@ void matrix < double > ::symeig(matrix < double > &W, matrix < double > &V){
 
 template < >
 void matrix < float > ::eig(matrix < float > &WR, matrix < float > &WI,
-                            matrix < float > &VL, matrix < float > &VR){
+                            matrix < float > &VL, matrix < float > &VR)
+{
   matrix < float > A = *this;
   VL = *this;
   VR = *this;
@@ -89,7 +92,8 @@ void matrix < float > ::eig(matrix < float > &WR, matrix < float > &WI,
 
 template < >
 void matrix < double > ::eig(matrix < double > &WR, matrix < double > &WI,
-                             matrix < double > &VL, matrix < double > &VR){
+                             matrix < double > &VL, matrix < double > &VR)
+{
   matrix < double > A = *this;
   VL = *this;
   VR = *this;
@@ -112,7 +116,8 @@ void matrix < double > ::eig(matrix < double > &WR, matrix < double > &WI,
 
 // general left matrix inverse not implemented
 template < >
-matrix < double > operator | (const matrix < double > &A, const matrix < double > &B){
+matrix < double > operator | (const matrix < double > &A, const matrix < double > &B)
+{
   matrix < double > C = B;
   matrix < double > Acopy = A;
 
@@ -143,7 +148,8 @@ matrix < double > operator | (const matrix < double > &A, const matrix < double
 
 // general left matrix inverse not implemented
 template < >
-matrix < float > operator | (const matrix < float > &A, const matrix < float > &B){
+matrix < float > operator | (const matrix < float > &A, const matrix < float > &B)
+{
   matrix < float > C = B;
   matrix < float > Acopy = A;
 
diff --git a/src/libP/include/matrix.hpp b/src/linAlg/matrix.hpp
similarity index 57%
rename from src/libP/include/matrix.hpp
rename to src/linAlg/matrix.hpp
index e7b67cb61..1091b9dab 100644
--- a/src/libP/include/matrix.hpp
+++ b/src/linAlg/matrix.hpp
@@ -1,28 +1,28 @@
 /*
 
-The MIT License (MIT)
+   The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
 
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
 
-*/
+ */
 
 #ifndef __MATRIX
 #define __MATRIX
@@ -36,12 +36,12 @@ SOFTWARE.
 static int clarrayrequest = 0;
 
 template <class T>
-class matrix {
-
+class matrix
+{
 private:
   int Nrows;
   int Ncolumns;
-  T *data;
+  T* data;
 
   void allocate();
 
@@ -60,7 +60,7 @@ class matrix {
   void operator += (const matrix <T> &A);
 
   // assignment operator from double **array (assumes initialized)
-  matrix <T> operator= (const T *A);
+  matrix <T> operator= (const T* A);
 
   // assignment operator (all values set to d)
   matrix <T> operator= (const double &d);
@@ -71,7 +71,7 @@ class matrix {
 
   int ncolumns() const;
 
-  T *c_array();
+  T* c_array();
 
   void resize(int nr, int nc);
 
@@ -80,21 +80,21 @@ class matrix {
   matrix <T> transpose();
 
   // column major serial access - 1-indexed
-  T operator[] (int r) const ;
+  T operator[] (int r) const;
 
   // column major serial access - 1-indexed
-  T & operator[] (int r) ;
+  T & operator[] (int r);
 
-  T operator() (int r, int c) const ;
+  T operator() (int r, int c) const;
 
-  T & operator() (int r, int c) ;
+  T & operator() (int r, int c);
 
   T operator() (int r) const;
 
-  T & operator() (int r) ;
+  T & operator() (int r);
 
   // permute
-  matrix <T> operator[] (const matrix <int> &ind)  ;
+  matrix <T> operator[] (const matrix <int> &ind);
 
   // in-place sort in ascending order
   void slow_sort(int index);
@@ -103,7 +103,7 @@ class matrix {
   void randomize();
 
   // sort columns using user supplied comparison function
-  void sort( int (*compare)(const void *, const void *) );
+  void sort( int (* compare)(const void*, const void*) );
 
   void symeig(matrix <T> &d, matrix <T> &v);
 
@@ -124,7 +124,6 @@ class matrix {
   T minentry() const;
 
   matrix <T> inverse();
-
 };
 
 using namespace std;
@@ -149,7 +148,6 @@ matrix <T> operator| (const matrix <T> & A, const matrix <T> &B);
 #define imatrix matrix<int>
 #define dmatrix matrix<double>
 
-
 #include "matrix.tpp"
 
 #endif
diff --git a/src/libP/include/matrix.tpp b/src/linAlg/matrix.tpp
similarity index 100%
rename from src/libP/include/matrix.tpp
rename to src/linAlg/matrix.tpp
diff --git a/src/libP/src/matrixConditionNumber.c b/src/linAlg/matrixConditionNumber.cpp
similarity index 100%
rename from src/libP/src/matrixConditionNumber.c
rename to src/linAlg/matrixConditionNumber.cpp
diff --git a/src/libP/src/meshPartitionStatistics.c b/src/linAlg/matrixEig.cpp
similarity index 50%
rename from src/libP/src/meshPartitionStatistics.c
rename to src/linAlg/matrixEig.cpp
index bd9114be3..72e529039 100644
--- a/src/libP/src/meshPartitionStatistics.c
+++ b/src/linAlg/matrixEig.cpp
@@ -2,7 +2,7 @@
 
    The MIT License (MIT)
 
-   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+   Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -24,48 +24,61 @@
 
  */
 
-#include <stdio.h>
+#include <unistd.h>
+#include <math.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include "mesh.h"
 
-void meshPartitionStatistics(mesh_t* mesh)
+extern "C" {
+void dgeev_(char* JOBVL, char* JOBVR, int* N, double* A, int* LDA, double* WR, double* WI,
+            double* VL, int* LDVL, double* VR, int* LDVR, double* WORK, int* LWORK, int* INFO );
+}
+
+// Compute the eigenvalues and right eigenvectors of the N x N matrix A with LAPACK dgeev.
+//   A  - input, N*N entries in row-major order (only read; a transposed copy is handed to LAPACK)
+//   VR - output, N*N entries: dgeev's VR array transposed back into the same row-major layout as A
+//   WR - output, N entries: real parts of the eigenvalues
+//   WI - output, N entries: imaginary parts of the eigenvalues
+// The LAPACK call always runs in double precision on temporary buffers that are released before returning.
+void matrixEig(int N, dfloat* A, dfloat* VR, dfloat* WR, dfloat* WI)
 {
-  /* get MPI rank and size */
-  int rank, size;
-  rank = mesh->rank;
-  size = mesh->size;
+  char JOBVL = 'N'; // left eigenvectors are not requested
+  char JOBVR = 'V'; // right eigenvectors are requested
+  int LDA = N;
+  int LDVL = N;
+  int LDVR = N;
+  int LWORK = 8 * N; // dgeev requires LWORK >= 4*N when eigenvectors are computed
 
-  /* now gather statistics on connectivity between processes */
-  int* comms = (int*) calloc(size, sizeof(int));
-  int Ncomms = 0;
+  double* tmpA  = (double*) calloc(N * N,sizeof(double));
+  double* tmpWR = (double*) calloc(N,sizeof(double));
+  double* tmpWI = (double*) calloc(N,sizeof(double));
+  double* tmpVR = (double*) calloc(N * N,sizeof(double));
+  double* tmpVL = NULL;
+  double* WORK  = (double*) calloc(LWORK,sizeof(double));
 
-  /* count elements with neighbors on each other rank ranks */
-  for(dlong e = 0; e < mesh->Nelements; ++e)
-    for(int f = 0; f < mesh->Nfaces; ++f)
-      if(mesh->EToP[e * mesh->Nfaces + f] != -1) {
-        ++comms[mesh->EToP[e * mesh->Nfaces + f]];
-        ++Ncomms;
-      }
+  int info;
 
-  int Nmessages = 0;
-  for(int r = 0; r < size; ++r)
-    if(comms[r] > 0)
-      ++Nmessages;
+  for(int n = 0; n < N; ++n)
+    for(int m = 0; m < N; ++m)
+      tmpA[n + m * N] = A[n * N + m]; // transpose: LAPACK expects column-major storage
 
-  for(int r = 0; r < size; ++r) {
-    MPI_Barrier(mesh->comm);
-    if(r == rank) {
-      fflush(stdout);
-      printf("r: %02d [", rank);
-      for(int s = 0; s < size; ++s)
-        printf(" %04d", comms[s]);
-      printf("] (Nelements=" dlongFormat ", Nmessages=%d, Ncomms=%d)\n",
-             mesh->Nelements,
-             Nmessages,
-             Ncomms);
-      fflush(stdout);
-    }
-  }
+  dgeev_ (&JOBVL, &JOBVR, &N, tmpA, &LDA, tmpWR, tmpWI, tmpVL, &LDVL, tmpVR, &LDVR, WORK, &LWORK, &info);
 
-  free(comms);
+  for(int n = 0; n < N; ++n) {
+    WR[n] = tmpWR[n];
+    WI[n] = tmpWI[n];
+    for(int m = 0; m < N; ++m)
+      VR[n + m * N] = tmpVR[n * N + m];
+  }
+
+  if(info)
+    printf("matrixEig: dgeev reports info = %d\n", info);
+
+  // release the temporary LAPACK buffers (tmpVL is NULL, nothing to free there)
+  free(WORK);
+  free(tmpVR);
+  free(tmpWI);
+  free(tmpWR);
+  free(tmpA);
 }
diff --git a/src/libP/src/matrixInverse.c b/src/linAlg/matrixInverse.cpp
similarity index 100%
rename from src/libP/src/matrixInverse.c
rename to src/linAlg/matrixInverse.cpp
diff --git a/src/linAlg/matrixRightSolve.cpp b/src/linAlg/matrixRightSolve.cpp
new file mode 100644
index 000000000..6e9ef0acc
--- /dev/null
+++ b/src/linAlg/matrixRightSolve.cpp
@@ -0,0 +1,81 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#include <unistd.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "mesh.h"
+
+extern "C" {
+void dgesv_ ( int* N, int* NRHS, double* A,
+              int* LDA,
+              int* IPIV,
+              double* B,
+              int* LDB,
+              int* INFO );
+}
+
+// C = A/B  = trans(trans(B)\trans(A))
+// A, B and C are row major. A row-major array read as column major is its
+// transpose, so dgesv is handed trans(B) and trans(A) and leaves trans(C) in
+// column-major order, i.e. C in row-major order, in the right-hand-side buffer.
+// NOTE(review): dgesv treats its matrix as NrowsX x NrowsX, so B is presumably square - confirm callers.
+void matrixRightSolve(int NrowsA, int NcolsA, dfloat* A, int NrowsB, int NcolsB, dfloat* B, dfloat* C)
+{
+  int info;
+
+  int NrowsX = NcolsB;
+  int NcolsX = NrowsB;
+
+  int NrowsY = NcolsA;
+  int NcolsY = NrowsA;
+
+  // double precision working copies: dgesv overwrites both of its array arguments
+  double* tmpX = (double*) calloc(NrowsX * NcolsX, sizeof(double));
+  double* tmpY = (double*) calloc(NrowsY * NcolsY, sizeof(double));
+
+  int* ipiv = (int*) calloc(NrowsX, sizeof(int)); // pivot indices written by dgesv
+
+  for(int n = 0; n < NrowsX * NcolsX; ++n)
+    tmpX[n] = B[n];
+
+  for(int n = 0; n < NrowsY * NcolsY; ++n)
+    tmpY[n] = A[n];
+
+  // arguments: N = NrowsX, NRHS = NcolsY, LDA = NrowsX, LDB = NrowsY (dgesv takes no workspace array)
+  dgesv_(&NrowsX, &NcolsY, tmpX, &NrowsX, ipiv, tmpY, &NrowsY, &info);
+
+  for(int n = 0; n < NrowsY * NcolsY; ++n)
+    C[n] = tmpY[n];
+
+  if(info)
+    printf("matrixRightSolve: dgesv reports info = %d when inverting matrix\n", info);
+
+  free(ipiv);
+  free(tmpX);
+  free(tmpY);
+}
diff --git a/src/libP/src/occaHostMallocPinned.c b/src/linAlg/matrixTranspose.cpp
similarity index 63%
rename from src/libP/src/occaHostMallocPinned.c
rename to src/linAlg/matrixTranspose.cpp
index 28a1f49b4..4e3330d65 100644
--- a/src/libP/src/occaHostMallocPinned.c
+++ b/src/linAlg/matrixTranspose.cpp
@@ -25,27 +25,31 @@
  */
 
 #include <unistd.h>
+#include <math.h>
+#include <stdlib.h>
 #include <stdio.h>
-#include "occa.hpp"
-#include "mesh.h"
-
-void* occaHostMallocPinned(occa::device &device,
-                           size_t size,
-                           void* source,
-                           occa::memory &mem,
-                           occa::memory &h_mem)
-{
-  occa::properties props;
-  props["mapped"] = true;
-
-  if(source != NULL)
-    mem =  device.malloc(size, source);
-  else
-    mem =  device.malloc(size);
-
-  h_mem =  device.malloc(size, props);
+#include <mesh.h>
 
-  void* ptr = h_mem.ptr(props);
-
-  return ptr;
-}
+void matrixTranspose(const int M, const int N,
+                     const dfloat* A, const int LDA,
+                     dfloat* AT, const int LDAT)
+{
+  //Writes the transpose of the M x N matrix A into AT; A & A^T use row major ordering
+  //M = number of rows of A, columns of A^T
+  //N = number of columns of A, rows of A^T
+  //LDA  - leading dimension (row stride) of A (>=N, enforced by the check below)
+  //LDAT - leading dimension (row stride) of A^T (>=M, enforced by the check below)
+
+  //quick return: nothing to copy when either dimension is empty
+  if (N < 1 || M < 1) return;
+
+  //reject strides shorter than a row; only a message is printed and AT is left untouched
+  if (LDA < N || LDAT < M) {
+    printf("Bad input to matrixTranspose\n");
+    return;
+  }
+
+  for (int n = 0; n < N; n++) //for all rows of A^T
+    for (int m = 0; m < M; m++) //for all cols of A^T
+      AT[n * LDAT + m] = A[m * LDA + n];
+}
\ No newline at end of file
diff --git a/src/libP/solvers/elliptic/src/NBFPCG.c b/src/linearSolver/NBFPCG.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/NBFPCG.c
rename to src/linearSolver/NBFPCG.cpp
diff --git a/src/libP/solvers/elliptic/src/NBPCG.c b/src/linearSolver/NBPCG.cpp
similarity index 100%
rename from src/libP/solvers/elliptic/src/NBPCG.c
rename to src/linearSolver/NBPCG.cpp
diff --git a/src/libP/solvers/elliptic/src/PCG.c b/src/linearSolver/PCG.cpp
similarity index 97%
rename from src/libP/solvers/elliptic/src/PCG.c
rename to src/linearSolver/PCG.cpp
index d5df3a5a9..7946674fb 100644
--- a/src/libP/solvers/elliptic/src/PCG.c
+++ b/src/linearSolver/PCG.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "elliptic.h"
-#include <timer.hpp>
+#include "timer.hpp"
 
 int pcg(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x,
         const dfloat tol, const int MAXIT)
@@ -54,7 +54,7 @@ int pcg(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x,
   occa::memory &o_Ap = elliptic->o_Ap;
   occa::memory &o_weight = elliptic->o_invDegree;
 
-  elliptic->fillKernel(elliptic->Nfields*elliptic->Ntotal, 0.0, o_p);
+  elliptic->fillKernel(elliptic->Nfields * elliptic->Ntotal, 0.0, o_p);
 
   pAp = 0;
   rdotz1 = 1;
diff --git a/src/lns/tombo.cpp b/src/lns/tombo.cpp
new file mode 100644
index 000000000..26bbe75ac
--- /dev/null
+++ b/src/lns/tombo.cpp
@@ -0,0 +1,272 @@
+#include "nrs.hpp"
+#include "udf.hpp"
+
+namespace tombo
+{
+occa::memory pressureSolve(nrs_t* nrs, dfloat time)
+{
+  // Builds the pressure right-hand side in o_wrk3, calls ellipticSolve on nrs->pSolver and returns o_wrk1.
+  mesh_t* mesh = nrs->mesh;
+  //enforce Dirichlet BCs (o_wrk6 is pre-filled with the smallest positive normalized dfloat)
+  nrs->fillKernel((1+nrs->NVfields)*nrs->fieldOffset, std::numeric_limits<dfloat>::min(), nrs->o_wrk6);
+  for (int sweep = 0; sweep < 2; sweep++) {
+    nrs->pressureDirichletBCKernel(mesh->Nelements,
+                                   time,
+                                   nrs->fieldOffset,
+                                   mesh->o_sgeo,
+                                   mesh->o_x,
+                                   mesh->o_y,
+                                   mesh->o_z,
+                                   mesh->o_vmapM,
+                                   mesh->o_EToB,
+                                   nrs->o_EToB,
+                                   nrs->o_usrwrk,
+                                   nrs->o_U,
+                                   nrs->o_P,
+                                   nrs->o_wrk6);
+
+    nrs->velocityDirichletBCKernel(mesh->Nelements,
+                                   nrs->fieldOffset,
+                                   time,
+                                   mesh->o_sgeo,
+                                   mesh->o_x,
+                                   mesh->o_y,
+                                   mesh->o_z,
+                                   mesh->o_vmapM,
+                                   mesh->o_EToB,
+                                   nrs->o_EToB,
+                                   nrs->o_usrwrk,
+                                   nrs->o_U,
+                                   nrs->o_wrk7);
+    // NOTE(review): only o_wrk6 (1+NVfields fields) is gather-scattered below although o_wrk7 was written - presumably contiguous buffers; confirm
+    //take care of Neumann-Dirichlet shared edges across elements (ogsMax after sweep 0, ogsMin after sweep 1)
+    if (sweep == 0) oogs::startFinish(nrs->o_wrk6, 1+nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMax, nrs->gsh);
+    if (sweep == 1) oogs::startFinish(nrs->o_wrk6, 1+nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMin, nrs->gsh);
+  }
+
+  if (nrs->pSolver->Nmasked) nrs->maskCopyKernel(nrs->pSolver->Nmasked, 0, nrs->pSolver->o_maskIds,
+                                                 nrs->o_wrk6, nrs->o_P); 
+
+  if (nrs->uvwSolver) {
+    if (nrs->uvwSolver->Nmasked) nrs->maskCopyKernel(nrs->uvwSolver->Nmasked, 0*nrs->fieldOffset, nrs->uvwSolver->o_maskIds,
+                                                     nrs->o_wrk7, nrs->o_U);
+  } else {
+    if (nrs->uSolver->Nmasked) nrs->maskCopyKernel(nrs->uSolver->Nmasked, 0*nrs->fieldOffset, nrs->uSolver->o_maskIds, 
+                                                   nrs->o_wrk7, nrs->o_U);
+    if (nrs->vSolver->Nmasked) nrs->maskCopyKernel(nrs->vSolver->Nmasked, 1*nrs->fieldOffset, nrs->vSolver->o_maskIds, 
+                                                   nrs->o_wrk7, nrs->o_U);
+    if (nrs->wSolver->Nmasked) nrs->maskCopyKernel(nrs->wSolver->Nmasked, 2*nrs->fieldOffset, nrs->wSolver->o_maskIds, 
+                                                   nrs->o_wrk7, nrs->o_U);
+  }
+
+  nrs->curlKernel(mesh->Nelements,
+                  mesh->o_vgeo,
+                  mesh->o_Dmatrices,
+                  nrs->fieldOffset,
+                  nrs->o_Ue,
+                  nrs->o_wrk0);
+
+  oogs::startFinish(nrs->o_wrk0, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh);
+
+  nrs->invMassMatrixKernel(
+    mesh->Nelements,
+    nrs->fieldOffset,
+    nrs->NVfields,
+    mesh->o_vgeo,
+    nrs->mesh->o_invLMM,
+    nrs->o_wrk0);
+
+  nrs->curlKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    nrs->o_wrk0,
+    nrs->o_wrk3);
+
+  nrs->gradientVolumeKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    nrs->o_div,
+    nrs->o_wrk0);
+
+  //if (nrs->options.compareArgs("VARIABLE VISCOSITY", "TRUE"))
+  if(nrs->options.compareArgs("STRESSFORMULATION", "TRUE"))
+    nrs->pressureStressKernel(
+         mesh->Nelements,
+         mesh->o_vgeo,
+         mesh->o_Dmatrices,
+         nrs->fieldOffset,
+         nrs->o_mue,
+         nrs->o_Ue,
+         nrs->o_div,
+         nrs->o_wrk3);
+
+  occa::memory o_irho = nrs->o_ellipticCoeff; // local alias only; passed to pressureRhsKernel below
+  nrs->pressureRhsKernel(
+    mesh->Nelements * mesh->Np,
+    nrs->fieldOffset,
+    nrs->o_mue,
+    o_irho,
+    nrs->o_BF,
+    nrs->o_wrk3,
+    nrs->o_wrk0,
+    nrs->o_wrk6);
+
+  oogs::startFinish(nrs->o_wrk6, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh);
+
+  nrs->invMassMatrixKernel(
+    mesh->Nelements,
+    nrs->fieldOffset,
+    nrs->NVfields,
+    mesh->o_vgeo,
+    nrs->mesh->o_invLMM,
+    nrs->o_wrk6);
+
+  nrs->divergenceVolumeKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    nrs->o_wrk6,
+    nrs->o_wrk3);
+
+  nrs->pressureAddQtlKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    nrs->g0 * nrs->idt,
+    nrs->o_div,
+    nrs->o_wrk3);
+
+  nrs->divergenceSurfaceKernel(
+    mesh->Nelements,
+    mesh->o_sgeo,
+    mesh->o_vmapM,
+    nrs->o_EToB,
+    nrs->g0 * nrs->idt,
+    nrs->fieldOffset,
+    nrs->o_wrk6,
+    nrs->o_U,
+    nrs->o_wrk3);
+
+  oogs::startFinish(nrs->o_wrk3, 1, 0, ogsDfloat, ogsAdd, nrs->gsh);
+
+  nrs->o_wrk1.copyFrom(nrs->o_P, nrs->Ntotal * sizeof(dfloat)); // o_wrk1 starts as a copy of the current o_P
+  nrs->NiterP = ellipticSolve(nrs->pSolver, nrs->o_wrk3, nrs->o_wrk1); // return value is kept in nrs->NiterP
+
+  return nrs->o_wrk1;
+}
+
+occa::memory velocitySolve(nrs_t* nrs, dfloat time)
+{
+  // Builds the velocity right-hand side in o_wrk3, calls ellipticSolve on the velocity solver(s) and returns o_wrk0.
+  mesh_t* mesh = nrs->mesh;
+  dfloat scale = -1./3;
+  if(nrs->options.compareArgs("STRESSFORMULATION", "TRUE")) scale = 2./3;
+
+#if 0
+  nrs->PQKernel(
+       mesh->Nelements*mesh->Np,
+       -scale,
+       nrs->o_mue,
+       nrs->o_div,
+       nrs->o_P,
+       nrs->o_wrk3); 
+
+  nrs->gradientVolumeKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    nrs->o_wrk3,
+    nrs->o_wrk0);
+#else
+  nrs->mueDivKernel(
+       mesh->Nelements*mesh->Np,
+       scale,
+       nrs->o_mue,
+       nrs->o_div,
+       nrs->o_wrk3); 
+
+  nrs->gradientVolumeKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    nrs->o_wrk3,
+    nrs->o_wrk0);
+
+  nrs->wgradientVolumeKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    nrs->o_P,
+    nrs->o_wrk3); 
+
+  nrs->scaledAddKernel(
+    nrs->NVfields*nrs->fieldOffset,
+    1.0,
+    0*nrs->fieldOffset,
+    nrs->o_wrk3,
+    -1.0,
+    0*nrs->fieldOffset,
+    nrs->o_wrk0);
+#endif
+
+  nrs->velocityNeumannBCKernel(
+       mesh->Nelements,
+       nrs->fieldOffset,
+       mesh->o_sgeo,
+       mesh->o_vmapM,
+       mesh->o_EToB,
+       nrs->o_EToB,
+       time,
+       mesh->o_x,
+       mesh->o_y,
+       mesh->o_z,
+       nrs->o_usrwrk,
+       nrs->o_U,
+       nrs->o_wrk0); 
+
+  nrs->velocityRhsKernel(
+    mesh->Nelements,
+    nrs->fieldOffset,
+    nrs->o_BF,
+    nrs->o_wrk0,
+    nrs->o_rho,
+    nrs->o_wrk3);
+
+  oogs::startFinish(nrs->o_wrk3, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh);
+
+  if(nrs->options.compareArgs("VELOCITY INITIAL GUESS DEFAULT", "EXTRAPOLATION")) { 
+    nrs->o_wrk0.copyFrom(nrs->o_Ue, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); // start from o_Ue ...
+    if (nrs->uvwSolver) { // ... then maskCopyKernel is applied with o_U and o_wrk0 for the masked ids
+      if (nrs->uvwSolver->Nmasked) nrs->maskCopyKernel(nrs->uvwSolver->Nmasked, 0*nrs->fieldOffset, nrs->uvwSolver->o_maskIds,
+                                                       nrs->o_U, nrs->o_wrk0);
+    } else {
+      if (nrs->uSolver->Nmasked) nrs->maskCopyKernel(nrs->uSolver->Nmasked, 0*nrs->fieldOffset, nrs->uSolver->o_maskIds,
+                                                     nrs->o_U, nrs->o_wrk0);
+      if (nrs->vSolver->Nmasked) nrs->maskCopyKernel(nrs->vSolver->Nmasked, 1*nrs->fieldOffset, nrs->vSolver->o_maskIds,
+                                                     nrs->o_U, nrs->o_wrk0);
+      if (nrs->wSolver->Nmasked) nrs->maskCopyKernel(nrs->wSolver->Nmasked, 2*nrs->fieldOffset, nrs->wSolver->o_maskIds,
+                                                     nrs->o_U, nrs->o_wrk0);
+    }
+  } else {
+    nrs->o_wrk0.copyFrom(nrs->o_U, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); // otherwise start from o_U
+  }
+  // NOTE(review): the split path solves into o_wrk1/o_wrk2 (rhs o_wrk4/o_wrk5) but only o_wrk0 is returned - presumably contiguous buffers; confirm
+  if(nrs->uvwSolver) {
+    nrs->NiterU = ellipticSolve(nrs->uvwSolver, nrs->o_wrk3, nrs->o_wrk0);
+  } else {
+    nrs->NiterU = ellipticSolve(nrs->uSolver, nrs->o_wrk3, nrs->o_wrk0);
+    nrs->NiterV = ellipticSolve(nrs->vSolver, nrs->o_wrk4, nrs->o_wrk1);
+    nrs->NiterW = ellipticSolve(nrs->wSolver, nrs->o_wrk5, nrs->o_wrk2);
+  }
+
+  return nrs->o_wrk0;
+}
+
+} // namespace
diff --git a/src/lns/tombo.hpp b/src/lns/tombo.hpp
new file mode 100644
index 000000000..666a9d38b
--- /dev/null
+++ b/src/lns/tombo.hpp
@@ -0,0 +1,12 @@
+#if !defined(nekrs_tombo_hpp_)
+#define nekrs_tombo_hpp_
+
+#include "nrs.hpp"
+
+namespace tombo
+{
+occa::memory pressureSolve(nrs_t* nrs, dfloat time); // runs the pressure ellipticSolve; returns nrs->o_wrk1
+occa::memory velocitySolve(nrs_t* nrs, dfloat time); // runs the velocity ellipticSolve(s); returns nrs->o_wrk0
+}
+
+#endif
diff --git a/src/main.cpp b/src/main.cpp
index 007287063..9102b4ad6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -67,9 +67,11 @@
 #include <cstring>
 #include <getopt.h>
 #include <cfenv>
+#include <limits>
+#include <math.h>
 #include <unistd.h>
+
 #include "nekrs.hpp"
-#include <parAlmond.hpp>
 
 #define DEBUG
 
@@ -93,8 +95,8 @@ int main(int argc, char** argv)
   {
     int request = MPI_THREAD_SINGLE;
     const char* env_val = std::getenv ("NEKRS_MPI_THREAD_MULTIPLE");
-    if(env_val)
-      if(std::stoi(env_val)) request = MPI_THREAD_MULTIPLE;
+    if (env_val)
+      if (std::stoi(env_val)) request = MPI_THREAD_MULTIPLE;
 
     int provided;
     int retval =  MPI_Init_thread(&argc, &argv, request, &provided);
@@ -117,8 +119,10 @@ int main(int argc, char** argv)
     }
     MPI_Barrier(comm);
   }
+  if (cmdOpt->debug) feraiseexcept(FE_ALL_EXCEPT);
 
-  if(cmdOpt->debug) feraiseexcept(FE_ALL_EXCEPT);
+  MPI_Barrier(comm);
+  double elapsedTime = MPI_Wtime();
 
   std::string cacheDir;
   nekrs::setup(comm, cmdOpt->buildOnly, cmdOpt->sizeTarget,
@@ -131,42 +135,67 @@ int main(int argc, char** argv)
     return EXIT_SUCCESS;
   }
 
-
   const int runTimeStatFreq = 500;
-  const int outputStep = nekrs::outputStep();
-  const int NtimeSteps = nekrs::NtimeSteps();
-  const double startTime = nekrs::startTime();
-  const double finalTime = nekrs::finalTime();
-
-  if (rank == 0) std::cout << "\nstarting time loop" << "\n";
-
-  double time = startTime;
-  int tStep = 1;
+  const int writeControlRunTime = nekrs::writeControlRunTime();
+
+  int tStep = 0;
+  double time = nekrs::startTime();
+  double outputTime = -1;
+  if (writeControlRunTime) outputTime = time + nekrs::writeInterval();
+  int lastStep = nekrs::lastStep(time, tStep, elapsedTime);
+
+  if (rank == 0 && !lastStep) {
+    if (nekrs::endTime() > nekrs::startTime()) 
+      std::cout << "\ntimestepping to time " << nekrs::endTime() << " ...\n";
+    else
+      std::cout << "\ntimestepping for " << nekrs::numSteps() << " steps ...\n";
+  }
   MPI_Pcontrol(1);
-  while ((finalTime - time) / finalTime > 1e-6 * nekrs::dt()) {
-    nekrs::runStep(time, nekrs::dt(), tStep);
-    time += nekrs::dt();
-
-    int isOutputStep = 0;
-    if (outputStep > 0)
-      if (tStep % outputStep == 0 || tStep == NtimeSteps) isOutputStep = 1;
-
-    nekrs::udfExecuteStep(time, tStep, isOutputStep);
-    if (isOutputStep) {
-      nekrs::copyToNek(time, tStep);
-      nekrs::nekOutfld();
+  while (!lastStep) {
+    MPI_Barrier(comm);
+    elapsedTime += (MPI_Wtime() - elapsedTime);
+    ++tStep;
+    lastStep = nekrs::lastStep(time, tStep, elapsedTime);
+
+    double dt; 
+    if (lastStep && nekrs::endTime() > 0) 
+      dt = nekrs::endTime() - time;
+    else
+      dt = nekrs::dt();
+
+    nekrs::runStep(time, dt, tStep);
+    time += dt;
+
+    int outputStep = 0;
+    if (writeControlRunTime) { 
+      outputStep = (time >= outputTime);
+    } else {
+      if (nekrs::writeInterval() > 0) outputStep = (tStep%(int)nekrs::writeInterval() == 0);
     }
+    if (nekrs::writeInterval() == 0) outputStep = 0;
+    if (lastStep) outputStep = 1;
+    if (nekrs::writeInterval() < 0) outputStep = 0;
 
-    if (tStep && tStep % runTimeStatFreq == 0 || tStep == NtimeSteps) nekrs::printRuntimeStatistics();
+    nekrs::udfExecuteStep(time, tStep, outputStep);
 
-    ++tStep;
+    if (outputStep) {
+      nekrs::outfld(time, outputTime);
+      if (writeControlRunTime) outputTime += nekrs::writeInterval();
+    }
+
+    if (tStep%runTimeStatFreq == 0 || lastStep) nekrs::printRuntimeStatistics();
   }
   MPI_Pcontrol(0);
 
-  if(rank == 0) std::cout << "\nEnd." << "\n";
+  MPI_Barrier(comm);
+  elapsedTime += (MPI_Wtime() - elapsedTime);
+  if (rank == 0) {
+    std::cout << "elapsedTime: " << elapsedTime << " s\n";
+    std::cout << "End\n";
+  }
+  fflush(stdout);
 
   MPI_Finalize();
-  fflush(stdout);
   return EXIT_SUCCESS;
 }
 
diff --git a/src/libP/src/gsParallelGatherScatter.c b/src/mesh/gsParallelGatherScatter.cpp
similarity index 100%
rename from src/libP/src/gsParallelGatherScatter.c
rename to src/mesh/gsParallelGatherScatter.cpp
diff --git a/src/libP/src/gsParallelGatherScatterSetup.c b/src/mesh/gsParallelGatherScatterSetup.cpp
similarity index 99%
rename from src/libP/src/gsParallelGatherScatterSetup.c
rename to src/mesh/gsParallelGatherScatterSetup.cpp
index b32758036..c1cc7a678 100644
--- a/src/libP/src/gsParallelGatherScatterSetup.c
+++ b/src/mesh/gsParallelGatherScatterSetup.cpp
@@ -32,8 +32,7 @@
 #include <string.h>
 
 #include "gslib.h"
-
-#include "types.h"
+#include "nrssys.h"
 
 void* gsParallelGatherScatterSetup(MPI_Comm meshComm,
                                    dlong NuniqueBases,
diff --git a/src/mesh/mesh.h b/src/mesh/mesh.h
new file mode 100644
index 000000000..cb2522ff0
--- /dev/null
+++ b/src/mesh/mesh.h
@@ -0,0 +1,627 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#ifndef MESH_H
+#define MESH_H 1
+
+#include <unistd.h>
+#include <assert.h>
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "nrssys.hpp"
+
+#define TRIANGLES 3
+#define QUADRILATERALS 4
+#define TETRAHEDRA 6
+#define HEXAHEDRA 12
+
+extern "C" { // Start C linkage
+typedef struct
+{
+  MPI_Comm comm;
+  int rank, size; // MPI rank and size (process count)
+
+  int dim;
+  int Nverts, Nfaces, NfaceVertices;
+
+  int cht;
+
+  hlong Nnodes;
+  dfloat* EX; // coordinates of vertices for each element
+  dfloat* EY;
+  dfloat* EZ;
+
+  dlong Nelements;
+  hlong* EToV; // element-to-vertex connectivity
+  dlong* EToE; // element-to-element connectivity
+  int* EToF;   // element-to-(local)face connectivity
+  int* EToP;   // element-to-partition/process connectivity
+  int* EToB;   // element-to-boundary condition type
+
+  hlong* elementInfo; //type of element
+
+  // boundary faces
+  hlong NboundaryFaces; // number of boundary faces
+  hlong* boundaryInfo; // list of boundary faces (type, vertex-1, vertex-2, vertex-3)
+
+  // MPI halo exchange info
+  dlong totalHaloPairs;   // number of elements to be sent in halo exchange
+  dlong* haloElementList; // sorted list of elements to be sent in halo exchange
+  int* NhaloPairs;      // number of elements worth of data to send/recv
+  int NhaloMessages;      // number of messages to send
+
+  dlong* haloGetNodeIds; // volume node ids of outgoing halo nodes
+  dlong* haloPutNodeIds; // volume node ids of incoming halo nodes
+
+  void* haloSendRequests;
+  void* haloRecvRequests;
+
+  dlong NinternalElements; // number of elements that can update without halo exchange
+  dlong NnotInternalElements; // number of elements that cannot update without halo exchange
+
+  // CG gather-scatter info
+  hlong* globalIds;
+  hlong* maskedGlobalIds;
+  void* gsh, * hostGsh; // gslib struct pointer
+  ogs_t* ogs; //occa gs pointer
+
+  // list of elements that are needed for global gather-scatter
+  dlong NglobalGatherElements;
+  dlong* globalGatherElementList;
+  occa::memory o_globalGatherElementList;
+
+  // list of elements that are not needed for global gather-scatter
+  dlong NlocalGatherElements;
+  dlong* localGatherElementList;
+  occa::memory o_localGatherElementList;
+
+  //list of face pairs
+  dlong NfacePairs;
+  dlong* EToFPairs;
+  dlong* FPairsToE;
+  int* FPairsToF;
+
+  // NBN: streams / command queues
+  occa::stream stream0, stream1;
+
+  // volumeGeometricFactors;
+  dlong Nvgeo;
+  dfloat* vgeo;
+
+  // second order volume geometric factors
+  dlong Nggeo;
+  dfloat* ggeo;
+
+  // volume node info
+  int N, Np;
+  dfloat* r, * s, * t;    // coordinates of local nodes
+  dfloat* Dr, * Ds, * Dt; // collocation differentiation matrices
+  dfloat* Dmatrices;
+  dfloat* MM, * invMM;           // reference mass matrix
+  dfloat* LMM, * invLMM;
+  dfloat* Srr,* Srs, * Srt; //element stiffness matrices
+  dfloat* Ssr,* Sss, * Sst;
+  dfloat* Str,* Sts, * Stt;
+  dfloat* Smatrices;
+  int maxNnzPerRow;
+  dfloat* x, * y, * z;    // coordinates of physical nodes
+
+  dfloat sphereRadius;  // for Quad3D
+
+  dfloat volume;
+
+  // indices of vertex nodes
+  int* vertexNodes;
+
+  // quad specific quantity
+  int Nq, NqP, NpP;
+
+  dfloat* D; // 1D differentiation matrix (for tensor-product)
+  dfloat* DW; // weak 1D differentiation matrix (for tensor-product)
+  dfloat* gllz; // 1D GLL quadrature nodes
+  dfloat* gllw; // 1D GLL quadrature weights
+
+  int gjNq;
+  dfloat* gjr,* gjw; // 1D nodes and weights for Gauss Jacobi quadrature
+  dfloat* gjI,* gjD; // 1D GLL to Gauss node interpolation and differentiation matrices
+  dfloat* gjD2;     // 1D GJ to GJ node differentiation
+
+  // transform to/from eigenmodes of 1D laplacian (with built in weighting)
+  dfloat* oasForward;
+  dfloat* oasBack;
+  dfloat* oasDiagOp;
+
+  // transform to/from eigenmode of IPDG 1D laplacian
+  dfloat* oasForwardDg;
+  dfloat* oasBackDg;
+  dfloat* oasDiagOpDg;
+
+  //rotated node ids
+  int* rmapP;
+
+  //reference patch inverse (for OAS precon)
+  dfloat* invAP;
+
+  // face node info
+  int Nfp;        // number of nodes per face
+  int* faceNodes; // list of element reference interpolation nodes on element faces
+  dlong* vmapM;     // list of volume nodes that are face nodes
+  dlong* vmapP;     // list of volume nodes that are paired with face nodes
+  dlong* mapP;     // list of surface nodes that are paired with -ve surface  nodes
+  int* faceVertices; // list of mesh vertices on each face
+
+  dfloat* LIFT; // lift matrix
+  dfloat* FMM;  // Face Mass Matrix
+  dfloat* sMT; // surface mass (MM*LIFT)^T
+
+  dlong Nsgeo;
+  dfloat* sgeo;
+
+  // field info for PDE solver
+  int Nfields;
+  dfloat* q;    // solution data array
+  dfloat* fQM, * fQP; //solution trace arrays
+  dfloat* rhsq, * rhsq2, * rhsq3; // right hand side data array
+  dfloat* resq; // residual data array (for LSERK time-stepping)
+
+  dfloat Lambda2; // square of penalty parameter used in constructing q^*
+
+  // cubature
+  int cubNp, cubNfp, cubNq;
+  dfloat* cubr, * cubs, * cubt, * cubw; // coordinates and weights of local cubature nodes
+  dfloat* cubx, * cuby, * cubz;    // coordinates of physical nodes
+  dfloat* cubInterp; // interpolate from W&B to cubature nodes
+  dfloat* cubProject; // projection matrix from cubature nodes to W&B nodes
+  dfloat* cubD;       // 1D differentiation matrix
+  dfloat* cubDiffInterp;     // 1D weak differentiation matrix
+  dfloat* cubDW;     // 1D weak differentiation matrix
+  dfloat* cubDrW;    // 'r' weak differentiation matrix
+  dfloat* cubDsW;    // 's' weak differentiation matrix
+  dfloat* cubDtW;    // 't' weak differentiation matrix
+  dfloat* cubDWmatrices;
+
+  dfloat* cubvgeo;  //volume geometric data at cubature points
+  dfloat* cubsgeo;  //surface geometric data at cubature points
+  dfloat* cubggeo;  //second type volume geometric data at cubature points
+
+  // c2 at cubature points (for wadg)
+  dfloat* c2;
+
+  //source injection
+  dfloat* sourceq;
+  dfloat sourceX0, sourceY0, sourceZ0, sourceT0, sourceC2, sourceFreq;
+  int sourceNelements;
+  dlong* MRABsourceNelements;
+  dlong* sourceElements;
+
+  // surface integration node info
+  int intNfp;       // number of integration nodes on each face
+  dfloat* intInterp; // interp from surface node to integration nodes
+  dfloat* intLIFT;   // lift from surface integration nodes to W&B volume nodes
+  dfloat* intx, * inty, * intz; // coordinates of surface integration nodes
+
+  // Bernstein-Bezier info
+  dfloat* VB, * invVB; // Bernstein Vandermonde matrices
+  dfloat* BBMM;
+  dfloat* invVB1D, * invVB2D;
+  int* D0ids, * D1ids, * D2ids, * D3ids; // Bernstein deriv matrix indices
+  dfloat* Dvals; // Bernstein deriv matrix values
+  int* D0Tids, * D1Tids, * D2Tids, * D3Tids; // Bernstein transpose deriv matrix indices
+  dfloat* DTvals; // Bernstein transpose deriv matrix values
+  dfloat* VBq, * PBq; // cubature interpolation/projection matrices
+  int* L0ids; // L0 matrix ids
+  dfloat* L0vals; // L0 values (L0 tridiagonal in 2D)
+  int* ELids; // lift reduction matrix indices
+  dfloat* ELvals; // lift reduction matrix values
+  int max_EL_nnz; // max number of non-zeros per row of EL
+  int* BBRaiseids; //Bernstein elevate matrix indices
+  dfloat* BBRaiseVals; //Bernstein elevate matrix values
+  dfloat* BBLower; //Bernstein projection matrix.
+
+  //degree raising and lowering interpolation matrices
+  dfloat* interpRaise;
+  dfloat* interpLower;
+
+  //sparse basis info
+  dfloat* sparseV, * invSparseV;
+  dfloat* sparseMM;
+  int* FaceModes;
+  int SparseNnzPerRow;
+  int SparseNnzPerRowNonPadded;
+  int* sparseStackedNZ;
+  dfloat* sparseSrrT;
+  dfloat* sparseSrsT;
+  dfloat* sparseSssT;
+  int* Ind;
+
+  dlong* mmapM, * mmapP;
+  int* mmapS;
+  dfloat* mapSgn;
+
+  // MRAB,SAAB coefficients
+  dfloat mrab[3], mrabb[3], saab[3], saabexp; // AK: deprecated
+  int MRABNlevels;
+  int* MRABlevel;
+  dlong* MRABNelements, * MRABNhaloElements;
+  dlong** MRABelementIds, ** MRABhaloIds;
+  int* MRABshiftIndex;
+
+  dlong* MRABpmlNelements, * MRABpmlNhaloElements;
+  dlong** MRABpmlElementIds, ** MRABpmlIds;
+  dlong** MRABpmlHaloElementIds, ** MRABpmlHaloIds;
+
+  dlong pmlNelements, nonPmlNelements;
+  dlong* nonPmlElementIds, * pmlElementIds, * pmlIds;
+  int shiftIndex;
+
+  dfloat dtfactor; //Delete later for script run
+  dfloat maxErrorBoltzmann;
+
+  dfloat* errtmp;
+  dfloat rkC[7], rkA[7 * 7], rkE[7];
+
+  occa::memory o_rkq, o_rkrhsq, o_rkerr; // deprecated, AK.
+  occa::memory o_errtmp;
+  occa::memory o_rkA, o_rkE;
+
+  // ploting info for generating field vtu
+  int plotNverts;       // number of vertices for each plot element
+  int plotNp;           // number of plot nodes per element
+  int plotNelements;    // number of "plot elements" per element
+  int* plotEToV;        // triangulation of plot nodes
+  dfloat* plotR, * plotS, * plotT; // coordinates of plot nodes in reference element
+  dfloat* plotInterp;    // warp & blend to plot node interpolation matrix
+
+  int* contourEToV;
+  dfloat* contourVX, * contourVY, * contourVZ;
+  dfloat* contourInterp, * contourInterp1, * contourFilter;
+
+  //SEMFEM data
+  int NpFEM, NelFEM;
+  int* FEMEToV;
+  dfloat* rFEM, * sFEM, * tFEM;
+  dfloat* SEMFEMInterp;
+
+  occa::memory o_SEMFEMInterp;
+  occa::memory o_SEMFEMAnterp;
+
+  // Boltzmann specific stuff
+  dfloat RT, sqrtRT, tauInv, Ma, Re; // Deprecated: AK
+
+  // pml stuff
+  int pmlNfields;
+  //  dlong    pmlNelements; // deprecated
+  dlong* pmlElementList;   // deprecated
+
+  int Ntscale; // Will be removed, for time accuracy test
+
+  dfloat* invTau; // deprecated in Boltzmann
+
+  // Probe Data
+  int probeN, probeNTotal;
+  dfloat* probeR, * probeS, * probeT;
+  // dfloat *probeX, *probeY, *probeZ;
+  dlong* probeElementIds, * probeIds;
+  dfloat* probeI;
+
+  // occa stuff
+  occa::device device;
+
+  occa::stream defaultStream;
+  occa::stream dataStream;
+  occa::stream computeStream;
+
+  occa::memory o_q, o_rhsq, o_resq, o_fQM, o_fQP;
+
+  occa::memory o_Dr, o_Ds, o_Dt, o_LIFT, o_MM, o_invMM, o_MMPfloat;
+  occa::memory o_DrT, o_DsT, o_DtT, o_LIFTT;
+  occa::memory o_LMM, o_invLMM;
+  occa::memory o_Dmatrices;
+  occa::memory o_DmatricesPfloat;
+  occa::memory o_FMMT;
+  occa::memory o_sMT;
+
+  occa::memory o_D; // tensor product differentiation matrix (for Hexes)
+  occa::memory o_DW; // tensor product differentiation matrix (for Hexes)
+  occa::memory o_SrrT, o_SrsT, o_SrtT; //element stiffness matrices
+  occa::memory o_SsrT, o_SssT, o_SstT;
+  occa::memory o_Srr, o_Srs, o_Srt, o_Sss, o_Sst, o_Stt; // for char4-based kernels
+  occa::memory o_Smatrices;
+  occa::memory o_SmatricesPfloat;
+  occa::memory o_IndT, o_IndTchar;
+  occa::memory o_India, o_Indja;
+  occa::memory o_StrT, o_StsT, o_SttT;
+  occa::memory o_Ind; // for sparse index storage
+
+  occa::memory o_vgeo, o_sgeo;
+  occa::memory o_vmapM, o_vmapP, o_mapP;
+
+  occa::memory o_rmapP;
+
+  occa::memory o_EToE, o_EToF, o_EToB, o_x, o_y, o_z;
+
+  occa::memory o_EToFPairs, o_FPairsToE, o_FPairsToF;
+
+  // cubature (for wadg)
+  occa::memory o_intLIFTT, o_intInterpT, o_intx, o_inty, o_intz;
+  occa::memory o_cubDWT, o_cubD;
+  occa::memory o_cubDrWT, o_cubDsWT, o_cubDtWT, o_cubDiffInterpT;
+  occa::memory o_cubDWmatrices;
+  occa::memory o_cubInterpT, o_cubProjectT;
+  occa::memory o_invMc; // for comparison: inverses of weighted mass matrices
+
+  occa::memory o_cubvgeo, o_cubsgeo, o_cubggeo;
+
+  occa::memory o_c2;
+
+  //MRAB element lists
+  occa::memory* o_MRABelementIds;
+  occa::memory* o_MRABhaloIds;
+  occa::memory* o_MRABpmlElementIds;
+  occa::memory* o_MRABpmlIds;
+  occa::memory* o_MRABpmlHaloElementIds;
+  occa::memory* o_MRABpmlHaloIds;
+
+  // DG halo exchange info
+  occa::memory o_haloElementList;
+  occa::memory o_haloBuffer;
+  occa::memory o_haloGetNodeIds;
+  occa::memory o_haloPutNodeIds;
+
+  occa::memory o_internalElementIds;
+  occa::memory o_notInternalElementIds;
+
+  // Bernstein-Bezier occa arrays
+  occa::memory o_BBMM;
+  occa::memory o_D0ids, o_D1ids, o_D2ids, o_D3ids, o_Dvals; // Bernstein deriv matrix indices
+  occa::memory o_packedDids; // char4 packed increments (D1ids-D0ids)
+
+  occa::memory o_invVB1DT, o_invVB2DT;
+  occa::memory o_VBq, o_PBq; // cubature interpolation/projection matrices
+  occa::memory o_L0ids, o_L0vals, o_ELids, o_ELvals;
+
+  /* sparse basis occa arrays */
+  occa::memory o_sparseStackedNZ;
+  occa::memory o_sparseSrrT;
+  occa::memory o_sparseSrsT;
+  occa::memory o_sparseSssT;
+  occa::memory o_mapSgn;
+
+  // pml vars
+  occa::memory o_sigmax, o_sigmay, o_sigmaz; // AK: deprecated
+
+  occa::memory o_pmlElementIds;
+  occa::memory o_nonPmlElementIds;
+  occa::memory o_pmlIds;
+
+  occa::memory o_pmlElementList;
+
+  occa::memory o_ggeo; // second order geometric factors
+  occa::memory o_ggeoPfloat; // second order geometric factors
+  occa::memory o_projectL2; // local weights for projection.
+
+  occa::kernel volumeKernel;
+  occa::kernel surfaceKernel;
+  occa::kernel updateKernel;
+  occa::kernel traceUpdateKernel;
+  occa::kernel haloExtractKernel;
+  occa::kernel partialSurfaceKernel;
+  occa::kernel haloGetKernel;
+  occa::kernel haloPutKernel;
+
+  // Just for test will be deleted after temporal testsAK
+  occa::kernel RKupdateKernel;
+  occa::kernel RKpmlUpdateKernel;
+
+  occa::kernel gatherKernel;
+  occa::kernel scatterKernel;
+  occa::kernel gatherScatterKernel;
+
+  occa::kernel getKernel;
+  occa::kernel putKernel;
+
+  occa::kernel sumKernel;
+  occa::kernel addScalarKernel;
+
+  occa::kernel AxKernel;
+  occa::kernel innerProductKernel;
+  occa::kernel weightedInnerProduct1Kernel;
+  occa::kernel weightedInnerProduct2Kernel;
+  occa::kernel scaledAddKernel;
+  occa::kernel dotMultiplyKernel;
+  occa::kernel dotDivideKernel;
+
+  occa::kernel gradientKernel;
+  occa::kernel ipdgKernel;
+
+  occa::kernel maskKernel;
+  occa::kernel maskPfloatKernel;
+
+  // Boltzmann Specific Kernels
+  occa::kernel relaxationKernel;
+  occa::kernel pmlRelaxationKernel;
+}mesh_t;
+
+// serial sort
+void mysort(hlong* data, int N, const char* order);
+
+// sort entries in an array in parallel
+void parallelSort(int size, int rank, MPI_Comm comm,
+                  int N, void* vv, size_t sz,
+                  int (* compare)(const void*, const void*),
+                  void (* match)(void*, void*)
+                  );
+
+#define mymax(a,b) (((a) > (b))?(a):(b))
+#define mymin(a,b) (((a) < (b))?(a):(b))
+
+/* dimension independent mesh operations */
+void meshConnect(mesh_t* mesh);
+
+/* build parallel face connectivity */
+void meshParallelConnect(mesh_t* mesh);
+
+/* build global connectivity in parallel */
+void meshParallelConnectNodes(mesh_t* mesh, int nrsBuildOnly);
+
+void meshHaloSetup(mesh_t* mesh);
+
+/* extract whole elements for the halo exchange */
+void meshHaloExtract(mesh_t* mesh, size_t Nbytes, void* sourceBuffer, void* haloBuffer);
+
+void meshHaloExchange(mesh_t* mesh,
+                      size_t Nbytes, // message size per element
+                      void* sourceBuffer,
+                      void* sendBuffer, // temporary buffer
+                      void* recvBuffer);
+
+void meshHaloExchangeStart(mesh_t* mesh,
+                           size_t Nbytes, // message size per element
+                           void* sendBuffer, // temporary buffer
+                           void* recvBuffer);
+
+void meshHaloExchangeFinish(mesh_t* mesh);
+
+void meshHaloExchangeBlocking(mesh_t* mesh,
+                              size_t Nbytes, // message size per element
+                              void* sendBuffer, // temporary buffer
+                              void* recvBuffer);
+
+// print out parallel partition information
+void meshPartitionStatistics(mesh_t* mesh);
+
+// build element-boundary connectivity
+void meshConnectBoundary(mesh_t* mesh);
+
+void meshParallelGatherScatterSetup(mesh_t* mesh,
+                                    dlong N,
+                                    hlong* globalIds,
+                                    MPI_Comm &comm,
+                                    int verbose);
+
+// generic mesh setup
+mesh_t* meshSetup(char* filename, int N, setupAide &options);
+void meshFree(mesh_t*);
+
+void occaTimerTic(occa::device device,std::string name);
+void occaTimerToc(occa::device device,std::string name);
+
+extern "C"
+{
+void* xxtSetup(uint num_local_rows,
+               void* row_ids,
+               uint nnz,
+               void*   A_i,
+               void*   A_j,
+               void* A_vals,
+               int null_space,
+               const char* inttype,
+               const char* floattype);
+
+void xxtSolve(void* x,
+              void* A,
+              void* rhs);
+
+void xxtFree(void* A);
+}
+
+extern "C"
+{
+void dgesv_ ( int* N, int* NRHS, double* A,
+              int* LDA,
+              int* IPIV,
+              double* B,
+              int* LDB,
+              int* INFO );
+
+// void dgemm_(const char *TRANSA, const char *TRANSB, const int *M,
+//             const int *N, const int *K, double *ALPHA, double *A, const int *LDA, double *B,
+//             const int *LDB, double *BETA, double *C, const int *LDC);
+
+void dgemm_ (char*, char*, int*, int*, int*,
+             const dfloat*, const dfloat* __restrict, int*,
+             const dfloat* __restrict, int*,
+             const dfloat*, dfloat* __restrict, int*);
+
+void sgesv_(int* N, int* NRHS,float* A, int* LDA, int* IPIV, float* B, int* LDB,int* INFO);
+
+void dgetrf_(int* M, int* N, double* A, int* lda, int* IPIV, int* INFO);
+void dgetri_(int* N, double* A, int* lda, int* IPIV, double* WORK, int* lwork, int* INFO);
+void dgeev_(char* JOBVL, char* JOBVR, int* N, double* A, int* LDA, double* WR, double* WI,
+            double* VL, int* LDVL, double* VR, int* LDVR, double* WORK, int* LWORK, int* INFO );
+
+double dlange_(char* NORM, int* M, int* N, double* A, int* LDA, double* WORK);
+void dgecon_(char* NORM, int* N, double* A, int* LDA, double* ANORM,
+             double* RCOND, double* WORK, int* IWORK, int* INFO );
+}
+
+void meshApplyElementMatrix(mesh_t* mesh, dfloat* A, dfloat* q, dfloat* Aq);
+void meshApplyVectorElementMatrix(mesh_t* mesh, int Nfield, const dlong offset, dfloat* A, dfloat* q, dfloat* Aq);
+
+void meshRecursiveSpectralBisectionPartition(mesh_t* mesh);
+
+void matrixInverse(int N, dfloat* A);
+dfloat matrixConditionNumber(int N, dfloat* A);
+
+#if 0
+void* occaHostMallocPinned(occa::device &device, size_t size, void* source, occa::memory &mem);
+#else
+void* occaHostMallocPinned(occa::device &device, size_t size, void* source, occa::memory &mem, occa::memory &h_mem);
+#endif
+
+void matrixRightSolve(int NrowsA, int NcolsA, dfloat* A, int NrowsB, int NcolsB, dfloat* B, dfloat* C);
+void matrixEig(int N, dfloat* A, dfloat* VR, dfloat* WR, dfloat* WI);
+void matrixTranspose(const int M, const int N,
+                     const dfloat* A, const int LDA,
+                     dfloat* AT, const int LDAT);
+
+// 1D mesh basis functions
+void Nodes1D(int _N, dfloat* _r);
+void EquispacedNodes1D(int _N, dfloat* _r);
+void OrthonormalBasis1D(dfloat a, int i, dfloat* P);
+void GradOrthonormalBasis1D(dfloat a, int i, dfloat* Pr);
+void Vandermonde1D(int _N, int Npoints, dfloat* _r, dfloat* V);
+void GradVandermonde1D(int _N, int Npoints, dfloat* _r, dfloat* Vr);
+void MassMatrix1D(int _Np, dfloat* V, dfloat* _MM);
+void Dmatrix1D(int _N, int NpointsIn, dfloat* _rIn,
+               int NpointsOut, dfloat* _rOut, dfloat* _Dr);
+void DWmatrix1D(int _N, dfloat* _D, dfloat* _DT);
+
+void InterpolationMatrix1D(int _N,
+                           int NpointsIn, dfloat* rIn,
+                           int NpointsOut, dfloat* rOut,
+                           dfloat* I);
+void DegreeRaiseMatrix1D(int Nc, int Nf, dfloat* P);
+void CubatureWeakDmatrix1D(int _Nq, int _cubNq,
+                           dfloat* _cubProject, dfloat* _cubD, dfloat* _cubPDT);
+dfloat JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N);
+dfloat GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N);
+void JacobiGLL(int _N, dfloat* _x, dfloat* _w = nullptr);
+void JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat* _x, dfloat* _w);
+} // end C Linkage
+#endif
diff --git a/src/mesh/mesh3D.h b/src/mesh/mesh3D.h
new file mode 100644
index 000000000..7348458e4
--- /dev/null
+++ b/src/mesh/mesh3D.h
@@ -0,0 +1,224 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#ifndef MESH3D_H
+#define MESH3D_H 1
+
+// generic mesh structure
+#include "mesh.h"
+
+extern "C" { // Begin C Linkage
+#define mesh3D mesh_t
+
+// mesh readers
+mesh3D* meshParallelReaderTri3D(char* fileName);
+mesh3D* meshParallelReaderQuad3D(char* fileName);
+mesh3D* meshParallelReaderTet3D(char* fileName);
+mesh3D* meshParallelReaderHex3D(char* fileName);
+
+// build connectivity in serial
+void meshConnect3D(mesh3D* mesh);
+
+// build element-boundary connectivity
+void meshConnectBoundary3D(mesh3D* mesh);
+
+// build connectivity in parallel
+void meshParallelConnect3D(mesh3D* mesh);
+
+// repartition elements in parallel
+void meshGeometricPartition3D(mesh3D* mesh);
+
+// print out mesh
+void meshPrint3D(mesh3D* mesh);
+
+// print out mesh in parallel from the root process
+void meshParallelPrint3D(mesh3D* mesh);
+
+// print out mesh partition in parallel
+void meshVTU3D(mesh3D* mesh, char* fileName);
+
+// print out mesh field
+void meshPlotVTU3D(mesh3D* mesh, char* fileNameBase, int fld);
+void meshPlotContour3D(mesh_t* mesh, char* fname, dfloat* u, int Nlevels, dfloat* levels);
+void meshPlotAdaptiveContour3D(mesh_t* mesh, char* fname, dfloat* u, int Nlevels, dfloat* levels, dfloat tol);
+
+// compute geometric factors for local to physical map
+void meshGeometricFactorsTri3D(mesh3D* mesh);
+void meshGeometricFactorsQuad3D(mesh3D* mesh);
+void meshGeometricFactorsTet3D(mesh3D* mesh);
+void meshGeometricFactorsHex3D(mesh3D* mesh);
+
+void meshSurfaceGeometricFactorsTri3D(mesh3D* mesh);
+void meshSurfaceGeometricFactorsQuad3D(mesh3D* mesh);
+void meshSurfaceGeometricFactorsTet3D(mesh3D* mesh);
+void meshSurfaceGeometricFactorsHex3D(mesh3D* mesh);
+
+void meshPhysicalNodesTri3D(mesh3D* mesh);
+void meshPhysicalNodesQuad3D(mesh3D* mesh);
+void meshPhysicalNodesTet3D(mesh3D* mesh);
+void meshPhysicalNodesHex3D(mesh3D* mesh, int nrsBuildOnly);
+
+void meshLoadReferenceNodesTet3D(mesh3D* mesh, int N);
+void meshLoadReferenceNodesHex3D(mesh3D* mesh, int N, int cubN);
+
+void meshGradientTet3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz);
+void meshGradientHex3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz);
+
+// print out parallel partition information
+void meshPartitionStatistics3D(mesh3D* mesh);
+
+// default occa set up
+void meshOccaSetup3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo);
+void meshOccaSetupQuad3D(mesh_t* mesh, setupAide &newOptions, occa::properties &kernelInfo);
+void meshOccaSetupTri3D(mesh_t* mesh, setupAide &newOptions, occa::properties &kernelInfo);
+
+void meshOccaPopulateDevice3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo);
+void meshOccaCloneDevice(mesh_t* donorMesh, mesh_t* mesh);
+
+// functions that call OCCA kernels
+void occaTest3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz);
+
+//
+void occaOptimizeGradientTet3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz);
+void occaOptimizeGradientHex3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz);
+
+// serial face-node to face-node connection
+void meshConnectFaceNodes3D(mesh3D* mesh);
+
+//
+mesh3D* meshSetupTri3D(char* filename, int N, dfloat sphereRadius);
+mesh3D* meshSetupQuad3D(char* filename, int N, dfloat sphereRadius);
+mesh3D* meshSetupTet3D(char* filename, int N);
+mesh3D* meshSetupHex3D(char* filename, int N);
+
+void meshParallelConnectNodesHex3D(mesh3D* mesh);
+
+// halo connectivity information
+void meshHaloSetup3D(mesh3D* mesh);
+
+// perform halo exchange
+void meshHaloExchange3D(mesh3D* mesh,
+                        size_t Nbytes, // number of bytes per element
+                        void* sourceBuffer,
+                        void* sendBuffer,
+                        void* recvBuffer);
+
+void meshHaloExchangeStart3D(mesh3D* mesh,
+                             size_t Nbytes, // message size per element
+                             void* sendBuffer, // temporary buffer
+                             void* recvBuffer);
+
+void meshHaloExchangeFinish3D(mesh3D* mesh);
+
+// build list of nodes on each face of the reference element
+void meshBuildFaceNodes3D(mesh3D* mesh);
+void meshBuildFaceNodesHex3D(mesh3D* mesh);
+
+dfloat meshMRABSetup3D(mesh3D* mesh, dfloat* EToDT, int maxLevels, dfloat finalTime);
+
+//MRAB weighted mesh partitioning
+void meshMRABWeightedPartition3D(mesh3D* mesh, dfloat* weights,
+                                 int numLevels, int* levels);
+
+void interpolateHex3D(dfloat* Inter, dfloat* x, int N, dfloat* Ix, int M);
+
+#define norm3(a,b,c) ( sqrt((a) * (a) + (b) * (b) + (c) * (c)) )
+
+/* offsets for geometric factors */
+#define RXID 0
+#define RYID 1
+#define SXID 2
+#define SYID 3
+#define  JID 4
+#define JWID 5
+#define IJWID 6
+#define RZID 7
+#define SZID 8
+#define TXID 9
+#define TYID 10
+#define TZID 11
+
+/* offsets for second order geometric factors */
+#define G00ID 0
+#define G01ID 1
+#define G11ID 2
+#define GWJID 3
+#define G12ID 4
+#define G02ID 5
+#define G22ID 6
+
+/* offsets for nx, ny, sJ, 1/J */
+#define NXID 0
+#define NYID 1
+#define SJID 2
+#define IJID 3
+#define IHID 4
+#define WSJID 5
+#define WIJID 6
+#define NZID 7
+#define STXID 8
+#define STYID 9
+#define STZID 10
+#define SBXID 11
+#define SBYID 12
+#define SBZID 13
+#define SURXID 14
+#define SURYID 15
+#define SURZID 16
+//
+//offsets for boltzmann PML variables
+#define QXID1 0
+#define QXID2 1
+#define QXID3 2
+#define QXID4 3
+#define QXID5 4
+#define QXID6 5
+#define QXID8 6
+//
+#define QYID1 7
+#define QYID2 8
+#define QYID3 9
+#define QYID4 10
+#define QYID5 11
+#define QYID7 12
+#define QYID9 13
+//
+#define QZID1 14
+#define QZID2 15
+#define QZID3 16
+#define QZID4 17
+#define QZID6 18
+#define QZID7 19
+#define QZID10  20
+
+mesh3D* meshSetupBoxHex3D(int N, setupAide &options);
+void meshConnectPeriodicFaceNodes3D(mesh3D* mesh, dfloat xper, dfloat yper, dfloat zper);
+
+// Mesh generation
+void NodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t);
+void FaceNodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t, int* _faceNodes);
+} // end C Linkage
+#endif
diff --git a/src/libP/src/meshApplyElementMatrix.c b/src/mesh/meshApplyElementMatrix.cpp
similarity index 100%
rename from src/libP/src/meshApplyElementMatrix.c
rename to src/mesh/meshApplyElementMatrix.cpp
diff --git a/src/mesh/meshBasis1D.cpp b/src/mesh/meshBasis1D.cpp
new file mode 100644
index 000000000..8d0a2bd1b
--- /dev/null
+++ b/src/mesh/meshBasis1D.cpp
@@ -0,0 +1,533 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#include "mesh.h"
+
+// ------------------------------------------------------------------------
+// 1D NODES
+// ------------------------------------------------------------------------
+void Nodes1D(int _N, dfloat* _r)
+{ // fills _r[0.._N] with the Gauss-Legendre-Lobatto nodes; no weights requested
+  JacobiGLL(_N, _r, nullptr);
+}
+
+void EquispacedNodes1D(int _N, dfloat* _r)
+{
+  // writes _N + 1 uniformly spaced points, starting at -1 with step 2/_N, into _r
+  const dfloat spacing = 2.0 / _N;
+  const int count = _N + 1;
+  for (int k = 0; k < count; ++k) _r[k] = -1.0 + k * spacing;
+}
+
+// ------------------------------------------------------------------------
+// ORTHONORMAL BASIS POLYNOMIALS
+// ------------------------------------------------------------------------
+void OrthonormalBasis1D(dfloat a, int i, dfloat* P)
+{ // stores JacobiP(a, 0, 0, i), i.e. the degree-i Legendre-type value at a, in *P
+  P[0] = JacobiP(a, 0, 0, i);
+}
+
+void GradOrthonormalBasis1D(dfloat a, int i, dfloat* Pr)
+{ // stores GradJacobiP(a, 0, 0, i), the gradient counterpart of the basis, in *Pr
+  Pr[0] = GradJacobiP(a, 0, 0, i);
+}
+
+// ------------------------------------------------------------------------
+// 1D VANDERMONDE MATRICES
+// ------------------------------------------------------------------------
+void Vandermonde1D(int _N, int Npoints, dfloat* _r, dfloat* V)
+{
+  // V is row-major Npoints x (_N+1): entry (pt, mode) is basis `mode` at _r[pt]
+  const int modes = _N + 1;
+  for(int pt = 0; pt < Npoints; ++pt) {
+    dfloat* row = V + pt * modes;
+    for(int mode = 0; mode < modes; ++mode)
+      OrthonormalBasis1D(_r[pt], mode, &row[mode]);
+  }
+}
+
+void GradVandermonde1D(int _N, int Npoints, dfloat* _r, dfloat* Vr)
+{
+  // Vr is row-major Npoints x (_N+1): entry (pt, mode) is the gradient of basis `mode` at _r[pt]
+  const int modes = _N + 1;
+  for(int pt = 0; pt < Npoints; ++pt) {
+    dfloat* row = Vr + pt * modes;
+    for(int mode = 0; mode < modes; ++mode)
+      GradOrthonormalBasis1D(_r[pt], mode, &row[mode]);
+  }
+}
+
+// ------------------------------------------------------------------------
+// 1D OPERATOR MATRICES
+// ------------------------------------------------------------------------
+void MassMatrix1D(int _Np, dfloat* V, dfloat* _MM)
+{
+  // Accumulate V*V' into _MM, then pass it to matrixInverse: MM = inv(V*V') = inv(V')*inv(V)
+  for(int entry = 0; entry < _Np * _Np; ++entry) {
+    const dfloat* rowA = V + (entry / _Np) * _Np;  // row n of V
+    const dfloat* rowB = V + (entry % _Np) * _Np;  // row m of V
+    dfloat dot = 0;
+    for(int k = 0; k < _Np; ++k) dot += rowA[k] * rowB[k];
+    _MM[entry] = dot;
+  }
+  matrixInverse(_Np, _MM);
+}
+
+void Dmatrix1D(int _N, int NpointsIn, dfloat* _rIn,
+               int NpointsOut, dfloat* _rOut, dfloat* _Dr)
+{
+  const int modes = _N + 1;
+  // the input node set must contain exactly _N+1 points, otherwise abort
+  if (NpointsIn != modes) {
+    std::cout << "Invalid Differentiation operator requested.\n";
+    exit(-1);
+  }
+
+  // zero-initialised scratch: basis at the input nodes, basis gradient at the output nodes
+  dfloat* basisIn = (dfloat*) calloc(NpointsIn * modes, sizeof(dfloat));
+  dfloat* gradOut = (dfloat*) calloc(NpointsOut * modes, sizeof(dfloat));
+
+  Vandermonde1D(_N, NpointsIn, _rIn, basisIn);
+  GradVandermonde1D(_N, NpointsOut, _rOut, gradOut);
+
+  // _Dr = gradOut / basisIn  (D = Vr/V)
+  matrixRightSolve(NpointsOut, modes, gradOut, modes, modes, basisIn, _Dr);
+
+  free(gradOut);
+  free(basisIn);
+}
+
+void DWmatrix1D(int _N, dfloat* _D, dfloat* _DT)
+{
+  // NOTE(review): _DT(row,col) = sum of row `col` of _D, independent of `row` - confirm intent
+  const int _Nq = _N + 1;
+  for(int row = 0; row < _Nq; ++row)
+    for(int col = 0; col < _Nq; ++col) {
+      dfloat& entry = _DT[row * _Nq + col];
+      entry = 0.0;
+      for(int k = 0; k < _Nq; ++k) entry += _D[col * _Nq + k];
+    }
+/*
+   dfloat *r1D  = (dfloat *) calloc(_Nq, sizeof(dfloat));
+   dfloat *w1D  = (dfloat *) calloc(_Nq, sizeof(dfloat));
+   JacobiGLL(_N, r1D, w1D); // i.e. 1D gll points and correspondin weights from mass lumping
+
+   dfloat *V1D  = (dfloat *) calloc(_Nq*_Nq, sizeof(dfloat));
+   dfloat *V1Dr = (dfloat *) calloc(_Nq*_Nq, sizeof(dfloat));
+   Vandermonde1D(_N, _Nq, r1D, V1D);
+   GradVandermonde1D(_N, _Nq, r1D, V1Dr);
+
+   // DW1D = V*Vr'*diag(w)
+   for(int n=0;n<_Nq;++n){
+    for(int m=0;m<_Nq;++m){
+      dfloat dw = 0;
+      for(int i=0; i<_Nq; i++) dw += V1D[n*_Nq + i]*V1Dr[m*_Nq + i];
+      _DT[n*_Nq+m] = dw; //*w1D[m]; // scale by w
+    }
+   }
+
+   free(r1D);
+   free(w1D);
+   free(V1D);
+   free(V1Dr);
+ */
+}
+
+void InterpolationMatrix1D(int _N,
+                           int NpointsIn, dfloat* rIn,
+                           int NpointsOut, dfloat* rOut,
+                           dfloat* I)
+{
+  const int modes = _N + 1;
+  // the source node set must contain exactly _N+1 points, otherwise abort
+  if (NpointsIn != modes) {
+    std::cout << "Invalid Interplation operator requested.\n";
+    exit(-1);
+  }
+
+  // basis values at the source (rIn) and target (rOut) nodes
+  dfloat* basisIn = (dfloat*) malloc(NpointsIn * modes * sizeof(dfloat));
+  dfloat* basisOut = (dfloat*) malloc(NpointsOut * modes * sizeof(dfloat));
+  Vandermonde1D(_N, NpointsIn,  rIn,  basisIn);
+  Vandermonde1D(_N, NpointsOut, rOut, basisOut);
+  // I = basisOut / basisIn, obtained through matrixRightSolve
+  matrixRightSolve(NpointsOut, modes, basisOut, NpointsIn, modes, basisIn, I);
+  free(basisOut);
+  free(basisIn);
+}
+
+void DegreeRaiseMatrix1D(int Nc, int Nf, dfloat* P)
+{
+  // P = interpolation from the degree-Nc Nodes1D points to the degree-Nf Nodes1D points
+  const int coarseCount = Nc + 1;
+  const int fineCount = Nf + 1;
+
+  dfloat* coarseNodes = (dfloat*) malloc(coarseCount * sizeof(dfloat));
+  Nodes1D(Nc, coarseNodes);
+
+  dfloat* fineNodes = (dfloat*) malloc(fineCount * sizeof(dfloat));
+  Nodes1D(Nf, fineNodes);
+
+  InterpolationMatrix1D(Nc, coarseCount, coarseNodes, fineCount, fineNodes, P);
+  free(fineNodes);
+  free(coarseNodes);
+}
+
+void CubatureWeakDmatrix1D(int _Nq, int _cubNq,
+                           dfloat* _cubProject, dfloat* _cubD, dfloat* _cubPDT)
+{
+  // _cubPDT (_Nq x _cubNq, row-major) = _cubProject * transpose(_cubD)
+  for(int row = 0; row < _Nq; ++row)
+    for(int col = 0; col < _cubNq; ++col) {
+      dfloat& out = _cubPDT[row * _cubNq + col];
+      out = 0.0;
+      for(int k = 0; k < _cubNq; ++k) out += _cubProject[row * _cubNq + k] * _cubD[col * _cubNq + k];
+    }
+}
+
+// ------------------------------------------------------------------------
+// 1D JACOBI POLYNOMIALS
+// ------------------------------------------------------------------------
+static dfloat mygamma(dfloat x)
+{
+  // Gamma(x) rebuilt from lgamma: magnitude exp(lgamma(x)), sign taken from signgam
+  const dfloat logMagnitude = lgamma(x);  // evaluated first, before signgam is read
+  return signgam * exp(logMagnitude);
+}
+
+dfloat JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N)
+{
+  // Evaluates the degree-_N Jacobi polynomial of type (alpha,beta) at the
+  // point a, scaled so that the degree-0 value is 1/sqrt(gamma0).
+  // Only the two most recent values of the three-term recurrence are kept,
+  // so no scratch array is allocated (and no unchecked calloc is dereferenced).
+  dfloat ax = a;
+
+  // Zero order
+  dfloat gamma0 = pow(2,(alpha + beta + 1)) / (alpha + beta + 1) * mygamma(1 + alpha) * mygamma(1 + beta) / mygamma(
+    1 + alpha + beta);
+  dfloat p0     = 1.0 / sqrt(gamma0);
+
+  if (_N == 0)
+    return p0;
+
+  // first order
+  dfloat gamma1 = (alpha + 1) * (beta + 1) / (alpha + beta + 3) * gamma0;
+  dfloat p1     = ((alpha + beta + 2) * ax / 2 + (alpha - beta) / 2) / sqrt(gamma1);
+  if (_N == 1)
+    return p1;
+
+  // pPrev/pCurr play the role of P[i-1]/P[i] in the recurrence below
+  dfloat pPrev = p0;
+  dfloat pCurr = p1;
+
+  /// Repeat value in recurrence.
+  dfloat aold = 2 / (2 + alpha + beta) * sqrt((alpha + 1.) * (beta + 1.) / (alpha + beta + 3.));
+  /// Forward recurrence using the symmetry of the recurrence.
+  for(int i = 1; i <= _N - 1; ++i) {
+    dfloat h1 = 2. * i + alpha + beta;
+    dfloat anew = 2. / (h1 + 2.) * sqrt(
+      (i + 1.) * (i + 1. + alpha + beta) * (i + 1 + alpha) * (i + 1 + beta) / (h1 + 1) / (h1 + 3));
+    dfloat bnew = -(alpha * alpha - beta * beta) / h1 / (h1 + 2);
+    dfloat pNext = 1. / anew * ( -aold * pPrev + (ax - bnew) * pCurr);
+    pPrev = pCurr;
+    pCurr = pNext;
+    aold = anew;
+  }
+
+  // after the loop pCurr holds the degree-_N value
+  return pCurr;
+}
+
+dfloat GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N)
+{
+  // gradient value built from the (alpha+1, beta+1) polynomial of degree _N-1;
+  // any _N <= 0 yields 0
+  if(_N <= 0)
+    return 0;
+
+  return sqrt(_N * (_N + alpha + beta + 1.)) * JacobiP(a, alpha + 1.0, beta + 1.0, _N - 1);
+}
+
+// ------------------------------------------------------------------------
+// 1D GAUSS-LEGENDRE-LOBATTO QUADRATURE
+// ------------------------------------------------------------------------
+void JacobiGLL(int _N, dfloat* _x, dfloat* _w)
+{
+  // Fills _x[0.._N] with the GLL nodes; if _w is non-null, also _N+1 mass-lumped weights.
+  _x[0] = -1.;
+  _x[_N] =  1.;
+
+  if(_N > 1) {
+    dfloat* wtmp = (dfloat*) calloc(_N - 1, sizeof(dfloat)); // Gauss weights, discarded
+    JacobiGQ(1,1, _N - 2, _x + 1, wtmp); // interior nodes go to _x[1.._N-1]
+    free(wtmp);
+  }
+
+  if (_w != NULL) {
+    int _Np = _N + 1;
+    dfloat* _MM = (dfloat*) malloc(_Np * _Np * sizeof(dfloat));
+    dfloat* V = (dfloat*) malloc(_Np * _Np * sizeof(dfloat));
+    Vandermonde1D(_N, _Np, _x, V);
+    MassMatrix1D(_Np, V, _MM);
+    // use weights from mass lumping: _w[n] is the sum of row n of the mass matrix
+    for(int n = 0; n < _Np; ++n) {
+      dfloat res = 0;
+      for(int m = 0; m < _Np; ++m) res += _MM[n * _Np + m];
+      _w[n] = res;
+    }
+    free(V);   // scratch matrices must be released here; they used to leak on every call
+    free(_MM);
+  }
+}
+
+// ------------------------------------------------------------------------
+// 1D GAUSS QUADRATURE
+// ------------------------------------------------------------------------
+void JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat* _x, dfloat* _w)
+{
+  // function NGQ = JacobiGQ(alpha,beta,_N, _x, _w)
+  // Purpose: Compute the _N'th order Gauss quadrature points, _x, and weights, _w,
+  //          associated with the Jacobi polynomial, of type (alpha,beta) > -1 ( <> -0.5).
+  //          _x and _w each receive _N+1 entries, ordered by ascending _x.
+  if (_N == 0) {
+    _x[0] = (alpha - beta) / (alpha + beta + 2);
+    _w[0] = 2;
+  }
+  // NOTE(review): no return above, so _w[0] (and presumably _x[0]) is overwritten below - confirm
+  // Form symmetric matrix from recurrence.
+  dfloat* J = (dfloat*) calloc((_N + 1) * (_N + 1), sizeof(dfloat));
+  dfloat* h1 = (dfloat*) calloc(_N + 1, sizeof(dfloat));
+
+  for(int n = 0; n <= _N; ++n)
+    h1[n] = 2 * n + alpha + beta;
+
+  // J = J + J';
+  for(int n = 0; n <= _N; ++n) {
+    // J = diag(-1/2*(alpha^2-beta^2)./(h1+2)./h1) + ...
+    J[n * (_N + 1) + n] += -0.5 * (alpha * alpha - beta * beta) / ((h1[n] + 2) * h1[n]) * 2; // *2 for symm
+
+    //    diag(2./(h1(1:_N)+2).*sqrt((1:_N).*((1:_N)+alpha+beta).*((1:_N)+alpha).*((1:_N)+beta)./(h1(1:_N)+1)./(h1(1:_N)+3)),1);
+    if(n < _N) {
+      J[n * (_N + 1) + n + 1]   += (2. / (h1[n] + 2.)) *
+                                   sqrt((n + 1) * (n + 1 + alpha + beta) * (n + 1 + alpha) * (n + 1 + beta) /
+                                        ((h1[n] + 1) * (h1[n] + 3)));
+      J[(n + 1) * (_N + 1) + n] += (2. / (h1[n] + 2.)) *
+                                   sqrt((n + 1) * (n + 1 + alpha + beta) * (n + 1 + alpha) * (n + 1 + beta) /
+                                        ((h1[n] + 1) * (h1[n] + 3)));
+    }
+  }
+
+  dfloat eps = 1;
+  while(1 + eps > 1)
+    eps = eps / 2.;
+  // printf("MACHINE PRECISION %e\n", eps);
+
+  if (alpha + beta < 10 * eps) J[0] = 0;
+
+  // Compute quadrature by eigenvalue solve
+
+  //  [V,D] = eig(J);
+  dfloat* WI = (dfloat*) calloc(_N + 1, sizeof(dfloat));
+  dfloat* VR = (dfloat*) calloc((_N + 1) * (_N + 1), sizeof(dfloat));
+
+  // _x = diag(D);  (_x is handed to matrixEig as its WR argument)
+  matrixEig(_N + 1, J, VR, _x, WI);
+
+  //_w = (V(1,:)').^2*2^(alpha+beta+1)/(alpha+beta+1)*gamma(alpha+1)*.gamma(beta+1)/gamma(alpha+beta+1);
+  for(int n = 0; n <= _N; ++n)
+    _w[n] = pow(VR[0 * (_N + 1) + n],2) * (pow(2,alpha + beta + 1) / (alpha + beta + 1)) * mygamma(alpha + 1) * mygamma(
+      beta + 1) / mygamma(alpha + beta + 1);
+
+  // sloppy sort
+  for(int n = 0; n <= _N; ++n)
+    for(int m = n + 1; m <= _N; ++m)
+      if(_x[n] > _x[m]) {
+        dfloat tmpx = _x[m];
+        dfloat tmpw = _w[m];
+        _x[m] = _x[n];
+        _w[m] = _w[n];
+        _x[n] = tmpx;
+        _w[n] = tmpw;
+      }
+
+#if 0
+  for(int n = 0; n <= _N; ++n)
+    printf("zgl[%d] = % e, wgl[%d] = % e\n", n, _x[0][n], n, _w[0][n]);
+
+#endif
+
+  free(J);  // J and h1 were previously never released
+  free(h1);
+  free(WI);
+  free(VR);
+}
+
+/*
+   // C0 basis
+   int meshContinuousVandermonde1D(int _N, int Npoints, dfloat *_r, dfloat **V, dfloat **Vr){
+
+   int _Np = (_N+1);
+
+ * V  = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
+ * Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
+
+   for(int n=0; n<Npoints; n++){
+
+    int sk = 0;
+    for(int i=0; i<=_N; i++){
+      int id = n*_Np+sk;
+      if(i==0){
+        V[0][id] = 0.5*(1-_r[n]);
+        Vr[0][id] = -0.5;
+      }
+      else  if(i==1){
+        V[0][id] = 0.5*(1+_r[n]);
+        Vr[0][id] = +0.5;
+      }
+      else{
+        // 0.25*(1+_r)*(1-_r)*P^{0,0}_{i-2}(_r)
+        dfloat P =  meshJacobiP(_r[n], 0, 0, i-2);
+        dfloat Pr = meshGradJacobiP(_r[n], 0, 0, i-2);
+        V[0][id]  = 0.25*(1+_r[n])*(1-_r[n])*P;
+        Vr[0][id] = 0.25*( (-2*_r[n])*P + (1+_r[n])*(1-_r[n])*Pr);
+      }
+
+      sk++;
+    }
+   }
+
+   return _Np;
+   }
+ */
+
+/*
+   void meshContinuousFilterMatrix1D(int _N, int Nlow, dfloat *_r, dfloat **F){
+
+   dfloat *VC0, *VrC0;
+   dfloat *L = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
+   dfloat *LinvF = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
+
+   int _Np = meshContinuousVandermonde1D(_N, _N+1, _r, &VC0, &VrC0);
+   //  int _Np = meshVandermonde1D(_N, _N+1, _r, &VC0, &VrC0); use
+   printf("CONTINUOUS VANDERMONDE MATRIX: [\n");
+   for(int n=0;n<_Np;++n){
+    for(int m=0;m<_Np;++m){
+      printf("% e ", VC0[n*_Np+m]);
+    }
+    printf("\n");
+   }
+   printf("\n");
+
+ * F = (dfloat *) calloc(_Np*_Np, sizeof(dfloat));
+
+   for(int n=0;n<=Nlow;++n){
+    L[n*(_N+1)+n] = 1;
+   }
+
+   matrixRightSolve(_Np, _Np, L, _Np, _Np, VC0, LinvF);
+
+   for(int n=0;n<_Np;++n){
+    for(int m=0;m<_Np;++m){
+      dfloat res = 0;
+      printf("% e ", LinvF[n*_Np+m]);
+    }
+    printf("\n");
+   }
+   printf("\n");
+
+   printf("FILTER MATRIX: [\n");
+   for(int n=0;n<_Np;++n){
+    for(int m=0;m<_Np;++m){
+      dfloat res = 0;
+      for(int i=0;i<_Np;++i){
+        res += VC0[n*_Np+i]*LinvF[i*_Np+m];
+      }
+      F[0][n*_Np+m] = res;
+      printf("% e ", res);
+    }
+    printf("\n");
+   }
+   printf("\n");
+
+   free(VC0);
+   free(VrC0);
+   free(L);
+   free(LinvF);
+   }
+ */
+
+// ------------------------------------------------------------------------
+// 1D INTERPOLATION MATRICES
+// ------------------------------------------------------------------------
+
+/*
+
+ */
+
+/*
+
+   void meshCubatureWeakDmatrices1D(int _N, int _Np, dfloat *V,
+                                 int cubNp, dfloat *cubr, dfloat *cubw,
+                                 dfloat **cubDrT, dfloat **cubProject){
+
+   dfloat *cubV, *cubVr;
+
+   meshVandermonde1D(_N, cubNp, cubr, &cubV, &cubVr);
+
+   // cubDrT = V*transpose(cVr)*diag(cubw);
+   // cubProject = V*cV'*diag(cubw); %% relies on (transpose(cV)*diag(cubw)*cV being the identity)
+
+   for(int n=0;n<cubNp;++n){
+    for(int m=0;m<_Np;++m){
+      // scale by cubw
+      cubVr[n*_Np+m] *= cubw[n];
+      cubV[n*_Np+m]  *= cubw[n];
+    }
+   }
+
+ * cubDrT = (dfloat*) calloc(cubNp*_Np, sizeof(dfloat));
+ * cubProject = (dfloat*) calloc(cubNp*_Np, sizeof(dfloat));
+
+   for(int n=0;n<_Np;++n){
+    for(int m=0;m<cubNp;++m){
+      dfloat resP = 0, resDrT = 0;
+
+      for(int i=0;i<_Np;++i){
+        dfloat Vni = V[n*_Np+i];
+        resDrT += Vni*cubVr[m*_Np+i];
+        resP   += Vni*cubV[m*_Np+i];
+      }
+
+      cubDrT[0][n*cubNp+m] = resDrT;
+      cubProject[0][n*cubNp+m] = resP;
+    }
+   }
+
+   free(cubV);
+   free(cubVr);
+   }
+ */
diff --git a/src/mesh/meshBasisHex3D.cpp b/src/mesh/meshBasisHex3D.cpp
new file mode 100644
index 000000000..93d6a61e3
--- /dev/null
+++ b/src/mesh/meshBasisHex3D.cpp
@@ -0,0 +1,368 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#include "mesh.h"
+#include "mesh3D.h"
+
+// ------------------------------------------------------------------------
+// HEX 3D NODES
+// ------------------------------------------------------------------------
+void NodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t) // fills _r/_s/_t with the tensor product of the 1D node set returned by JacobiGLL(_N, ...)
+{
+  int _Nq = _N + 1; // number of 1D nodes per direction
+
+  dfloat* r1D = (dfloat*) malloc(_Nq * sizeof(dfloat)); // scratch for the 1D nodes; released before returning (allocation result is not checked)
+  JacobiGLL(_N, r1D); //Gauss-Legendre-Lobatto nodes
+
+  //Tensor product: i runs fastest, then j, then k, so each output array receives _Nq^3 entries
+  for (int k = 0; k < _Nq; k++)
+    for (int j = 0; j < _Nq; j++)
+      for (int i = 0; i < _Nq; i++) {
+        _r[i + j * _Nq + k * _Nq * _Nq] = r1D[i];
+        _s[i + j * _Nq + k * _Nq * _Nq] = r1D[j];
+        _t[i + j * _Nq + k * _Nq * _Nq] = r1D[k];
+      }
+
+  free(r1D);
+}
+
+void FaceNodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t, int* _faceNodes) // lists, per face, the indices of the nodes whose coordinate lies on that face
+{
+  int _Nq = _N + 1;
+  int _Nfp = _Nq * _Nq; // stride between consecutive faces inside _faceNodes
+  int _Np = _Nq * _Nq * _Nq; // number of nodes scanned
+
+  int cnt[6]; // per-face count of nodes stored so far
+  for (int i = 0; i < 6; i++) cnt[i] = 0;
+
+  dfloat deps = 1.;
+  while((1. + deps) > 1.) // halve deps until adding it to 1 no longer changes the value
+    deps *= 0.5;
+
+  const dfloat NODETOL = 1000. * deps; // tolerance used when comparing a coordinate against +/-1
+
+  for (int n = 0; n < _Np; n++) { // NOTE(review): cnt[f] is never bounds-checked; assumes at most _Nfp nodes match each face - confirm
+    if(fabs(_t[n] + 1) < NODETOL) // face 0: t = -1
+      _faceNodes[0 * _Nfp + (cnt[0]++)] = n;
+    if(fabs(_s[n] + 1) < NODETOL) // face 1: s = -1
+      _faceNodes[1 * _Nfp + (cnt[1]++)] = n;
+    if(fabs(_r[n] - 1) < NODETOL) // face 2: r = +1
+      _faceNodes[2 * _Nfp + (cnt[2]++)] = n;
+    if(fabs(_s[n] - 1) < NODETOL) // face 3: s = +1
+      _faceNodes[3 * _Nfp + (cnt[3]++)] = n;
+    if(fabs(_r[n] + 1) < NODETOL) // face 4: r = -1
+      _faceNodes[4 * _Nfp + (cnt[4]++)] = n;
+    if(fabs(_t[n] - 1) < NODETOL) // face 5: t = +1
+      _faceNodes[5 * _Nfp + (cnt[5]++)] = n;
+  }
+}
+
+#if 0
+
+void mesh_t::VertexNodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t, int* _vertexNodes) // records, for each of the 8 corners of [-1,1]^3, the index of a node located there
+{
+  int _Nq = _N + 1;
+  int _Np = _Nq * _Nq * _Nq; // number of nodes scanned
+
+  dfloat deps = 1.;
+  while((1. + deps) > 1.) // halve deps until adding it to 1 no longer changes the value
+    deps *= 0.5;
+
+  const dfloat NODETOL = 1000. * deps; // compared below against the SQUARED distance to each corner
+
+  for(int n = 0; n < _Np; ++n) { // if several nodes match a corner the last one wins; unmatched entries of _vertexNodes are left untouched
+    if( (_r[n] + 1) * (_r[n] + 1) + (_s[n] + 1) * (_s[n] + 1) + (_t[n] + 1) * (_t[n] + 1) < NODETOL)
+      _vertexNodes[0] = n; // corner (r,s,t) = (-1,-1,-1)
+    if( (_r[n] - 1) * (_r[n] - 1) + (_s[n] + 1) * (_s[n] + 1) + (_t[n] + 1) * (_t[n] + 1) < NODETOL)
+      _vertexNodes[1] = n; // corner (+1,-1,-1)
+    if( (_r[n] - 1) * (_r[n] - 1) + (_s[n] - 1) * (_s[n] - 1) + (_t[n] + 1) * (_t[n] + 1) < NODETOL)
+      _vertexNodes[2] = n; // corner (+1,+1,-1)
+    if( (_r[n] + 1) * (_r[n] + 1) + (_s[n] - 1) * (_s[n] - 1) + (_t[n] + 1) * (_t[n] + 1) < NODETOL)
+      _vertexNodes[3] = n; // corner (-1,+1,-1)
+    if( (_r[n] + 1) * (_r[n] + 1) + (_s[n] + 1) * (_s[n] + 1) + (_t[n] - 1) * (_t[n] - 1) < NODETOL)
+      _vertexNodes[4] = n; // corner (-1,-1,+1)
+    if( (_r[n] - 1) * (_r[n] - 1) + (_s[n] + 1) * (_s[n] + 1) + (_t[n] - 1) * (_t[n] - 1) < NODETOL)
+      _vertexNodes[5] = n; // corner (+1,-1,+1)
+    if( (_r[n] - 1) * (_r[n] - 1) + (_s[n] - 1) * (_s[n] - 1) + (_t[n] - 1) * (_t[n] - 1) < NODETOL)
+      _vertexNodes[6] = n; // corner (+1,+1,+1)
+    if( (_r[n] + 1) * (_r[n] + 1) + (_s[n] - 1) * (_s[n] - 1) + (_t[n] - 1) * (_t[n] - 1) < NODETOL)
+      _vertexNodes[7] = n; // corner (-1,+1,+1)
+  }
+}
+
+void mesh_t::EquispacedNodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t) // fills _r/_s/_t with the tensor product of _N+1 equally spaced points on [-1,1]
+{
+  int _Nq = _N + 1; // number of 1D points per direction
+
+  //Equispaced 1D nodes
+  dfloat* r1D = (dfloat*) malloc(_Nq * sizeof(dfloat)); // scratch; released before returning
+  dfloat dr = 2.0 / _N; // NOTE(review): divides by zero when _N == 0 - confirm callers never pass 0
+  for (int i = 0; i < _Nq; i++) r1D[i] = -1.0 + i * dr;
+
+  //Tensor product: i runs fastest, then j, then k, so each output array receives _Nq^3 entries
+  for (int k = 0; k < _Nq; k++)
+    for (int j = 0; j < _Nq; j++)
+      for (int i = 0; i < _Nq; i++) {
+        _r[i + j * _Nq + k * _Nq * _Nq] = r1D[i];
+        _s[i + j * _Nq + k * _Nq * _Nq] = r1D[j];
+        _t[i + j * _Nq + k * _Nq * _Nq] = r1D[k];
+      }
+
+  free(r1D);
+}
+
+void mesh_t::EquispacedEToVHex3D(int _N, int* _EToV) // writes a tetrahedral connectivity table over the (_N+1)^3 node lattice: six tets per lattice cell
+{
+  int _Nq = _N + 1; // nodes per direction; node (i,j,k) has index i + j*_Nq + k*_Nq*_Nq
+  int _Nverts = 4; // vertices per output element (tetrahedra)
+
+  //Tensor product over the _N^3 lattice cells; 6 tets x 4 vertices are written per cell, i.e. 24*_N^3 ints in total
+  int cnt = 0; // running index of the tet being written
+  for (int k = 0; k < _N; k++)
+    for (int j = 0; j < _N; j++)
+      for (int i = 0; i < _N; i++) {
+        //tet 1 (0,3,2,7)  -- corner label c means offset (c&1, (c>>1)&1, (c>>2)&1) from (i,j,k); every tet contains corners 0 and 7
+        _EToV[cnt * _Nverts + 0] = i  + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 1] = i + 1 + (j + 1) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 2] = i  + (j + 1) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 3] = i + 1 + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        cnt++;
+
+        //tet 2 (0,1,3,7)
+        _EToV[cnt * _Nverts + 0] = i  + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 1] = i + 1 + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 2] = i + 1 + (j + 1) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 3] = i + 1 + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        cnt++;
+
+        //tet 3 (0,2,6,7)
+        _EToV[cnt * _Nverts + 0] = i  + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 1] = i  + (j + 1) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 2] = i  + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 3] = i + 1 + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        cnt++;
+
+        //tet 4 (0,6,4,7)
+        _EToV[cnt * _Nverts + 0] = i  + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 1] = i  + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 2] = i  + (j  ) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 3] = i + 1 + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        cnt++;
+
+        //tet 5 (0,5,1,7)
+        _EToV[cnt * _Nverts + 0] = i  + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 1] = i + 1 + (j  ) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 2] = i + 1 + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 3] = i + 1 + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        cnt++;
+
+        //tet 6 (0,4,5,7)
+        _EToV[cnt * _Nverts + 0] = i  + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 1] = i  + (j  ) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 2] = i + 1 + (j  ) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 3] = i + 1 + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        cnt++;
+      }
+}
+
+void mesh_t::SEMFEMEToVHex3D(int _N, int* _EToV) // writes a hexahedral connectivity table over the (_N+1)^3 node lattice: one 8-vertex hex per lattice cell
+{
+  int _Nq = _N + 1; // nodes per direction; node (i,j,k) has index i + j*_Nq + k*_Nq*_Nq
+  int _Nverts = 8; // vertices per output element
+
+  //Tensor product over the _N^3 lattice cells; 8 ints are written per cell, i.e. 8*_N^3 in total
+  int cnt = 0; // running index of the hex being written
+  for (int k = 0; k < _N; k++)
+    for (int j = 0; j < _N; j++)
+      for (int i = 0; i < _N; i++) {
+        _EToV[cnt * _Nverts + 0] = i  + (j  ) * _Nq + (k  ) * _Nq * _Nq; // vertices 0-3: plane k, visiting (i,j), (i+1,j), (i+1,j+1), (i,j+1)
+        _EToV[cnt * _Nverts + 1] = i + 1 + (j  ) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 2] = i + 1 + (j + 1) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 3] = i  + (j + 1) * _Nq + (k  ) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 4] = i  + (j  ) * _Nq + (k + 1) * _Nq * _Nq; // vertices 4-7: same circuit in plane k+1
+        _EToV[cnt * _Nverts + 5] = i + 1 + (j  ) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 6] = i + 1 + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        _EToV[cnt * _Nverts + 7] = i  + (j + 1) * _Nq + (k + 1) * _Nq * _Nq;
+        cnt++;
+      }
+}
+
+// ------------------------------------------------------------------------
+// ORTHONORMAL BASIS POLYNOMIALS
+// ------------------------------------------------------------------------
+void mesh_t::OrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat* P) // stores in *P the product of three 1D JacobiP values, evaluated at a, b, c with orders i, j, k
+{
+  *P = JacobiP(a,0,0,i) * JacobiP(b,0,0,j) * JacobiP(c,0,0,k); // both middle arguments are 0 - presumably alpha = beta = 0 (Legendre); confirm against JacobiP
+}
+
+void mesh_t::GradOrthonormalBasisHex3D(dfloat a, // stores in *Pr, *Ps, *Pt the three products obtained by replacing one JacobiP factor at a time with GradJacobiP
+                                       dfloat b,
+                                       dfloat c,
+                                       int i,
+                                       int j,
+                                       int k,
+                                       dfloat* Pr,
+                                       dfloat* Ps,
+                                       dfloat* Pt)
+{
+  *Pr = GradJacobiP(a,0,0,i) * JacobiP(b,0,0,j) * JacobiP(c,0,0,k); // GradJacobiP applied to the factor in a
+  *Ps = JacobiP(a,0,0,i) * GradJacobiP(b,0,0,j) * JacobiP(c,0,0,k); // GradJacobiP applied to the factor in b
+  *Pt = JacobiP(a,0,0,i) * JacobiP(b,0,0,j) * GradJacobiP(c,0,0,k); // GradJacobiP applied to the factor in c
+}
+
+// ------------------------------------------------------------------------
+// 3D VANDERMONDE MATRICES
+// ------------------------------------------------------------------------
+
+void mesh_t::VandermondeHex3D(int _N, int Npoints, dfloat* _r, dfloat* _s, dfloat* _t, dfloat* V) // fills V (Npoints x _Np, row-major) with OrthonormalBasisHex3D values
+{
+  int _Nq = _N + 1;
+  int _Np = _Nq * _Nq * _Nq; // number of basis modes = row length of V
+
+  for(int n = 0; n < Npoints; n++) // row n corresponds to the point (_r[n], _s[n], _t[n])
+    for(int k = 0; k < _Nq; k++)
+      for(int j = 0; j < _Nq; j++)
+        for(int i = 0; i < _Nq; i++) {
+          int id = n * _Np + i + j * _Nq + k * _Nq * _Nq; // column i + j*_Nq + k*_Nq^2 corresponds to mode (i,j,k)
+          OrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, V + id);
+        }
+}
+
+void mesh_t::GradVandermondeHex3D(int _N, // fills Vr, Vs, Vt (each Npoints x _Np, row-major) with GradOrthonormalBasisHex3D values
+                                  int Npoints,
+                                  dfloat* _r,
+                                  dfloat* _s,
+                                  dfloat* _t,
+                                  dfloat* Vr,
+                                  dfloat* Vs,
+                                  dfloat* Vt)
+{
+  int _Nq = _N + 1;
+  int _Np = _Nq * _Nq * _Nq; // number of basis modes = row length of each output matrix
+
+  for(int n = 0; n < Npoints; n++) // row n corresponds to the point (_r[n], _s[n], _t[n])
+    for(int k = 0; k < _Nq; k++)
+      for(int j = 0; j < _Nq; j++)
+        for(int i = 0; i < _Nq; i++) {
+          int id = n * _Np + i + j * _Nq + k * _Nq * _Nq; // column i + j*_Nq + k*_Nq^2 corresponds to mode (i,j,k)
+          GradOrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, Vr + id, Vs + id, Vt + id);
+        }
+}
+
+// ------------------------------------------------------------------------
+// 3D OPERATOR MATRICES
+// ------------------------------------------------------------------------
+void mesh_t::MassMatrixHex3D(int _Np, dfloat* V, dfloat* _MM) // forms V*V' in _MM (both _Np x _Np, row-major) and then hands _MM to matrixInverse
+{
+  // massMatrix = inv(V')*inv(V) = inv(V*V')
+  for(int n = 0; n < _Np; ++n)
+    for(int m = 0; m < _Np; ++m) {
+      dfloat res = 0; // accumulates entry (n,m) of V*V'
+      for(int i = 0; i < _Np; ++i)
+        res += V[n * _Np + i] * V[m * _Np + i];
+      _MM[n * _Np + m] = res;
+    }
+  matrixInverse(_Np, _MM); // presumably inverts _MM in place, since no separate output is passed - confirm against matrixInverse
+}
+
+void mesh_t::LumpedMassMatrixHex3D(int _N, dfloat* _gllw, dfloat* _MM) // writes the diagonal of the _Np x _Np matrix _MM from products of the 1D weights _gllw
+{
+  int _Nq = _N + 1;
+  int _Np = _Nq * _Nq * _Nq; // row length of _MM
+
+  // LumpedMassMatrix = gllw \ctimes gllw \ctimes gllw
+  for(int k = 0; k < _Nq; ++k)
+    for(int n = 0; n < _Nq; ++n)
+      for(int m = 0; m < _Nq; ++m) {
+        int id = n + m * _Nq + k * _Nq * _Nq; // node index: n runs fastest, then m, then k
+        _MM[id + id * _Np] = _gllw[n] * _gllw[m] * _gllw[k]; // only diagonal entries are written; NOTE(review): off-diagonals presumably must be zeroed by the caller
+      }
+}
+
+void mesh_t::invLumpedMassMatrixHex3D(int _N, dfloat* _gllw, dfloat* _invMM) // writes the diagonal of the _Np x _Np matrix _invMM with reciprocals of products of the 1D weights _gllw
+{
+  int _Nq = _N + 1;
+  int _Np = _Nq * _Nq * _Nq; // row length of _invMM
+
+  // invLumpedMassMatrix = invgllw \ctimes invgllw \ctimes invgllw
+  for(int k = 0; k < _Nq; ++k)
+    for(int n = 0; n < _Nq; ++n)
+      for(int m = 0; m < _Nq; ++m) {
+        int id = n + m * _Nq + k * _Nq * _Nq; // node index: n runs fastest, then m, then k
+        _invMM[id + id * _Np] = 1.0 / (_gllw[n] * _gllw[m] * _gllw[k]); // only diagonal entries are written; NOTE(review): off-diagonals presumably must be zeroed by the caller
+      }
+}
+
+void mesh_t::DmatrixHex3D(int _N, int Npoints, dfloat* _r, dfloat* _s, dfloat* _t, // builds _Dr, _Ds, _Dt from the Vandermonde matrix and its gradient counterparts at the given points
+                          dfloat* _Dr, dfloat* _Ds, dfloat* _Dt)
+{
+  int _Nq = _N + 1;
+  int _Np = _Nq * _Nq * _Nq;
+
+  dfloat* V  = (dfloat*) calloc(Npoints * _Np, sizeof(dfloat)); // temporaries, Npoints x _Np each; all four are freed below
+  dfloat* Vr = (dfloat*) calloc(Npoints * _Np, sizeof(dfloat));
+  dfloat* Vs = (dfloat*) calloc(Npoints * _Np, sizeof(dfloat));
+  dfloat* Vt = (dfloat*) calloc(Npoints * _Np, sizeof(dfloat));
+
+  VandermondeHex3D(_N, Npoints, _r, _s, _t, V);
+  GradVandermondeHex3D(_N, Npoints, _r, _s, _t, Vr, Vs, Vt);
+
+  //Dr = Vr/V, Ds = Vs/V, Dt = Vt/V
+  matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); // NOTE(review): sizes are passed as _Np x _Np although the buffers are Npoints x _Np; presumably requires Npoints == _Np - confirm
+  matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
+  matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt);
+
+  free(V);
+  free(Vr);
+  free(Vs);
+  free(Vt);
+}
+
+void mesh_t::InterpolationMatrixHex3D(int _N, // builds I from the Vandermonde matrices of the input and output point sets
+                                      int NpointsIn, dfloat* rIn, dfloat* sIn, dfloat* tIn,
+                                      int NpointsOut, dfloat* rOut, dfloat* sOut, dfloat* tOut,
+                                      dfloat* I)
+{
+  int _Nq = _N + 1;
+  int _Np = _Nq * _Nq * _Nq; // number of basis modes for degree _N
+
+  // need NpointsIn = _Np
+  if (NpointsIn != _Np)
+    LIBP_ABORT(string("Invalid Interpolation operator requested.")); // explicit ';' so the declaration below cannot be read as part of this if
+
+  dfloat* VIn = (dfloat*) malloc(NpointsIn * _Np * sizeof(dfloat)); // NpointsIn x _Np, freed below
+  dfloat* VOut = (dfloat*) malloc(NpointsOut * _Np * sizeof(dfloat)); // NpointsOut x _Np, freed below
+
+  VandermondeHex3D(_N, NpointsIn,   rIn, sIn, tIn, VIn);
+  VandermondeHex3D(_N, NpointsOut, rOut, sOut, tOut, VOut);
+
+  matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I); // presumably I = VOut / VIn (NpointsOut x NpointsIn) - confirm against matrixRightSolve
+
+  free(VIn);
+  free(VOut);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/libP/src/meshConnect.c b/src/mesh/meshConnect.cpp
similarity index 100%
rename from src/libP/src/meshConnect.c
rename to src/mesh/meshConnect.cpp
index c194a5067..80b8e6e62 100644
--- a/src/libP/src/meshConnect.c
+++ b/src/mesh/meshConnect.cpp
@@ -59,6 +59,7 @@ int compareVertices(const void* a,
 
   return 0;
 }
+
 /* comparison function that orders element/face
    based on their indexes */
 int compareFaces(const void* a,
@@ -124,7 +125,6 @@ void meshConnect(mesh_t* mesh)
       faces[cnt + 1].faceNeighbor = faces[cnt].face;
     }
 
-
   /* resort faces back to the original element/face ordering */
   qsort(faces,
         mesh->Nelements * mesh->Nfaces,
diff --git a/src/libP/src/meshConnectBoundary.c b/src/mesh/meshConnectBoundary.cpp
similarity index 100%
rename from src/libP/src/meshConnectBoundary.c
rename to src/mesh/meshConnectBoundary.cpp
diff --git a/src/libP/src/meshConnectFaceNodes3D.c b/src/mesh/meshConnectFaceNodes3D.cpp
similarity index 100%
rename from src/libP/src/meshConnectFaceNodes3D.c
rename to src/mesh/meshConnectFaceNodes3D.cpp
diff --git a/src/libP/src/meshConnectPeriodicFaceNodes3D.c b/src/mesh/meshConnectPeriodicFaceNodes3D.cpp
similarity index 100%
rename from src/libP/src/meshConnectPeriodicFaceNodes3D.c
rename to src/mesh/meshConnectPeriodicFaceNodes3D.cpp
diff --git a/src/libP/src/meshFree.c b/src/mesh/meshFree.cpp
similarity index 98%
rename from src/libP/src/meshFree.c
rename to src/mesh/meshFree.cpp
index 9442bc8bf..62a877de9 100644
--- a/src/libP/src/meshFree.c
+++ b/src/mesh/meshFree.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "mesh.h"
-#include "ogsInterface.h"
+//#include "ogsInterface.h"
 
 void meshFree(mesh_t* mesh)
 {
@@ -57,9 +57,9 @@ void meshFree(mesh_t* mesh)
   // CG gather-scatter info
   if(mesh->globalIds) free(mesh->globalIds);
   if(mesh->maskedGlobalIds) free(mesh->maskedGlobalIds);
-  if(  mesh->gsh) ogsHostFree(  mesh->gsh);
-  if(  mesh->hostGsh) ogsHostFree(  mesh->hostGsh);// gslib struct pointer
-  if(  mesh->ogs) ogsFree(  mesh->ogs);//occa gs pointer
+  //if(mesh->gsh) ogsHostFree(mesh->gsh);
+  //if(  mesh->hostGsh) ogsHostFree(  mesh->hostGsh);// gslib struct pointer
+  if(  mesh->ogs) ogsFree(  mesh->ogs); //occa gs pointer
 
   // list of elements that are needed for global gather-scatter
   if(mesh->globalGatherElementList) free(mesh->globalGatherElementList);
diff --git a/src/libP/src/meshGeometricFactorsHex3D.c b/src/mesh/meshGeometricFactorsHex3D.cpp
similarity index 98%
rename from src/libP/src/meshGeometricFactorsHex3D.c
rename to src/mesh/meshGeometricFactorsHex3D.cpp
index efbb4cfca..695f43fd1 100644
--- a/src/libP/src/meshGeometricFactorsHex3D.c
+++ b/src/mesh/meshGeometricFactorsHex3D.cpp
@@ -298,7 +298,8 @@ void meshGeometricFactorsHex3D(mesh3D* mesh)
     MPI_Reduce(&maxSkew, &globalMaxSkew, 1, MPI_DFLOAT, MPI_MAX, 0, mesh->comm);
 
     if(mesh->rank == 0)
-      printf("J in range [%g,%g] and max Skew = %g\n", globalMinJ, globalMaxJ, globalMaxSkew);
+      printf("J in range [%g,%g]\n", globalMinJ, globalMaxJ);
+      //printf("J in range [%g,%g] and max Skew = %g\n", globalMinJ, globalMaxJ, globalMaxSkew);
 
     dfloat globalVolume;
     MPI_Allreduce(&mesh->volume, &globalVolume, 1, MPI_DFLOAT, MPI_SUM, mesh->comm);
diff --git a/src/libP/src/meshHaloExchange.c b/src/mesh/meshHaloExchange.cpp
similarity index 100%
rename from src/libP/src/meshHaloExchange.c
rename to src/mesh/meshHaloExchange.cpp
diff --git a/src/libP/src/meshHaloExtract.c b/src/mesh/meshHaloExtract.cpp
similarity index 100%
rename from src/libP/src/meshHaloExtract.c
rename to src/mesh/meshHaloExtract.cpp
diff --git a/src/libP/src/meshHaloSetup.c b/src/mesh/meshHaloSetup.cpp
similarity index 100%
rename from src/libP/src/meshHaloSetup.c
rename to src/mesh/meshHaloSetup.cpp
diff --git a/src/libP/src/meshLoadReferenceNodesHex3D.c b/src/mesh/meshLoadReferenceNodesHex3D.cpp
similarity index 73%
rename from src/libP/src/meshLoadReferenceNodesHex3D.c
rename to src/mesh/meshLoadReferenceNodesHex3D.cpp
index 2dc30c22a..06d431541 100644
--- a/src/libP/src/meshLoadReferenceNodesHex3D.c
+++ b/src/mesh/meshLoadReferenceNodesHex3D.cpp
@@ -40,54 +40,52 @@ void meshLoadReferenceNodesHex3D(mesh3D* mesh, int N, int cubN)
 
   int Nrows, Ncols;
 
-  mesh->r = (dfloat *) malloc(mesh->Np*sizeof(dfloat));
-  mesh->s = (dfloat *) malloc(mesh->Np*sizeof(dfloat));
-  mesh->t = (dfloat *) malloc(mesh->Np*sizeof(dfloat));
+  mesh->r = (dfloat*) malloc(mesh->Np * sizeof(dfloat));
+  mesh->s = (dfloat*) malloc(mesh->Np * sizeof(dfloat));
+  mesh->t = (dfloat*) malloc(mesh->Np * sizeof(dfloat));
   NodesHex3D(mesh->N, mesh->r, mesh->s, mesh->t);
 
-  mesh->faceNodes = (int *) malloc(mesh->Nfaces*mesh->Nfp*sizeof(int));
+  mesh->faceNodes = (int*) malloc(mesh->Nfaces * mesh->Nfp * sizeof(int));
   FaceNodesHex3D(mesh->N, mesh->r, mesh->s, mesh->t, mesh->faceNodes);
 
   //GLL quadrature
-  mesh->gllz = (dfloat *) malloc((mesh->N+1)*sizeof(dfloat));
-  mesh->gllw = (dfloat *) malloc((mesh->N+1)*sizeof(dfloat));
+  mesh->gllz = (dfloat*) malloc((mesh->N + 1) * sizeof(dfloat));
+  mesh->gllw = (dfloat*) malloc((mesh->N + 1) * sizeof(dfloat));
   JacobiGLL(mesh->N, mesh->gllz, mesh->gllw);
 
-  mesh->D = (dfloat *) malloc(mesh->Nq*mesh->Nq*sizeof(dfloat));
+  mesh->D = (dfloat*) malloc(mesh->Nq * mesh->Nq * sizeof(dfloat));
   Dmatrix1D(mesh->N, mesh->Nq, mesh->gllz, mesh->Nq, mesh->gllz, mesh->D);
 
-  mesh->DW = (dfloat *) malloc(mesh->Nq*mesh->Nq*sizeof(dfloat));
+  mesh->DW = (dfloat*) malloc(mesh->Nq * mesh->Nq * sizeof(dfloat));
   DWmatrix1D(mesh->N, mesh->D, mesh->DW);
 
-  mesh->interpRaise = (dfloat * ) calloc(mesh->Nq*(mesh->Nq+1),sizeof(dfloat));
-  mesh->interpLower = (dfloat * ) calloc((mesh->Nq-1)*(mesh->Nq),sizeof(dfloat));
-  DegreeRaiseMatrix1D(mesh->N, mesh->N+1, mesh->interpRaise);
-  DegreeRaiseMatrix1D(mesh->N-1, mesh->N, mesh->interpLower);
+  mesh->interpRaise = (dfloat* ) calloc(mesh->Nq * (mesh->Nq + 1),sizeof(dfloat));
+  mesh->interpLower = (dfloat* ) calloc((mesh->Nq - 1) * (mesh->Nq),sizeof(dfloat));
+  DegreeRaiseMatrix1D(mesh->N, mesh->N + 1, mesh->interpRaise);
+  DegreeRaiseMatrix1D(mesh->N - 1, mesh->N, mesh->interpLower);
 
   mesh->cubNfp = mesh->cubNq * mesh->cubNq;
   mesh->cubNp = mesh->cubNq * mesh->cubNq * mesh->cubNq;
   // cubN+1 point Gauss-Legendre quadrature
-  mesh->cubr = (dfloat *) malloc(mesh->cubNq*sizeof(dfloat));
-  mesh->cubw = (dfloat *) malloc(mesh->cubNq*sizeof(dfloat));
-  JacobiGLL(mesh->cubNq-1, mesh->cubr, mesh->cubw);
+  mesh->cubr = (dfloat*) malloc(mesh->cubNq * sizeof(dfloat));
+  mesh->cubw = (dfloat*) malloc(mesh->cubNq * sizeof(dfloat));
+  JacobiGLL(mesh->cubNq - 1, mesh->cubr, mesh->cubw);
 
   mesh->cubInterp = (dfloat*) calloc(mesh->Nq * mesh->cubNq, sizeof(dfloat));
   InterpolationMatrix1D(mesh->N, mesh->Nq, mesh->r, mesh->cubNq, mesh->cubr, mesh->cubInterp); //uses the fact that r = gllz for 1:Nq
 
   //cubature project cubProject = cubInterp^T
-  mesh->cubProject = (dfloat*) calloc(mesh->cubNq*mesh->Nq, sizeof(dfloat));
+  mesh->cubProject = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
   matrixTranspose(mesh->cubNq, mesh->Nq, mesh->cubInterp, mesh->Nq, mesh->cubProject, mesh->cubNq);
 
   //cubature derivates matrix, cubD: differentiate on cubature nodes
-  mesh->cubD = (dfloat *) malloc(mesh->cubNq*mesh->cubNq*sizeof(dfloat));
-  Dmatrix1D(mesh->cubNq-1, mesh->cubNq, mesh->cubr, mesh->cubNq, mesh->cubr, mesh->cubD);
+  mesh->cubD = (dfloat*) malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat));
+  Dmatrix1D(mesh->cubNq - 1, mesh->cubNq, mesh->cubr, mesh->cubNq, mesh->cubr, mesh->cubD);
   // weak cubature derivative = cubD^T
-  mesh->cubDW  = (dfloat*) calloc(mesh->cubNq*mesh->cubNq, sizeof(dfloat));
-  for(int i = 0 ; i < mesh->cubNq; ++i){
-    for(int j = 0 ; j < mesh->cubNq; ++j){
-      mesh->cubDW[j+i*mesh->cubNq] = mesh->cubD[i+j*mesh->cubNq];
-    }
-  }
+  mesh->cubDW  = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
+  for(int i = 0; i < mesh->cubNq; ++i)
+    for(int j = 0; j < mesh->cubNq; ++j)
+      mesh->cubDW[j + i * mesh->cubNq] = mesh->cubD[i + j * mesh->cubNq];
 
   mesh->intNfp = 0;
   mesh->intLIFT = NULL;
diff --git a/src/mesh/meshNekReader.cpp b/src/mesh/meshNekReader.cpp
index 7099798fb..ffe3ffc0b 100644
--- a/src/mesh/meshNekReader.cpp
+++ b/src/mesh/meshNekReader.cpp
@@ -6,7 +6,7 @@
 #include "nrs.hpp"
 #include "nekInterfaceAdapter.hpp"
 
-void meshNekReaderHex3D(int N, mesh_t* mesh, int isMeshT)
+void meshNekReaderHex3D(int N, mesh_t* mesh)
 {
   mesh->dim = nekData.ndim;
   if(mesh->dim != 3) {
@@ -23,7 +23,7 @@ void meshNekReaderHex3D(int N, mesh_t* mesh, int isMeshT)
   mesh->Nfaces = 2 * mesh->dim;
   mesh->NfaceVertices = 4;
   mesh->Nelements = nekData.nelt;
-  if(!isMeshT) mesh->Nelements = nekData.nelv;
+  if(!mesh->cht) mesh->Nelements = nekData.nelv;
 
   const int faceVertices[6][4] = {{0,1,2,3},{0,1,5,4},{1,2,6,5},
     {2,3,7,6},{3,0,4,7},{4,5,6,7}};
@@ -34,7 +34,7 @@ void meshNekReaderHex3D(int N, mesh_t* mesh, int isMeshT)
   const int vtxmap[8] = {0, 1, 3, 2, 4, 5, 7, 6};
 
   // build vertex numbering
-  mesh->Nnodes = nek_set_glo_num(2, isMeshT);
+  mesh->Nnodes = nek_set_glo_num(2, mesh->cht);
 
   mesh->EToV
     = (hlong*) calloc(mesh->Nelements * mesh->Nverts, sizeof(hlong));
@@ -45,7 +45,7 @@ void meshNekReaderHex3D(int N, mesh_t* mesh, int isMeshT)
   // find number of boundary faces
   int nbc = 0;
   int* bid = nekData.boundaryIDt;
-  if(!isMeshT) bid = nekData.boundaryID;
+  if(!mesh->cht) bid = nekData.boundaryID;
   for(int e = 0; e < mesh->Nelements; e++)
     for(int iface = 0; iface < mesh->Nfaces; iface++) {
       if(*bid) nbc++;
@@ -65,14 +65,14 @@ void meshNekReaderHex3D(int N, mesh_t* mesh, int isMeshT)
                 MPI_SUM, mesh->comm);
   if(mesh->rank == 0) {
     int n = nekData.NboundaryIDt;
-    if(!isMeshT) n = nekData.NboundaryID;
+    if(!mesh->cht) n = nekData.NboundaryID;
     printf("NboundaryIDs: %d\n", n);
     printf("NboundaryFaces: %d\n", mesh->NboundaryFaces);
   }
 
   int cnt = 0;
   bid = nekData.boundaryIDt;
-  if(!isMeshT) bid = nekData.boundaryID;
+  if(!mesh->cht) bid = nekData.boundaryID;
   int* eface1 = nekData.eface1;
   int* icface = nekData.icface;
   const dlong Nfp = nekData.nx1 * nekData.nx1;
diff --git a/src/mesh/meshNekReader.hpp b/src/mesh/meshNekReader.hpp
index 808a65830..4941f3d3d 100644
--- a/src/mesh/meshNekReader.hpp
+++ b/src/mesh/meshNekReader.hpp
@@ -1,7 +1,7 @@
 #if !defined(nekrs_meshnekreader_hpp_)
 #define nekrs_meshnekreader_hpp_
 
-#include "nekrs.hpp"
-void meshNekReaderHex3D(int N, mesh_t* mesh, int isMeshT);
+#include "nrs.hpp"
+void meshNekReaderHex3D(int N, mesh_t* mesh);
 
 #endif
diff --git a/src/mesh/meshOccaSetup3D.cpp b/src/mesh/meshOccaSetup3D.cpp
new file mode 100644
index 000000000..cca00d7a9
--- /dev/null
+++ b/src/mesh/meshOccaSetup3D.cpp
@@ -0,0 +1,420 @@
+/*
+
+   The MIT License (MIT)
+
+   Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include  "mpi.h"
+
+#include "mesh3D.h"
+
+void reportMemoryUsage(occa::device &device, const char* mess) // prints "<mess>: bytes allocated = <count>" to stdout using device.memoryAllocated()
+{
+  size_t bytes = device.memoryAllocated();
+
+  printf("%s: bytes allocated = %zu\n", mess, bytes); // %zu is the conversion for size_t; %lu is only correct where size_t is unsigned long
+}
+
+void meshOccaPopulateDevice3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo)
+{
+  // find elements that have all neighbors on this process
+  dlong* internalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
+  dlong* notInternalElementIds = (dlong*) calloc(mesh->Nelements, sizeof(dlong));
+
+  dlong Ninterior = 0, NnotInterior = 0;
+  for(dlong e = 0; e < mesh->Nelements; ++e) {
+    int flag = 0;
+    for(int f = 0; f < mesh->Nfaces; ++f)
+      if(mesh->EToP[e * mesh->Nfaces + f] != -1)
+        flag = 1;
+    if(!flag)
+      internalElementIds[Ninterior++] = e;
+    else
+      notInternalElementIds[NnotInterior++] = e;
+  }
+
+  //  printf("NinteriorElements = %d, NnotInternalElements = %d\n", Ninterior, NnotInterior);
+
+  mesh->NinternalElements = Ninterior;
+  mesh->NnotInternalElements = NnotInterior;
+  if(Ninterior)
+    mesh->o_internalElementIds    = mesh->device.malloc(Ninterior * sizeof(dlong),
+                                                        internalElementIds);
+
+  if(NnotInterior > 0)
+    mesh->o_notInternalElementIds = mesh->device.malloc(NnotInterior * sizeof(dlong),
+                                                        notInternalElementIds);
+
+  if (mesh->Nverts == 8) {    // hardcoded for hexes
+    //lumped mass matrix
+    mesh->MM = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat));
+    for (int k = 0; k < mesh->Nq; k++)
+      for (int j = 0; j < mesh->Nq; j++)
+        for (int i = 0; i < mesh->Nq; i++) {
+          int n = i + j * mesh->Nq + k * mesh->Nq * mesh->Nq;
+          mesh->MM[n + n * mesh->Np] = mesh->gllw[i] * mesh->gllw[j] * mesh->gllw[k];
+        }
+
+    mesh->LIFT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
+
+    dfloat* cubDWT = (dfloat*) calloc(mesh->cubNq * mesh->cubNq, sizeof(dfloat));
+    dfloat* cubProjectT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
+    dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat));
+    for(int n = 0; n < mesh->Nq; ++n)
+      for(int m = 0; m < mesh->cubNq; ++m) {
+        cubProjectT[n + m * mesh->Nq] = mesh->cubProject[n * mesh->cubNq + m];
+        cubInterpT[m + n * mesh->cubNq] = mesh->cubInterp[m * mesh->Nq + n];
+      }
+    for(int n = 0; n < mesh->cubNq; ++n)
+      for(int m = 0; m < mesh->cubNq; ++m)
+        cubDWT[n + m * mesh->cubNq] = mesh->cubDW[n * mesh->cubNq + m];
+
+    dfloat* LIFTT = (dfloat*) calloc(mesh->Np * mesh->Nfaces * mesh->Nfp, sizeof(dfloat));
+
+    mesh->o_LIFTT =
+      mesh->device.malloc(1 * sizeof(dfloat)); // dummy
+
+    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: before intX ");
+
+    mesh->intx = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp, sizeof(dfloat));
+    mesh->inty = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp, sizeof(dfloat));
+    mesh->intz = (dfloat*) calloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp, sizeof(dfloat));
+
+    dfloat* ix = (dfloat*) calloc(mesh->cubNq * mesh->Nq,sizeof(dfloat));
+    dfloat* iy = (dfloat*) calloc(mesh->cubNq * mesh->Nq,sizeof(dfloat));
+    dfloat* iz = (dfloat*) calloc(mesh->cubNq * mesh->Nq,sizeof(dfloat));
+    for(dlong e = 0; e < mesh->Nelements; ++e)
+      for(int f = 0; f < mesh->Nfaces; ++f) {
+        //interpolate in i
+        for(int ny = 0; ny < mesh->Nq; ++ny)
+          for(int nx = 0; nx < mesh->cubNq; ++nx) {
+            ix[nx + mesh->cubNq * ny] = 0;
+            iy[nx + mesh->cubNq * ny] = 0;
+            iz[nx + mesh->cubNq * ny] = 0;
+
+            for(int m = 0; m < mesh->Nq; ++m) {
+              dlong vid = m + ny * mesh->Nq + f * mesh->Nfp + e * mesh->Nfp * mesh->Nfaces;
+              dlong idM = mesh->vmapM[vid];
+
+              dfloat xm = mesh->x[idM];
+              dfloat ym = mesh->y[idM];
+              dfloat zm = mesh->z[idM];
+
+              dfloat Inm = mesh->cubInterp[m + nx * mesh->Nq];
+              ix[nx + mesh->cubNq * ny] += Inm * xm;
+              iy[nx + mesh->cubNq * ny] += Inm * ym;
+              iz[nx + mesh->cubNq * ny] += Inm * zm;
+            }
+          }
+
+        //interpolate in j and store
+        for(int ny = 0; ny < mesh->cubNq; ++ny)
+          for(int nx = 0; nx < mesh->cubNq; ++nx) {
+            dfloat x = 0.0, y = 0.0, z = 0.0;
+
+            for(int m = 0; m < mesh->Nq; ++m) {
+              dfloat xm = ix[nx + m * mesh->cubNq];
+              dfloat ym = iy[nx + m * mesh->cubNq];
+              dfloat zm = iz[nx + m * mesh->cubNq];
+
+              dfloat Inm = mesh->cubInterp[m + ny * mesh->Nq];
+              x += Inm * xm;
+              y += Inm * ym;
+              z += Inm * zm;
+            }
+
+            dlong id = nx + ny * mesh->cubNq + f * mesh->cubNfp + e * mesh->Nfaces * mesh->cubNfp;
+            mesh->intx[id] = x;
+            mesh->inty[id] = y;
+            mesh->intz[id] = z;
+          }
+      }
+    free(ix);
+    free(iy);
+    free(iz);
+
+    mesh->LMM = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
+    mesh->o_LMM =
+      mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat));
+    mesh->invLMM = (dfloat*) calloc(mesh->Nelements * mesh->Np, sizeof(dfloat));
+    mesh->o_invLMM =
+      mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat));
+
+    mesh->o_MM =
+      mesh->device.malloc(mesh->Np * mesh->Np * sizeof(dfloat),
+                          mesh->MM); //dummy
+
+    mesh->o_D = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D);
+
+    mesh->o_DW = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->DW);
+
+    mesh->o_Dmatrices = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D);
+
+    dfloat* DT = (dfloat*) calloc(mesh->Nq * mesh->Nq,sizeof(dfloat));
+    for(int j = 0; j < mesh->Nq; ++j)
+      for(int i = 0; i < mesh->Nq; ++i)
+        DT[i * mesh->Nq + j] = mesh->D[j * mesh->Nq + i];
+
+    mesh->o_Smatrices = mesh->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), DT); //dummy
+
+    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: before geofactors ");
+
+    mesh->o_vgeo =
+      mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nvgeo * sizeof(dfloat),
+                          mesh->vgeo);
+
+    mesh->o_sgeo =
+      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nfp * mesh->Nsgeo * sizeof(dfloat),
+                          mesh->sgeo);
+
+    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: before vgeo,sgeo ");
+
+    mesh->o_ggeo =
+      mesh->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(dfloat),
+                          mesh->ggeo);
+
+    mesh->o_cubvgeo =
+      mesh->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp * sizeof(dfloat),
+                          mesh->cubvgeo);
+
+    mesh->o_cubsgeo =
+      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * mesh->Nsgeo *
+                          sizeof(dfloat),
+                          mesh->cubsgeo);
+
+    mesh->o_cubggeo =
+      mesh->device.malloc(mesh->Nelements * mesh->Nggeo * mesh->cubNp * sizeof(dfloat),
+                          mesh->cubggeo);
+
+    mesh->o_cubInterpT =
+      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
+                          cubInterpT);
+
+    mesh->o_cubProjectT =
+      mesh->device.malloc(mesh->Nq * mesh->cubNq * sizeof(dfloat),
+                          cubProjectT);
+
+    mesh->o_cubDWT =
+      mesh->device.malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat),
+                          cubDWT);
+
+    mesh->o_cubD =
+      mesh->device.malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat),
+                          mesh->cubD);
+
+    mesh->o_cubDWmatrices = mesh->device.malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat), cubDWT);
+
+    // just needed to combine quad and hex cub kernels
+    mesh->o_cubDiffInterpT = mesh->o_cubDWmatrices;
+
+    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: after geofactors ");
+
+    mesh->o_intx =
+      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * sizeof(dfloat),
+                          mesh->intx);
+
+    mesh->o_inty =
+      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * sizeof(dfloat),
+                          mesh->inty);
+
+    mesh->o_intz =
+      mesh->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->cubNfp * sizeof(dfloat),
+                          mesh->intz);
+
+    mesh->o_intInterpT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
+    mesh->o_intInterpT.copyFrom(mesh->o_cubInterpT);
+
+    mesh->o_intLIFTT = mesh->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat));
+    mesh->o_intLIFTT.copyFrom(mesh->o_cubProjectT);
+
+    //    reportMemoryUsage(mesh->device, "meshOccaSetup3D: after intX ");
+  } else {
+    printf("Nverts = %d: unknown element type!\n",mesh->Nverts);
+  }
+
+  mesh->o_vmapM =
+    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
+                        mesh->vmapM);
+
+  mesh->o_vmapP =
+    mesh->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong),
+                        mesh->vmapP);
+
+  mesh->o_EToB =
+    mesh->device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),
+                        mesh->EToB);
+
+  mesh->o_x =
+    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat), mesh->x);
+
+  mesh->o_y =
+    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat), mesh->y);
+
+  mesh->o_z =
+    mesh->device.malloc(mesh->Nelements * mesh->Np * sizeof(dfloat), mesh->z);
+
+  if(mesh->totalHaloPairs > 0) {
+    // copy halo element list to DEVICE
+    mesh->o_haloElementList =
+      mesh->device.malloc(mesh->totalHaloPairs * sizeof(dlong), mesh->haloElementList);
+
+    // temporary DEVICE buffer for halo (maximum size Nfields*Np for dfloat)
+    //printf("mesh->Nfields = %d\n", mesh->Nfields);
+    mesh->o_haloBuffer =
+      mesh->device.malloc(mesh->totalHaloPairs * mesh->Np * mesh->Nfields * sizeof(dfloat));
+
+    mesh->o_haloGetNodeIds =
+      mesh->device.malloc(mesh->Nfp * mesh->totalHaloPairs * sizeof(dlong), mesh->haloGetNodeIds);
+
+    mesh->o_haloPutNodeIds =
+      mesh->device.malloc(mesh->Nfp * mesh->totalHaloPairs * sizeof(dlong), mesh->haloPutNodeIds);
+  }
+
+  kernelInfo["defines/" "p_dim"] = 3;
+  kernelInfo["defines/" "p_Nfields"] = mesh->Nfields;
+  kernelInfo["defines/" "p_N"] = mesh->N;
+  kernelInfo["defines/" "p_Nq"] = mesh->N + 1;
+  kernelInfo["defines/" "p_Np"] = mesh->Np;
+  kernelInfo["defines/" "p_Nfp"] = mesh->Nfp;
+  kernelInfo["defines/" "p_Nfaces"] = mesh->Nfaces;
+  kernelInfo["defines/" "p_NfacesNfp"] = mesh->Nfp * mesh->Nfaces;
+  kernelInfo["defines/" "p_Nvgeo"] = mesh->Nvgeo;
+  kernelInfo["defines/" "p_Nsgeo"] = mesh->Nsgeo;
+  kernelInfo["defines/" "p_Nggeo"] = mesh->Nggeo;
+
+  kernelInfo["defines/" "p_max_EL_nnz"] = mesh->max_EL_nnz; // for Bernstein Bezier lift
+
+  kernelInfo["defines/" "p_NXID"] = NXID;
+  kernelInfo["defines/" "p_NYID"] = NYID;
+  kernelInfo["defines/" "p_NZID"] = NZID;
+  kernelInfo["defines/" "p_SJID"] = SJID;
+  kernelInfo["defines/" "p_IJID"] = IJID;
+  kernelInfo["defines/" "p_IHID"] = IHID;
+  kernelInfo["defines/" "p_WSJID"] = WSJID;
+  kernelInfo["defines/" "p_WIJID"] = WIJID;
+  kernelInfo["defines/" "p_STXID"] = STXID;
+  kernelInfo["defines/" "p_STYID"] = STYID;
+  kernelInfo["defines/" "p_STZID"] = STZID;
+  kernelInfo["defines/" "p_SBXID"] = SBXID;
+  kernelInfo["defines/" "p_SBYID"] = SBYID;
+  kernelInfo["defines/" "p_SBZID"] = SBZID;
+
+  int maxNodes = mymax(mesh->Np, (mesh->Nfp * mesh->Nfaces));
+  kernelInfo["defines/" "p_maxNodes"] = maxNodes;
+
+  kernelInfo["defines/" "p_Lambda2"] = 0.5f;
+
+  kernelInfo["defines/" "p_cubNq"] = mesh->cubNq;
+  kernelInfo["defines/" "p_cubNfp"] = mesh->cubNfp;
+  kernelInfo["defines/" "p_cubNp"] = mesh->cubNp;
+  kernelInfo["defines/" "p_intNfp"] = mesh->intNfp;
+  kernelInfo["defines/" "p_intNfpNfaces"] = mesh->intNfp * mesh->Nfaces;
+
+  if(sizeof(dfloat) == 4) {
+    kernelInfo["defines/" "dfloat"] = "float";
+    kernelInfo["defines/" "dfloat4"] = "float4";
+    kernelInfo["defines/" "dfloat8"] = "float8";
+  }
+  if(sizeof(dfloat) == 8) {
+    kernelInfo["defines/" "dfloat"] = "double";
+    kernelInfo["defines/" "dfloat4"] = "double4";
+    kernelInfo["defines/" "dfloat8"] = "double8";
+  }
+
+  if(sizeof(dlong) == 4)
+    kernelInfo["defines/" "dlong"] = "int";
+  if(sizeof(dlong) == 8)
+    kernelInfo["defines/" "dlong"] = "long long int";
+
+  if(mesh->device.mode() == "CUDA") { // add backend compiler optimization for CUDA
+    kernelInfo["compiler_flags"] += "--ftz=true ";
+    kernelInfo["compiler_flags"] += "--prec-div=false ";
+    kernelInfo["compiler_flags"] += "--prec-sqrt=false ";
+    kernelInfo["compiler_flags"] += "--use_fast_math ";
+    kernelInfo["compiler_flags"] += "--fmad=true "; // compiler option for cuda
+    //kernelInfo["compiler_flags"] += "-Xptxas -dlcm=ca";
+  }
+
+  if(mesh->device.mode() == "OpenCL") { // add backend compiler optimization for OPENCL
+    kernelInfo["compiler_flags"] += " -cl-std=CL2.0 ";
+    kernelInfo["compiler_flags"] += " -cl-strict-aliasing ";
+    kernelInfo["compiler_flags"] += " -cl-mad-enable ";
+    kernelInfo["compiler_flags"] += " -cl-no-signed-zeros ";
+    kernelInfo["compiler_flags"] += " -cl-unsafe-math-optimizations ";
+    kernelInfo["compiler_flags"] += " -cl-fast-relaxed-math ";
+  }
+
+  if(mesh->device.mode() == "HIP") { // add backend compiler optimization for HIP
+    kernelInfo["compiler_flags"] += " -O3 ";
+    kernelInfo["compiler_flags"] += " -ffp-contract=fast ";
+    // kernelInfo["compiler_flags"] += " -funsafe-math-optimizations ";
+    // kernelInfo["compiler_flags"] += " -ffast-math ";
+  }
+
+  kernelInfo["defines/" "p_G00ID"] = G00ID;
+  kernelInfo["defines/" "p_G01ID"] = G01ID;
+  kernelInfo["defines/" "p_G02ID"] = G02ID;
+  kernelInfo["defines/" "p_G11ID"] = G11ID;
+  kernelInfo["defines/" "p_G12ID"] = G12ID;
+  kernelInfo["defines/" "p_G22ID"] = G22ID;
+  kernelInfo["defines/" "p_GWJID"] = GWJID;
+
+  kernelInfo["defines/" "p_RXID"] = RXID;
+  kernelInfo["defines/" "p_SXID"] = SXID;
+  kernelInfo["defines/" "p_TXID"] = TXID;
+
+  kernelInfo["defines/" "p_RYID"] = RYID;
+  kernelInfo["defines/" "p_SYID"] = SYID;
+  kernelInfo["defines/" "p_TYID"] = TYID;
+
+  kernelInfo["defines/" "p_RZID"] = RZID;
+  kernelInfo["defines/" "p_SZID"] = SZID;
+  kernelInfo["defines/" "p_TZID"] = TZID;
+
+  kernelInfo["defines/" "p_JID"] = JID;
+  kernelInfo["defines/" "p_JWID"] = JWID;
+  kernelInfo["defines/" "p_IJWID"] = IJWID;
+}
+
+void meshOccaSetup3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo)
+{
+  // mesh->device must already be set; make separate streams on it (e.g. for halo exchange)
+  mesh->defaultStream = mesh->device.getStream();    // stream the device reports on entry
+  mesh->dataStream = mesh->device.createStream();    // additional stream
+  mesh->computeStream = mesh->device.createStream(); // additional stream
+  mesh->device.setStream(mesh->defaultStream);       // select the default stream again before populating
+
+  meshOccaPopulateDevice3D(mesh, newOptions, kernelInfo); // all remaining setup is delegated
+}
+
+void meshOccaCloneDevice(mesh_t* donorMesh, mesh_t* mesh)
+{
+  mesh->device = donorMesh->device; // take the device from donorMesh instead of setting one up
+
+  mesh->defaultStream = donorMesh->defaultStream; // the three stream members are taken from the
+  mesh->dataStream = donorMesh->dataStream;       // donor as well; unlike meshOccaSetup3D, no
+  mesh->computeStream = donorMesh->computeStream; // createStream() call is made here
+}
diff --git a/src/mesh/meshParallelConnectNodes.cpp b/src/mesh/meshParallelConnectNodes.cpp
index af02aacf9..5eb5a9117 100644
--- a/src/mesh/meshParallelConnectNodes.cpp
+++ b/src/mesh/meshParallelConnectNodes.cpp
@@ -12,7 +12,7 @@ typedef struct
 }parallelNode_t;
 
 // uniquely label each node with a global index, used for gatherScatter
-void meshNekParallelConnectNodes(mesh_t* mesh, int isTmesh)
+void meshNekParallelConnectNodes(mesh_t* mesh)
 {
   int rank, size;
   rank = mesh->rank;
@@ -21,16 +21,16 @@ void meshNekParallelConnectNodes(mesh_t* mesh, int isTmesh)
   dlong localNodeCount = mesh->Np * mesh->Nelements;
 
   mesh->globalIds = (hlong*) calloc(localNodeCount, sizeof(hlong));
-  hlong ngv = nek_set_glo_num(mesh->N + 1, isTmesh);
+  hlong ngv = nek_set_glo_num(mesh->N + 1, mesh->cht);
   for(dlong id = 0; id < localNodeCount; ++id)
     mesh->globalIds[id] = nekData.glo_num[id];
 }
 
-void meshParallelConnectNodes(mesh_t* mesh, int isTmesh, int nrsBuildOnly)
+void meshParallelConnectNodes(mesh_t* mesh, int nrsBuildOnly)
 {
   if(!nrsBuildOnly) {
     // hotfix as libP version seems to be broken
-    meshNekParallelConnectNodes(mesh, isTmesh);
+    meshNekParallelConnectNodes(mesh);
     return;
   }
 
diff --git a/src/libP/src/meshParallelConnectOpt.c b/src/mesh/meshParallelConnectOpt.cpp
similarity index 100%
rename from src/libP/src/meshParallelConnectOpt.c
rename to src/mesh/meshParallelConnectOpt.cpp
diff --git a/src/libP/src/meshParallelConsecutiveGlobalNumbering.c b/src/mesh/meshParallelConsecutiveGlobalNumbering.cpp
similarity index 100%
rename from src/libP/src/meshParallelConsecutiveGlobalNumbering.c
rename to src/mesh/meshParallelConsecutiveGlobalNumbering.cpp
diff --git a/src/libP/src/meshParallelGatherScatter.c b/src/mesh/meshParallelGatherScatter.cpp
similarity index 100%
rename from src/libP/src/meshParallelGatherScatter.c
rename to src/mesh/meshParallelGatherScatter.cpp
diff --git a/src/libP/src/meshParallelGatherScatterSetup.c b/src/mesh/meshParallelGatherScatterSetup.cpp
similarity index 100%
rename from src/libP/src/meshParallelGatherScatterSetup.c
rename to src/mesh/meshParallelGatherScatterSetup.cpp
diff --git a/src/mesh/meshPhysicalNodesHex3D.cpp b/src/mesh/meshPhysicalNodesHex3D.cpp
index 15efef671..ff36ab0dd 100644
--- a/src/mesh/meshPhysicalNodesHex3D.cpp
+++ b/src/mesh/meshPhysicalNodesHex3D.cpp
@@ -27,7 +27,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "libParanumal.hpp"
+#include "nrs.hpp"
 #include "nekInterfaceAdapter.hpp"
 
 void meshPhysicalBoxNodesHex3D(mesh3D* mesh)
diff --git a/src/mesh/meshSetup.cpp b/src/mesh/meshSetup.cpp
index 355125891..efdec918c 100644
--- a/src/mesh/meshSetup.cpp
+++ b/src/mesh/meshSetup.cpp
@@ -21,6 +21,7 @@ mesh_t* createMeshDummy(MPI_Comm comm,
   mesh->rank = rank;
   mesh->size = size;
 
+  mesh->cht = 0;
   mesh->Nfields = 1;
   mesh->dim = 3;
   mesh->Nverts = 8; // number of vertices per element
@@ -132,10 +133,10 @@ mesh_t* createMeshDummy(MPI_Comm comm,
   mesh->boundaryInfo = NULL; // no boundaries
 
   // connect elements using parallel sort
-  libParanumal::meshParallelConnect(mesh);
+  meshParallelConnect(mesh);
 
   // load reference (r,s,t) element nodes
-  libParanumal::meshLoadReferenceNodesHex3D(mesh, N, cubN);
+  meshLoadReferenceNodesHex3D(mesh, N, cubN);
   if (mesh->rank == 0)
     printf("Nq: %d cubNq: %d \n", mesh->Nq, mesh->cubNq);
 
@@ -143,19 +144,19 @@ mesh_t* createMeshDummy(MPI_Comm comm,
   meshPhysicalNodesHex3D(mesh, 1);
 
   // compute geometric factors
-  libParanumal::meshGeometricFactorsHex3D(mesh);
+  meshGeometricFactorsHex3D(mesh);
 
   // set up halo exchange info for MPI (do before connect face nodes)
-  libParanumal::meshHaloSetup(mesh);
+  meshHaloSetup(mesh);
 
   // connect face nodes (find trace indices)
   meshConnectPeriodicFaceNodes3D(mesh,XMAX - XMIN,YMAX - YMIN,ZMAX - ZMIN);
 
   // compute surface geofacs (including halo)
-  libParanumal::meshSurfaceGeometricFactorsHex3D(mesh);
+  meshSurfaceGeometricFactorsHex3D(mesh);
 
   // global nodes
-  meshParallelConnectNodes(mesh, 0, 1);
+  meshParallelConnectNodes(mesh, 1);
 
   mesh->device = device;
   meshOccaSetup3D(mesh, options, kernelInfo);
@@ -163,13 +164,13 @@ mesh_t* createMeshDummy(MPI_Comm comm,
   return mesh;
 }
 
-mesh_t* createMeshT(MPI_Comm comm,
-                    int N,
-                    int cubN,
-                    int isMeshT,
-                    setupAide &options,
-                    occa::device device,
-                    occa::properties& kernelInfo)
+mesh_t* createMesh(MPI_Comm comm,
+                   int N,
+                   int cubN,
+                   int isMeshT,
+                   setupAide &options,
+                   occa::device device,
+                   occa::properties& kernelInfo)
 {
   mesh_t* mesh = new mesh_t[1];
 
@@ -180,20 +181,21 @@ mesh_t* createMeshT(MPI_Comm comm,
   mesh->comm = comm;
   mesh->rank = rank;
   mesh->size = size;
+  mesh->cht  = isMeshT;
 
   // get mesh from nek
-  meshNekReaderHex3D(N, mesh, isMeshT);
+  meshNekReaderHex3D(N, mesh);
 
   mesh->Nfields = 1; // TW: note this is a temporary patch (halo exchange depends on nfields)
 
   // connect elements using parallel sort
-  libParanumal::meshParallelConnect(mesh);
+  meshParallelConnect(mesh);
 
   // connect elements to boundary faces
-  libParanumal::meshConnectBoundary(mesh);
+  meshConnectBoundary(mesh);
 
   // load reference (r,s,t) element nodes
-  libParanumal::meshLoadReferenceNodesHex3D(mesh, N, cubN);
+  meshLoadReferenceNodesHex3D(mesh, N, cubN);
   if (mesh->rank == 0)
     printf("Nq: %d cubNq: %d \n", mesh->Nq, mesh->cubNq);
 
@@ -201,21 +203,21 @@ mesh_t* createMeshT(MPI_Comm comm,
   meshPhysicalNodesHex3D(mesh, 0);
 
   // compute geometric factors
-  libParanumal::meshGeometricFactorsHex3D(mesh);
+  meshGeometricFactorsHex3D(mesh);
 
   // set up halo exchange info for MPI (do before connect face nodes)
-  libParanumal::meshHaloSetup(mesh);
+  meshHaloSetup(mesh);
 
   // connect face nodes (find trace indices)
-  libParanumal::meshConnectFaceNodes3D(mesh);
+  meshConnectFaceNodes3D(mesh);
 
   // compute surface geofacs (including halo)
-  libParanumal::meshSurfaceGeometricFactorsHex3D(mesh);
+  meshSurfaceGeometricFactorsHex3D(mesh);
 
   // global nodes
-  meshParallelConnectNodes(mesh, 1, 0);
+  meshParallelConnectNodes(mesh, 0);
 
-  bcMap::check(mesh, isMeshT);
+  bcMap::check(mesh);
 
   mesh->device = device;
   meshOccaSetup3D(mesh, options, kernelInfo);
@@ -239,30 +241,27 @@ mesh_t* createMeshV(MPI_Comm comm,
 
   // shallow copy
   memcpy(mesh, meshT, sizeof(*meshT));
+  mesh->cht = 0;
 
   // find EToV and boundaryInfo
-  meshNekReaderHex3D(N, mesh, 0);
+  meshNekReaderHex3D(N, mesh);
   free(mesh->elementInfo);
   mesh->elementInfo = meshT->elementInfo;
 
   mesh->Nfields = 1; // temporary patch (halo exchange depends on nfields)
 
   // find mesh->EToP, mesh->EToE and mesh->EToF, required mesh->EToV
-  libParanumal::meshParallelConnect(mesh);
+  meshParallelConnect(mesh);
 
   // find mesh->EToB, required mesh->EToV and mesh->boundaryInfo
-  libParanumal::meshConnectBoundary(mesh);
-
-  // load reference (r,s,t) element nodes
-  //libParanumal::meshLoadReferenceNodesHex3D(mesh, N);
+  meshConnectBoundary(mesh);
 
   // compute physical (x,y) locations of the element nodes
-  // mesh->x ...
   meshPhysicalNodesHex3D(mesh, 0);
 
   // compute geometric factors
-  // note: we only need vgeo because elliptic performs helo change
-  libParanumal::meshGeometricFactorsHex3D(mesh);
+  meshGeometricFactorsHex3D(mesh);
+
   free(mesh->cubvgeo);
   mesh->cubvgeo = meshT->cubvgeo;
   free(mesh->ggeo);
@@ -272,21 +271,17 @@ mesh_t* createMeshV(MPI_Comm comm,
 
   // set up halo exchange info for MPI (do before connect face nodes)
   // note: realloc mesh->X and mesh->EX ...
-  libParanumal::meshHaloSetup(mesh);
+  meshHaloSetup(mesh);
 
   // connect face nodes (find trace indices)
   // find vmapM, vmapP, mapP based on EToE and EToF
-  libParanumal::meshConnectFaceNodes3D(mesh);
-
-  // compute surface geofacs
-  // assumption: no halo exchange required!
-  //libParanumal::meshSurfaceGeometricFactorsHex3D(mesh);
+  meshConnectFaceNodes3D(mesh);
 
   // uniquely label each node with a global index, used for gatherScatter
   // mesh->globalIds
-  meshParallelConnectNodes(mesh, 0, 1);
+  meshParallelConnectNodes(mesh, 0);
 
-  bcMap::check(mesh, 0);
+  bcMap::check(mesh);
   meshVOccaSetup3D(mesh, options, kernelInfo);
 
   return mesh;
diff --git a/src/mesh/meshSetup.hpp b/src/mesh/meshSetup.hpp
index c0ff1bf04..7360de20d 100644
--- a/src/mesh/meshSetup.hpp
+++ b/src/mesh/meshSetup.hpp
@@ -1,20 +1,22 @@
 #if !defined(nekrs_meshsetup_hpp_)
 #define nekrs_meshsetup_hpp_
 
-#include "nekrs.hpp"
+#include "nrs.hpp"
 mesh_t* createMeshDummy(MPI_Comm comm,
                         int N,
                         int cubN,
                         setupAide &options,
                         occa::device device,
                         occa::properties &kernelInfo);
-mesh_t* createMeshT(MPI_Comm comm,
-                    int N,
-                    int cubN,
-                    int isMeshT,
-                    setupAide &options,
-                    occa::device device,
-                    occa::properties &kernelInfo);
+
+mesh_t* createMesh(MPI_Comm comm,
+                   int N,
+                   int cubN,
+                   int isMeshT,
+                   setupAide &options,
+                   occa::device device,
+                   occa::properties &kernelInfo);
+
 mesh_t* createMeshV(MPI_Comm comm,
                     int N,
                     int cubN,
diff --git a/src/libP/src/meshSurfaceGeometricFactorsHex3D.c b/src/mesh/meshSurfaceGeometricFactorsHex3D.cpp
similarity index 99%
rename from src/libP/src/meshSurfaceGeometricFactorsHex3D.c
rename to src/mesh/meshSurfaceGeometricFactorsHex3D.cpp
index 9ef098e4d..6a9390fd3 100644
--- a/src/libP/src/meshSurfaceGeometricFactorsHex3D.c
+++ b/src/mesh/meshSurfaceGeometricFactorsHex3D.cpp
@@ -426,7 +426,6 @@ void meshSurfaceGeometricFactorsHex3D(mesh3D* mesh)
       mesh->sgeo[baseP * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP);
     }
 
-
   free(xre);
   free(xse);
   free(xte);
diff --git a/src/nekInterface/nekInterface.f b/src/nekInterface/nekInterface.f
index 34268ab44..273daabfa 100644
--- a/src/nekInterface/nekInterface.f
+++ b/src/nekInterface/nekInterface.f
@@ -356,6 +356,52 @@ subroutine nekf_outfld(suffix)
       call copy(pm1,pr,nx1*ny1*nz1*nelv)
       call outfld(suffix)
 
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine nekf_storesol()
+c     Save vx,vy,vz,pr and the ldimt fields of t in /outtmp/ (nekf_restoresol copies them back)
+      include 'SIZE'
+      include 'TOTAL'
+      include 'NEKINTF'
+
+      parameter(ltot=lx1*ly1*lz1*lelt)
+      common /outtmp/  w1(ltot),w2(ltot),w3(ltot),wp(ltot)
+     &                ,wt(ltot,ldimt)
+
+      ntot1  = lx1*ly1*lz1*nelt
+
+      call copy(w1,vx,ntot1)
+      call copy(w2,vy,ntot1)
+      call copy(w3,vz,ntot1)
+      call copy(wp,pr,ntot1)
+      do i = 1,ldimt
+         call copy(wt(1,i),t(1,1,1,1,i),ntot1)
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine nekf_restoresol()
+c     Copy vx,vy,vz,pr and the ldimt fields of t back from /outtmp/ (filled by nekf_storesol)
+      include 'SIZE'
+      include 'TOTAL'
+      include 'NEKINTF'
+
+      parameter(ltot=lx1*ly1*lz1*lelt)
+      common /outtmp/  w1(ltot),w2(ltot),w3(ltot),wp(ltot)
+     &                ,wt(ltot,ldimt)
+
+      ntot1  = lx1*ly1*lz1*nelt
+
+      call copy(vx,w1,ntot1)
+      call copy(vy,w2,ntot1)
+      call copy(vz,w3,ntot1)
+      call copy(pr,wp,ntot1)
+      do i = 1,ldimt
+         call copy(t(1,1,1,1,i),wt(1,i),ntot1)
+      enddo
+
       return
       end
 c-----------------------------------------------------------------------
diff --git a/src/nekInterface/nekInterfaceAdapter.cpp b/src/nekInterface/nekInterfaceAdapter.cpp
index ea4fa08dc..d6de627cb 100644
--- a/src/nekInterface/nekInterfaceAdapter.cpp
+++ b/src/nekInterface/nekInterfaceAdapter.cpp
@@ -7,8 +7,7 @@
 nekdata_private nekData;
 static int rank;
 static setupAide* options;
-static ins_t** ins;
-static dfloat timeLast = -1;
+static nrs_t* nrs;
 
 static void (* usrdat_ptr)(void);
 static void (* usrdat2_ptr)(void);
@@ -44,6 +43,12 @@ static void (* nek_gen_bcmap_ptr)(void);
 static int (* nek_nbid_ptr)(int*);
 static long long (* nek_set_vert_ptr)(int*, int*);
 
+static void (* nek_setbd_ptr)(double *, double*, int*);
+static void (* nek_setabbd_ptr)(double *, double*, int*, int*);
+
+static void (* nek_storesol_ptr)(void);
+static void (* nek_restoresol_ptr)(void);
+
 void noop_func(void) {}
 
 void* nek_ptr(const char* id)
@@ -61,29 +66,13 @@ void* nek_scPtr(int id)
   return ptr;
 }
 
-void nek_outfld()
-{
-  timer::tic("checkpointing", 1);
-  const char suffix[] = "   ";
-  (*nek_outfld_ptr)((char*)suffix);
-  timer::toc("checkpointing");
-}
-
-void nek_outfld(const char* suffix)
-{
-  timer::tic("checkpointing", 1);
-  (*nek_outfld_ptr)((char*)suffix);
-  timer::toc("checkpointing");
-}
-
-void nek_outfld(const char* suffix, dfloat t, int coords,
-                occa::memory o_u, occa::memory o_p, occa::memory o_s,
-                int NSfields, int FP64)
+void nek_outfld(const char* suffix, dfloat t, int coords, int FP64,
+                occa::memory &o_u, occa::memory &o_p, occa::memory &o_s,
+                int NSfields)
 {
 
-  timer::tic("checkpointing", 1);
-  mesh_t* mesh = (*ins)->mesh;
-  cds_t* cds = (*ins)->cds;
+  mesh_t* mesh = nrs->mesh;
+  cds_t* cds = nrs->cds;
   dlong Nlocal = mesh->Nelements * mesh->Np;
 
   double time = t;
@@ -93,12 +82,19 @@ void nek_outfld(const char* suffix, dfloat t, int coords,
   int po = 0;
   int so = 0;
 
+  (*nek_storesol_ptr)();
+
+  timer::tic("checkpointing", 1);
-  if(coords)
+  if(coords) { // only copy the mesh coordinates to nek (and flag them via xo) when requested
+    nrs->mesh->o_x.copyTo(nekData.xm1, Nlocal * sizeof(dfloat));
+    nrs->mesh->o_y.copyTo(nekData.ym1, Nlocal * sizeof(dfloat));
+    nrs->mesh->o_z.copyTo(nekData.zm1, Nlocal * sizeof(dfloat));
     xo = 1;
+  }
   if(o_u.ptr()) {
-    occa::memory o_vx = o_u + 0 * (*ins)->fieldOffset * sizeof(dfloat);
-    occa::memory o_vy = o_u + 1 * (*ins)->fieldOffset * sizeof(dfloat);
-    occa::memory o_vz = o_u + 2 * (*ins)->fieldOffset * sizeof(dfloat);
+    occa::memory o_vx = o_u + 0 * nrs->fieldOffset * sizeof(dfloat);
+    occa::memory o_vy = o_u + 1 * nrs->fieldOffset * sizeof(dfloat);
+    occa::memory o_vz = o_u + 2 * nrs->fieldOffset * sizeof(dfloat);
     o_vx.copyTo(nekData.vx, Nlocal * sizeof(dfloat));
     o_vy.copyTo(nekData.vy, Nlocal * sizeof(dfloat));
     o_vz.copyTo(nekData.vz, Nlocal * sizeof(dfloat));
@@ -112,7 +108,7 @@ void nek_outfld(const char* suffix, dfloat t, int coords,
     const dlong nekFieldOffset = nekData.lelt * mesh->Np;
     for(int is = 0; is < NSfields; is++) {
       mesh_t* mesh;
-      (is) ? mesh = (*ins)->cds->meshV : mesh = (*ins)->cds->mesh;
+      (is) ? mesh = nrs->cds->meshV : mesh = nrs->cds->mesh;
       const dlong Nlocal = mesh->Nelements * mesh->Np;
       dfloat* Ti = nekData.t + is * nekFieldOffset;
       occa::memory o_Si = o_s + is * cds->fieldOffset * sizeof(dfloat);
@@ -124,7 +120,10 @@ void nek_outfld(const char* suffix, dfloat t, int coords,
   (*nek_setio_ptr)(&t, &xo, &vo, &po, &so, &NSfields, &FP64);
   (*nek_outfld_ptr)((char*)suffix);
   (*nek_resetio_ptr)();
+
   timer::toc("checkpointing");
+
+  (*nek_restoresol_ptr)();
 }
 
 void nek_uic(int ifield)
@@ -152,11 +151,6 @@ void nek_setic(void)
   }
 
   (*nek_setics_ptr)();
-
-  if (readRestartFile) {
-    double startTime = *(nekData.time);
-    options->setArgs("START TIME", to_string_f(startTime));
-  }
 }
 
 void nek_map_m_to_n(double* a, int na, double* b, int nb)
@@ -195,7 +189,7 @@ void nek_setics(void)
 
 void nek_userchk(void)
 {
-  if(rank == 0) printf("calling nek_userchk\n");
+  if(rank == 0) printf("calling nek_userchk ...\n");
   (*userchk_ptr)();
 }
 
@@ -292,6 +286,16 @@ void set_function_handles(const char* session_in,int verbose)
   nek_set_vert_ptr = (long long (*)(int*, int*))dlsym(handle,fname("nekf_set_vert"));
   check_error(dlerror());
 
+  nek_setbd_ptr = (void (*)(double *, double*, int*))dlsym(handle, fname("setbd"));
+  check_error(dlerror());
+  nek_setabbd_ptr = (void (*)(double *, double*, int*, int*))dlsym(handle, fname("setabbd"));
+  check_error(dlerror());
+
+  nek_storesol_ptr = (void (*)(void))dlsym(handle, fname("nekf_storesol"));
+  check_error(dlerror());
+  nek_restoresol_ptr = (void (*)(void))dlsym(handle, fname("nekf_restoresol"));
+  check_error(dlerror());
+
 #define postfix(x) x ## _ptr
 #define load_or_noop(s) \
   do { \
@@ -370,6 +374,8 @@ void mkSIZE(int lx1, int lxd, int lelt, int lelg, int ldim, int lpmin, int ldimt
       sprintf(line, "      parameter (lgmres=%d)\n", 1);
     else if(strstr(line, "parameter (lorder=") != NULL)
       sprintf(line, "      parameter (lorder=%d)\n", 1);
+    else if(strstr(line, "parameter (lhis=") != NULL)
+      sprintf(line, "      parameter (lhis=%d)\n", 100000);
     else if(strstr(line, "parameter (lelr=") != NULL)
       sprintf(line, "      parameter (lelr=%d)\n", 128 * lelt);
 
@@ -464,7 +470,6 @@ int buildNekInterface(const char* casename, int ldimt, int N, int np)
   retval = system(buf);
   if (retval) goto err;
 
-  // Copy Nek5000/core from install_dir to cache_dir
   sprintf(buf, "cp -pr %s %s", nek5000_dir, cache_dir); 
   retval = system(buf);
   if (retval) goto err;
@@ -503,10 +508,10 @@ void nek_gen_bcmap()
   (*nek_gen_bcmap_ptr)();
 }
 
-int nek_setup(MPI_Comm c, setupAide &options_in, ins_t** ins_in)
+int nek_setup(MPI_Comm c, setupAide &options_in, nrs_t* nrs_in)
 {
   options = &options_in;
-  ins = ins_in;
+  nrs = nrs_in;
   MPI_Comm_rank(c,&rank);
   MPI_Fint nek_comm = MPI_Comm_c2f(c);
 
@@ -640,6 +645,10 @@ int nek_setup(MPI_Comm c, setupAide &options_in, ins_t** ins_in)
   options->getArgs("SCALAR00 DIFFUSIVITY", diff);
   nekData.param[7] = diff;
 
+  dfloat startTime;
+  options->getArgs("START TIME", startTime);
+   *(nekData.time) = startTime; 
+
   return 0;
 }
 
@@ -650,42 +659,46 @@ void nek_copyFrom(dfloat time)
     fflush(stdout);
   }
 
-  timeLast = time;
-
-  mesh_t* mesh = (*ins)->mesh;
+  mesh_t* mesh = nrs->mesh;
   dlong Nlocal = mesh->Nelements * mesh->Np;
 
-  dfloat* vx = (*ins)->U + 0 * (*ins)->fieldOffset;
-  dfloat* vy = (*ins)->U + 1 * (*ins)->fieldOffset;
-  dfloat* vz = (*ins)->U + 2 * (*ins)->fieldOffset;
+  dfloat* vx = nrs->U + 0 * nrs->fieldOffset;
+  dfloat* vy = nrs->U + 1 * nrs->fieldOffset;
+  dfloat* vz = nrs->U + 2 * nrs->fieldOffset;
 
   *(nekData.time) = time;
 
   memcpy(nekData.vx, vx, sizeof(dfloat) * Nlocal);
   memcpy(nekData.vy, vy, sizeof(dfloat) * Nlocal);
   memcpy(nekData.vz, vz, sizeof(dfloat) * Nlocal);
-  memcpy(nekData.pr, (*ins)->P, sizeof(dfloat) * Nlocal);
-  if((*ins)->Nscalar) {
+  memcpy(nekData.pr, nrs->P, sizeof(dfloat) * Nlocal);
+  if(nrs->Nscalar) {
     const dlong nekFieldOffset = nekData.lelt * mesh->Np;
-    for(int is = 0; is < (*ins)->Nscalar; is++) {
+    for(int is = 0; is < nrs->Nscalar; is++) {
       mesh_t* mesh;
-      (is) ? mesh = (*ins)->cds->meshV : mesh = (*ins)->cds->mesh;
+      (is) ? mesh = nrs->cds->meshV : mesh = nrs->cds->mesh;
       const dlong Nlocal = mesh->Nelements * mesh->Np;
       dfloat* Ti = nekData.t   + is * nekFieldOffset;
-      dfloat* Si = (*ins)->cds->S + is * (*ins)->cds->fieldOffset;
+      dfloat* Si = nrs->cds->S + is * nrs->cds->fieldOffset;
       memcpy(Ti, Si, Nlocal * sizeof(dfloat));
     }
   }
 }
 
+void nek_ocopyFrom(void) // pull U, P (and S) from the device, then hand the host copies to nek
+{
+  nrs->o_U.copyTo(nrs->U); // device -> host
+  nrs->o_P.copyTo(nrs->P);
+  if(nrs->Nscalar) nrs->cds->o_S.copyTo(nrs->cds->S); // scalars only if there are any
+  nek_copyFrom(0.0); // NOTE(review): nek_copyFrom(time) stores its argument in *(nekData.time), so this sets it to 0 - confirm intended
+}
+
 void nek_ocopyFrom(dfloat time, int tstep)
 {
-  if(time != timeLast) {
-    (*ins)->o_U.copyTo((*ins)->U);
-    (*ins)->o_P.copyTo((*ins)->P);
-    if((*ins)->Nscalar) (*ins)->cds->o_S.copyTo((*ins)->cds->S);
-    nek_copyFrom(time, tstep);
-  }
+  nrs->o_U.copyTo(nrs->U);
+  nrs->o_P.copyTo(nrs->P);
+  if(nrs->Nscalar) nrs->cds->o_S.copyTo(nrs->cds->S);
+  nek_copyFrom(time, tstep);
 }
 
 void nek_copyFrom(dfloat time, int tstep)
@@ -697,9 +710,9 @@ void nek_copyFrom(dfloat time, int tstep)
 void nek_ocopyTo(dfloat &time)
 {
   nek_copyTo(time);
-  (*ins)->o_P.copyFrom((*ins)->P);
-  (*ins)->o_U.copyFrom((*ins)->U);
-  if((*ins)->Nscalar) (*ins)->cds->o_S.copyFrom((*ins)->cds->S);
+  nrs->o_P.copyFrom(nrs->P);
+  nrs->o_U.copyFrom(nrs->U);
+  if(nrs->Nscalar) nrs->cds->o_S.copyFrom(nrs->cds->S);
 }
 
 void nek_copyTo(dfloat &time)
@@ -709,59 +722,46 @@ void nek_copyTo(dfloat &time)
     fflush(stdout);
   }
 
-  mesh_t* mesh = (*ins)->mesh;
+  mesh_t* mesh = nrs->mesh;
   dlong Nlocal = mesh->Nelements * mesh->Np;
 
   time = *(nekData.time);
 
-  dfloat* vx = (*ins)->U + 0 * (*ins)->fieldOffset;
-  dfloat* vy = (*ins)->U + 1 * (*ins)->fieldOffset;
-  dfloat* vz = (*ins)->U + 2 * (*ins)->fieldOffset;
+  dfloat* vx = nrs->U + 0 * nrs->fieldOffset;
+  dfloat* vy = nrs->U + 1 * nrs->fieldOffset;
+  dfloat* vz = nrs->U + 2 * nrs->fieldOffset;
 
   memcpy(vx, nekData.vx, sizeof(dfloat) * Nlocal);
   memcpy(vy, nekData.vy, sizeof(dfloat) * Nlocal);
   memcpy(vz, nekData.vz, sizeof(dfloat) * Nlocal);
-  memcpy((*ins)->P, nekData.pr, sizeof(dfloat) * Nlocal);
-  if((*ins)->Nscalar) {
+  memcpy(nrs->P, nekData.pr, sizeof(dfloat) * Nlocal);
+  if(nrs->Nscalar) {
     const dlong nekFieldOffset = nekData.lelt * mesh->Np;
-    for(int is = 0; is < (*ins)->Nscalar; is++) {
+    for(int is = 0; is < nrs->Nscalar; is++) {
       mesh_t* mesh;
-      (is) ? mesh = (*ins)->cds->meshV : mesh = (*ins)->cds->mesh;
+      (is) ? mesh = nrs->cds->meshV : mesh = nrs->cds->mesh;
       const dlong Nlocal = mesh->Nelements * mesh->Np;
       dfloat* Ti = nekData.t   + is * nekFieldOffset;
-      dfloat* Si = (*ins)->cds->S + is * (*ins)->cds->fieldOffset;
+      dfloat* Si = nrs->cds->S + is * nrs->cds->fieldOffset;
       memcpy(Si, Ti, Nlocal * sizeof(dfloat));
     }
   }
 }
 
-void nek_copyRestart()
+long long nek_set_glo_num(int nx, int isTMesh)
 {
-  mesh_t* mesh = (*ins)->mesh;
-  dlong Nlocal = mesh->Nelements * mesh->Np;
-  if (*(nekData.ifgetu)) {
-    dfloat* vx = (*ins)->U + 0 * (*ins)->fieldOffset;
-    dfloat* vy = (*ins)->U + 1 * (*ins)->fieldOffset;
-    dfloat* vz = (*ins)->U + 2 * (*ins)->fieldOffset;
-    memcpy(vx, nekData.vx, sizeof(dfloat) * Nlocal);
-    memcpy(vy, nekData.vy, sizeof(dfloat) * Nlocal);
-    memcpy(vz, nekData.vz, sizeof(dfloat) * Nlocal);
-  }
-  if (*(nekData.ifgetp)) memcpy((*ins)->P, nekData.pr, sizeof(dfloat) * Nlocal);
-  if((*ins)->Nscalar) {
-    const dlong nekFieldOffset = nekData.lelt * mesh->Np;
-    for(int is = 0; is < (*ins)->Nscalar; is++) {
-      mesh_t* mesh;
-      (is) ? mesh = (*ins)->cds->meshV : mesh = (*ins)->cds->mesh;
-      const dlong Nlocal = mesh->Nelements * mesh->Np;
-      dfloat* Ti = nekData.t   + is * nekFieldOffset;
-      dfloat* Si = (*ins)->cds->S + is * (*ins)->cds->fieldOffset;
-      memcpy(Si, Ti, Nlocal * sizeof(dfloat));
-    }
-  }
+  return (*nek_set_vert_ptr)(&nx, &isTMesh);
 }
 
-long long nek_set_glo_num(int nx, int isTMesh)
+void nek_bdfCoeff(double *g0, double *coeff, double *dt, int order) 
 {
-  return (*nek_set_vert_ptr)(&nx, &isTMesh);
+  double nekCoeff[4]; // scratch filled through the setbd symbol: [0] goes to *g0, [1..3] to coeff
+  (*nek_setbd_ptr)(nekCoeff, dt, &order); // NOTE(review): confirm setbd never writes more than 4 entries
+  *g0 = nekCoeff[0];
+  memcpy(coeff, &nekCoeff[1], 3*sizeof(double)); // always 3 values, so coeff must hold 3 doubles whatever the order
+}
+
+void nek_extCoeff(double *coeff, double *dt, int order)
+{
+  (*nek_setabbd_ptr)(coeff, dt, &order, &order); // setabbd receives coeff directly; order is passed for both int arguments
 }
diff --git a/src/nekInterface/nekInterfaceAdapter.hpp b/src/nekInterface/nekInterfaceAdapter.hpp
index 3738a24bb..b6b4d3be3 100644
--- a/src/nekInterface/nekInterfaceAdapter.hpp
+++ b/src/nekInterface/nekInterfaceAdapter.hpp
@@ -7,7 +7,6 @@
 #include <dlfcn.h>
 #include <mpi.h>
 
-#include "setupAide.hpp"
 #include "nrs.hpp"
 
 #define DECLARE_USER_FUNC(a) void nek_ ## a(void);
@@ -92,18 +91,17 @@ DECLARE_USER_FUNC(userqtl)
 
 void*  nek_ptr(const char* id);
 void*  nek_scPtr(int id);
-void   nek_outfld(void);
-void   nek_outfld(const char* suffix);
-void   nek_outfld(const char* suffix, dfloat t, int coords,
-                  occa::memory o_u, occa::memory o_p, occa::memory o_s,
-                  int NSfields, int FP64);
+void   nek_outSolutionFld(double time, double outputTime);
+void   nek_outfld(const char* suffix, dfloat t, int coords, int FP64,
+                  occa::memory &o_u, occa::memory &o_p, occa::memory &o_s,
+                  int NSfields);
 void   nek_uic(int ifield);
 void   nek_end(void);
 void   nek_map_m_to_n(double* a, int na, double* b, int nb);
 void   nek_outpost(double* v1, double* v2, double* v3, double* vp, double* vt, char* name);
 int    nek_lglel(int e);
 void   nek_uf(double* u, double* v, double* w);
-int    nek_setup(MPI_Comm c, setupAide &options, ins_t** insAddr);
+int    nek_setup(MPI_Comm c, setupAide &options, nrs_t* nrs);
 void   nek_ifoutfld(int i);
 void   nek_setic(void);
 void   nek_userchk(void);
@@ -111,11 +109,13 @@ int    nek_bcmap(int bid, int ifld);
 
 int buildNekInterface(const char* casename, int nFields, int N, int np);
 void nek_copyFrom(dfloat time, int tstep);
+void nek_ocopyFrom(void);
 void nek_ocopyFrom(dfloat time, int tstep);
 void nek_copyFrom(dfloat time);
 void nek_copyTo(dfloat &time);
 void nek_ocopyTo(dfloat &time);
-void nek_copyRestart();
 long long nek_set_glo_num(int npts, int isTMesh);
 
+void nek_bdfCoeff(double *g0, double *coeff, double *dt, int order);
+void nek_extCoeff(double *coeff, double *dt, int order);
 #endif
diff --git a/src/nrs.hpp b/src/nrs.hpp
deleted file mode 100644
index eaa3bdf41..000000000
--- a/src/nrs.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#if !defined(nekrs_nekrs_hpp_)
-#define nekrs_nekrs_hpp_
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-#include <unistd.h>
-#include <getopt.h>
-#include <mpi.h>
-
-#define NEKRS_VERSION "20"
-#define NEKRS_SUBVERSION "1"
-
-#define EXIT(a)  { fflush(stdout); MPI_Finalize(); exit(a); }
-#define ABORT(a) { fflush(stdout); MPI_Abort(MPI_COMM_WORLD,a); }
-
-#include "libParanumal.hpp"
-#include "ins.h"
-#include "timer.hpp"
-
-occa::device occaDeviceConfig(setupAide &options, MPI_Comm comm);
-
-// std::to_string might be not accurate enough
-static string to_string_f(double a)
-{
-  stringstream s;
-  s << std::scientific << a;
-  return s.str();
-}
-
-static std::vector<std::string> serializeString(const std::string sin)
-{
-  std::vector<std::string> slist;
-  string s(sin);
-  s.erase(std::remove_if(s.begin(), s.end(), ::isspace), s.end());
-  std::stringstream ss;
-  ss.str(s);
-  while( ss.good() ) {
-    std::string substr;
-    std::getline(ss, substr, ',');
-    slist.push_back(substr);
-  }
-  return slist;
-}
-
-#endif
diff --git a/src/plugins/RANSktau.cpp b/src/plugins/RANSktau.cpp
index 44acd48d6..73c9fb9f2 100644
--- a/src/plugins/RANSktau.cpp
+++ b/src/plugins/RANSktau.cpp
@@ -1,11 +1,11 @@
-#include <nekrs.hpp>
-#include <nekInterfaceAdapter.hpp>
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
 #include "RANSktau.hpp"
 
 // private members
 namespace
 {
-static ins_t* ins;
+static nrs_t* nrs;
 
 int kFieldIndex;
 
@@ -43,11 +43,11 @@ static dfloat coeff[] = {
 };
 }
 
-void RANSktau::buildKernel(ins_t* ins)
+void RANSktau::buildKernel(nrs_t* nrs)
 {
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
 
-  occa::properties kernelInfo = *(ins->kernelInfo);
+  occa::properties kernelInfo = *(nrs->kernelInfo);
   kernelInfo["defines/p_sigma_k"]       = coeff[0];
   kernelInfo["defines/p_sigma_tau"]     = coeff[1];
   kernelInfo["defines/p_alpinf_str"]    = coeff[2];
@@ -79,24 +79,24 @@ void RANSktau::buildKernel(ins_t* ins)
     MPI_Barrier(mesh->comm);
   }
 
-  if(ins->Nscalar < 2) {
+  if(nrs->Nscalar < 2) {
     if(mesh->rank == 0) cout << "RANSktau: Nscalar needs to be >= 2!\n";
     ABORT(1);
   }
-  ins->options.setArgs("VARIABLE VISCOSITY", "TRUE");
+  nrs->options.setArgs("VARIABLE VISCOSITY", "TRUE");
 }
 
 void RANSktau::updateProperties()
 {
-  mesh_t* mesh = ins->mesh;
-  cds_t* cds = ins->cds;
+  mesh_t* mesh = nrs->mesh;
+  cds_t* cds = nrs->cds;
 
-  occa::memory o_mue  = ins->o_mue;
+  occa::memory o_mue  = nrs->o_mue;
   occa::memory o_diff = cds->o_diff + kFieldIndex * cds->fieldOffset * sizeof(dfloat);
 
   limitKernel(mesh->Nelements * mesh->Np, o_k, o_tau);
   mueKernel(mesh->Nelements * mesh->Np,
-            ins->fieldOffset,
+            nrs->fieldOffset,
             rho,
             mueLam,
             o_k,
@@ -113,41 +113,41 @@ occa::memory RANSktau::o_mue_t()
 
 void RANSktau::updateSourceTerms()
 {
-  mesh_t* mesh = ins->mesh;
-  cds_t* cds = ins->cds;
+  mesh_t* mesh = nrs->mesh;
+  cds_t* cds = nrs->cds;
 
-  occa::memory o_OiOjSk  = ins->o_wrk0;
-  occa::memory o_SijMag2 = ins->o_wrk1;
-  occa::memory o_SijOij  = ins->o_wrk2;
+  occa::memory o_OiOjSk  = nrs->o_wrk0;
+  occa::memory o_SijMag2 = nrs->o_wrk1;
+  occa::memory o_SijOij  = nrs->o_wrk2;
 
   occa::memory o_FS      = cds->o_FS     + kFieldIndex * cds->fieldOffset * sizeof(dfloat);
   occa::memory o_BFDiag  = cds->o_BFDiag + kFieldIndex * cds->fieldOffset * sizeof(dfloat);
 
   const int NSOfields = 9;
   SijOijKernel(mesh->Nelements,
-               ins->fieldOffset,
+               nrs->fieldOffset,
                mesh->o_vgeo,
                mesh->o_Dmatrices,
-               ins->o_U,
+               nrs->o_U,
                o_SijOij);
 
   ogsGatherScatterMany(o_SijOij,
                        NSOfields,
-                       ins->fieldOffset,
+                       nrs->fieldOffset,
                        ogsDfloat,
                        ogsAdd,
                        mesh->ogs);
 
-  ins->invMassMatrixKernel(
+  nrs->invMassMatrixKernel(
     mesh->Nelements,
-    ins->fieldOffset,
+    nrs->fieldOffset,
     NSOfields,
     mesh->o_vgeo,
-    ins->mesh->o_invLMM,
+    nrs->mesh->o_invLMM,
     o_SijOij);
 
   SijOijMag2Kernel(mesh->Nelements * mesh->Np,
-                   ins->fieldOffset,
+                   nrs->fieldOffset,
                    o_SijOij,
                    o_OiOjSk,
                    o_SijMag2);
@@ -155,7 +155,7 @@ void RANSktau::updateSourceTerms()
   limitKernel(mesh->Nelements * mesh->Np, o_k, o_tau);
 
   computeKernel(mesh->Nelements,
-                ins->cds->fieldOffset,
+                nrs->cds->fieldOffset,
                 rho,
                 mueLam,
                 mesh->o_vgeo,
@@ -168,23 +168,23 @@ void RANSktau::updateSourceTerms()
                 o_FS);
 }
 
-void RANSktau::setup(ins_t* insIn, dfloat mueIn, dfloat rhoIn, int ifld)
+void RANSktau::setup(nrs_t* nrsIn, dfloat mueIn, dfloat rhoIn, int ifld)
 {
-  setup(insIn, mueIn, rhoIn, ifld, NULL);
+  setup(nrsIn, mueIn, rhoIn, ifld, NULL);
 }
 
-void RANSktau::setup(ins_t* insIn, dfloat mueIn, dfloat rhoIn,
+void RANSktau::setup(nrs_t* nrsIn, dfloat mueIn, dfloat rhoIn,
                      int ifld, const dfloat* coeffIn)
 {
   if(setupCalled) return;
 
-  ins    = insIn;
+  nrs    = nrsIn;
   mueLam = mueIn;
   rho    = rhoIn;
   kFieldIndex = ifld;
 
-  cds_t* cds = ins->cds;
-  mesh_t* mesh = ins->mesh;
+  cds_t* cds = nrs->cds;
+  mesh_t* mesh = nrs->mesh;
 
   if(coeffIn) memcpy(coeff, coeffIn, sizeof(coeff));
 
@@ -195,7 +195,7 @@ void RANSktau::setup(ins_t* insIn, dfloat mueIn, dfloat rhoIn,
 
   if(!cds->o_BFDiag.ptr()) {
     cds->o_BFDiag = mesh->device.malloc(cds->NSfields * cds->fieldOffset * sizeof(dfloat));
-    ins->fillKernel(cds->NSfields * cds->fieldOffset, 0.0, cds->o_BFDiag);
+    nrs->fillKernel(cds->NSfields * cds->fieldOffset, 0.0, cds->o_BFDiag);
   }
 
   setupCalled = 1;
diff --git a/src/plugins/RANSktau.hpp b/src/plugins/RANSktau.hpp
index df1ca8e60..cb5c80aaf 100644
--- a/src/plugins/RANSktau.hpp
+++ b/src/plugins/RANSktau.hpp
@@ -1,12 +1,12 @@
-#include <nekrs.hpp>
-#include <nekInterfaceAdapter.hpp>
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
 
 namespace RANSktau
 {
-void buildKernel(ins_t* ins);
+void buildKernel(nrs_t* nrs);
 void updateSourceTerms();
-void setup(ins_t* insIn, dfloat mue, dfloat rho, int startIndex);
-void setup(ins_t* insIn, dfloat mue, dfloat rho, int startIndex, const dfloat* coeffIn);
+void setup(nrs_t* nrsIn, dfloat mue, dfloat rho, int startIndex);
+void setup(nrs_t* nrsIn, dfloat mue, dfloat rho, int startIndex, const dfloat* coeffIn);
 void updateProperties();
 occa::memory o_mue_t();
 }
diff --git a/src/plugins/avg.cpp b/src/plugins/avg.cpp
index 914d044c7..d02e9ff8d 100644
--- a/src/plugins/avg.cpp
+++ b/src/plugins/avg.cpp
@@ -13,15 +13,15 @@
            is the expected value of the sub-ensemble i (i=1...N).
  */
 
-#include <nekrs.hpp>
-#include <nekInterfaceAdapter.hpp>
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
 #include "avg.hpp"
 
 // private members
 namespace
 {
 static ogs_t* ogs;
-static ins_t* ins;
+static nrs_t* nrs;
 
 static occa::memory o_Uavg, o_Urms;
 static occa::memory o_Urm2;
@@ -41,15 +41,15 @@ static dfloat atime;
 static dfloat timel;
 }
 
-void avg::buildKernel(ins_t* ins)
+void avg::buildKernel(nrs_t* nrs)
 {
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
 
   string fileName;
   int rank = mesh->rank;
   fileName.assign(getenv("NEKRS_INSTALL_DIR"));
   fileName += "/okl/plugins/avg.okl";
-  occa::properties& kernelInfo = *ins->kernelInfo;
+  occa::properties& kernelInfo = *nrs->kernelInfo;
   for (int r = 0; r < 2; r++) {
     if ((r == 0 && rank == 0) || (r == 1 && rank > 0)) {
       EXKernel  = mesh->device.buildKernel(fileName.c_str(), "EX", kernelInfo);
@@ -69,12 +69,12 @@ void avg::reset()
 
 void avg::EX (dlong N, dfloat a, dfloat b, int nflds, occa::memory o_x, occa::memory o_EX)
 {
-  EXKernel(N, ins->fieldOffset, nflds, a, b, o_x, o_EX);
+  EXKernel(N, nrs->fieldOffset, nflds, a, b, o_x, o_EX);
 }
 
 void avg::EXX(dlong N, dfloat a, dfloat b, int nflds, occa::memory o_x, occa::memory o_EXX)
 {
-  EXXKernel(N, ins->fieldOffset, nflds, a, b, o_x, o_EXX);
+  EXXKernel(N, nrs->fieldOffset, nflds, a, b, o_x, o_EXX);
 }
 
 void avg::EXY(dlong N,
@@ -85,7 +85,7 @@ void avg::EXY(dlong N,
               occa::memory o_y,
               occa::memory o_EXY)
 {
-  EXYKernel(N, ins->fieldOffset, nflds, a, b, o_x, o_y, o_EXY);
+  EXYKernel(N, nrs->fieldOffset, nflds, a, b, o_x, o_y, o_EXY);
 }
 
 void avg::run(dfloat time)
@@ -109,29 +109,29 @@ void avg::run(dfloat time)
   const dfloat b = dtime / atime;
   const dfloat a = 1 - b;
 
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
   const dlong N = mesh->Nelements * mesh->Np;
 
   // velocity
-  EX (N, a, b, ins->NVfields, ins->o_U, o_Uavg);
-  EXX(N, a, b, ins->NVfields, ins->o_U, o_Urms);
+  EX (N, a, b, nrs->NVfields, nrs->o_U, o_Uavg);
+  EXX(N, a, b, nrs->NVfields, nrs->o_U, o_Urms);
 
-  const dlong offsetByte = ins->fieldOffset * sizeof(dfloat);
-  occa::memory o_vx = ins->o_U + 0 * offsetByte;
-  occa::memory o_vy = ins->o_U + 1 * offsetByte;
-  occa::memory o_vz = ins->o_U + 2 * offsetByte;
+  const dlong offsetByte = nrs->fieldOffset * sizeof(dfloat);
+  occa::memory o_vx = nrs->o_U + 0 * offsetByte;
+  occa::memory o_vy = nrs->o_U + 1 * offsetByte;
+  occa::memory o_vz = nrs->o_U + 2 * offsetByte;
 
   EXY(N, a, b, 1, o_vx, o_vy, o_Urm2 + 0 * offsetByte);
   EXY(N, a, b, 1, o_vy, o_vz, o_Urm2 + 1 * offsetByte);
   EXY(N, a, b, 1, o_vz, o_vx, o_Urm2 + 2 * offsetByte);
 
   // pressure
-  EX (N, a, b, 1, ins->o_P, o_Pavg);
-  EXX(N, a, b, 1, ins->o_P, o_Prms);
+  EX (N, a, b, 1, nrs->o_P, o_Pavg);
+  EXX(N, a, b, 1, nrs->o_P, o_Prms);
 
   // scalars
-  if(ins->Nscalar) {
-    cds_t* cds = ins->cds;
+  if(nrs->Nscalar) {
+    cds_t* cds = nrs->cds;
     const dlong N = cds->mesh->Nelements * cds->mesh->Np;
     EX (N, a, b, cds->NSfields, cds->o_S, o_Savg);
     EXX(N, a, b, cds->NSfields, cds->o_S, o_Srms);
@@ -140,37 +140,37 @@ void avg::run(dfloat time)
   timel = time;
 }
 
-void avg::setup(ins_t* ins_)
+void avg::setup(nrs_t* nrs_)
 {
   if(!buildKernelCalled) {
     cout << "avg::setup() was called prior avg::buildKernel()!\n";
     ABORT(1);
   }
 
-  ins = ins_;
-  mesh_t* mesh = ins->mesh;
+  nrs = nrs_;
+  mesh_t* mesh = nrs->mesh;
 
   if(setupCalled) return;
 
-  o_Uavg = mesh->device.malloc(ins->fieldOffset * ins->NVfields * sizeof(dfloat));
-  o_Urms = mesh->device.malloc(ins->fieldOffset * ins->NVfields * sizeof(dfloat));
-  ins->fillKernel(ins->fieldOffset * ins->NVfields, 0.0, o_Uavg);
-  ins->fillKernel(ins->fieldOffset * ins->NVfields, 0.0, o_Urms);
+  o_Uavg = mesh->device.malloc(nrs->fieldOffset * nrs->NVfields * sizeof(dfloat));
+  o_Urms = mesh->device.malloc(nrs->fieldOffset * nrs->NVfields * sizeof(dfloat));
+  nrs->fillKernel(nrs->fieldOffset * nrs->NVfields, 0.0, o_Uavg);
+  nrs->fillKernel(nrs->fieldOffset * nrs->NVfields, 0.0, o_Urms);
 
-  o_Urm2 = mesh->device.malloc(ins->fieldOffset * ins->NVfields * sizeof(dfloat));
-  ins->fillKernel(ins->fieldOffset * ins->NVfields, 0.0, o_Urm2);
+  o_Urm2 = mesh->device.malloc(nrs->fieldOffset * nrs->NVfields * sizeof(dfloat));
+  nrs->fillKernel(nrs->fieldOffset * nrs->NVfields, 0.0, o_Urm2);
 
-  o_Pavg = mesh->device.malloc(ins->fieldOffset * sizeof(dfloat));
-  o_Prms = mesh->device.malloc(ins->fieldOffset * sizeof(dfloat));
-  ins->fillKernel(ins->fieldOffset, 0.0, o_Pavg);
-  ins->fillKernel(ins->fieldOffset, 0.0, o_Prms);
+  o_Pavg = mesh->device.malloc(nrs->fieldOffset * sizeof(dfloat));
+  o_Prms = mesh->device.malloc(nrs->fieldOffset * sizeof(dfloat));
+  nrs->fillKernel(nrs->fieldOffset, 0.0, o_Pavg);
+  nrs->fillKernel(nrs->fieldOffset, 0.0, o_Prms);
 
-  if(ins->Nscalar) {
-    cds_t* cds = ins->cds;
+  if(nrs->Nscalar) {
+    cds_t* cds = nrs->cds;
     o_Savg = mesh->device.malloc(cds->fieldOffset * cds->NSfields * sizeof(dfloat));
     o_Srms = mesh->device.malloc(cds->fieldOffset * cds->NSfields * sizeof(dfloat));
-    ins->fillKernel(cds->fieldOffset * cds->NSfields, 0.0, o_Savg);
-    ins->fillKernel(cds->fieldOffset * cds->NSfields, 0.0, o_Srms);
+    nrs->fillKernel(cds->fieldOffset * cds->NSfields, 0.0, o_Savg);
+    nrs->fillKernel(cds->fieldOffset * cds->NSfields, 0.0, o_Srms);
   }
 
   setupCalled = 1;
@@ -178,40 +178,37 @@ void avg::setup(ins_t* ins_)
 
 void avg::outfld()
 {
-  cds_t* cds = ins->cds;
-  mesh_t* mesh = ins->mesh;
+  cds_t* cds = nrs->cds;
+  mesh_t* mesh = nrs->mesh;
   const int FP64 = 1;
   const int coords = 0;
 
   occa::memory o_null;
   occa::memory o_Tavg, o_Trms;
 
-  const int Nscalar = ins->Nscalar;
-  if(ins->Nscalar) {
+  const int Nscalar = nrs->Nscalar;
+  if(nrs->Nscalar) {
     o_Tavg = o_Savg;
     o_Trms = o_Srms;
   }
 
-  nek_outfld("avg", atime, coords,
-             o_Uavg,
-             o_Pavg,
-             o_Tavg,
-             Nscalar,
-             FP64);
-
-  nek_outfld("rms", atime, coords,
-             o_Urms,
-             o_Prms,
-             o_Trms,
-             Nscalar,
-             FP64);
-
-  nek_outfld("rm2", atime, coords,
-             o_Urm2,
-             o_null,
-             o_null,
-             0,
-             FP64);
+  writeFld("avg", atime, coords, FP64,
+           o_Uavg,
+           o_Pavg,
+           o_Tavg,
+           Nscalar);
+
+  writeFld("rms", atime, coords, FP64,
+           o_Urms,
+           o_Prms,
+           o_Trms,
+           Nscalar);
+
+  writeFld("rm2", atime, coords, FP64,
+           o_Urm2,
+           o_null,
+           o_null,
+           0);
 
   atime = 0;
 }
diff --git a/src/plugins/avg.hpp b/src/plugins/avg.hpp
index 85374ea09..b9c95f5da 100644
--- a/src/plugins/avg.hpp
+++ b/src/plugins/avg.hpp
@@ -1,11 +1,11 @@
-#include <nekrs.hpp>
-#include <nekInterfaceAdapter.hpp>
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
 
 namespace avg
 {
-void buildKernel(ins_t* ins);
+void buildKernel(nrs_t* nrs);
 void run(dfloat time);
-void setup(ins_t* ins_);
+void setup(nrs_t* nrs_);
 void outfld();
 void reset();
 void EX (dlong N, dfloat a, dfloat b, int nflds, occa::memory o_x, occa::memory o_EX);
diff --git a/src/plugins/lowMach.cpp b/src/plugins/lowMach.cpp
index ba9a6a960..17b0480eb 100644
--- a/src/plugins/lowMach.cpp
+++ b/src/plugins/lowMach.cpp
@@ -8,60 +8,56 @@
 
 #include "lowMach.hpp"
 
-void lowMach::setup(ins_t* ins)
+void lowMach::setup(nrs_t* nrs)
 {
-  mesh_t* mesh = ins->mesh;
-  int err = 0;
-  if(ins->Nscalar) {
-    if(!ins->cds->compute[0]) err = 1; 
-  } else {
-    err = 1;
-  }
+  mesh_t* mesh = nrs->mesh;
+  int err = 1;
+  if(nrs->options.compareArgs("TEMPERATURE", "TRUE")) err = 0;
   if(err) {
     if(mesh->rank == 0) cout << "lowMach requires solving for temperature!\n";
     ABORT(1);
   } 
   udf.div = &lowMach::qtl;
-  ins->options.setArgs("LOWMACH", "TRUE"); 
+  nrs->options.setArgs("LOWMACH", "TRUE"); 
 }
 
 // qtl = 1/(rho*cp*T) * (div[k*grad[T] ] + qvol)
-void lowMach::qtl(ins_t* ins, dfloat time, occa::memory o_div)
+void lowMach::qtl(nrs_t* nrs, dfloat time, occa::memory o_div)
 {
-  cds_t* cds = ins->cds;
-  mesh_t* mesh = ins->mesh;
+  cds_t* cds = nrs->cds;
+  mesh_t* mesh = nrs->mesh;
 
-  ins->gradientVolumeKernel(
+  nrs->gradientVolumeKernel(
     mesh->Nelements,
     mesh->o_vgeo,
     mesh->o_Dmatrices,
-    ins->fieldOffset,
+    nrs->fieldOffset,
     cds->o_S,
     cds->o_wrk0);
 
-  oogs::startFinish(cds->o_wrk0, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh);
+  oogs::startFinish(cds->o_wrk0, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh);
 
-  ins->invMassMatrixKernel(
+  nrs->invMassMatrixKernel(
     mesh->Nelements,
-    ins->fieldOffset,
-    ins->NVfields,
+    nrs->fieldOffset,
+    nrs->NVfields,
     mesh->o_vgeo,
-    ins->mesh->o_invLMM,
+    nrs->mesh->o_invLMM,
     cds->o_wrk0);
 
   if(udf.sEqnSource) {
     timer::tic("udfSEqnSource", 1);
-    udf.sEqnSource(ins, time, cds->o_S, cds->o_wrk3);
+    udf.sEqnSource(nrs, time, cds->o_S, cds->o_wrk3);
     timer::toc("udfSEqnSource");
   } else {
-    ins->fillKernel(mesh->Nelements * mesh->Np, 0.0, cds->o_wrk3);
+    nrs->fillKernel(mesh->Nelements * mesh->Np, 0.0, cds->o_wrk3);
   }
 
-  ins->qtlKernel(
+  nrs->qtlKernel(
     mesh->Nelements,
     mesh->o_vgeo,
     mesh->o_Dmatrices,
-    ins->fieldOffset,
+    nrs->fieldOffset,
     cds->o_wrk0,
     cds->o_S,
     cds->o_diff,
@@ -69,13 +65,13 @@ void lowMach::qtl(ins_t* ins, dfloat time, occa::memory o_div)
     cds->o_wrk3,
     o_div);
 
-  oogs::startFinish(o_div, 1, ins->fieldOffset, ogsDfloat, ogsAdd, ins->gsh);
+  oogs::startFinish(o_div, 1, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh);
 
-  ins->invMassMatrixKernel(
+  nrs->invMassMatrixKernel(
     mesh->Nelements,
-    ins->fieldOffset,
+    nrs->fieldOffset,
     1,
     mesh->o_vgeo,
-    ins->mesh->o_invLMM,
+    nrs->mesh->o_invLMM,
     o_div);
 }
diff --git a/src/plugins/lowMach.hpp b/src/plugins/lowMach.hpp
index f7ac3306d..c6def7f6e 100644
--- a/src/plugins/lowMach.hpp
+++ b/src/plugins/lowMach.hpp
@@ -1,8 +1,8 @@
-#include <nekrs.hpp>
-#include <nekInterfaceAdapter.hpp>
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
 
 namespace lowMach
 {
-void setup(ins_t* ins);
-void qtl(ins_t* ins, dfloat time, occa::memory o_div);
+void setup(nrs_t* nrs);
+void qtl(nrs_t* nrs, dfloat time, occa::memory o_div);
 }
diff --git a/src/plugins/velRecycling.cpp b/src/plugins/velRecycling.cpp
index 457a548c5..8df92d92b 100644
--- a/src/plugins/velRecycling.cpp
+++ b/src/plugins/velRecycling.cpp
@@ -6,15 +6,15 @@
          numbering which is only true for extruded meshes in z from nek!
  */
 
-#include <nekrs.hpp>
-#include <nekInterfaceAdapter.hpp>
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
 #include "velRecycling.hpp"
 
 // private members
 namespace
 {
 static ogs_t* ogs;
-static ins_t* ins;
+static nrs_t* nrs;
 
 static occa::memory o_wrk;
 
@@ -38,15 +38,15 @@ static dfloat wbar;
 static int Nblock;
 }
 
-void velRecycling::buildKernel(ins_t* ins)
+void velRecycling::buildKernel(nrs_t* nrs)
 {
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
 
   string fileName;
   int rank = mesh->rank;
   fileName.assign(getenv("NEKRS_INSTALL_DIR"));
   fileName += "/okl/plugins/velRecycling.okl";
-  occa::properties& kernelInfo = *ins->kernelInfo;
+  occa::properties& kernelInfo = *nrs->kernelInfo;
   for (int r = 0; r < 2; r++) {
     if ((r == 0 && rank == 0) || (r == 1 && rank > 0)) {
       setBCVectorValueKernel =  mesh->device.buildKernel(fileName.c_str(),
@@ -66,25 +66,25 @@ void velRecycling::buildKernel(ins_t* ins)
 
 void velRecycling::copy()
 {
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
   const dfloat zero = 0.0;
 
   // copy recycling plane in interior to inlet
-  o_wrk.copyFrom(ins->o_U, ins->NVfields * ins->fieldOffset * sizeof(dfloat));
-  setBCVectorValueKernel(mesh->Nelements, zero, bID, ins->fieldOffset,
+  o_wrk.copyFrom(nrs->o_U, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat));
+  setBCVectorValueKernel(mesh->Nelements, zero, bID, nrs->fieldOffset,
                          o_wrk, mesh->o_vmapM, mesh->o_EToB);
 
-  ogsGatherScatterMany(o_wrk, ins->NVfields, ins->fieldOffset,
+  ogsGatherScatterMany(o_wrk, nrs->NVfields, nrs->fieldOffset,
                        ogsDfloat, ogsAdd, ogs);
 
 /*
-   for(int k=0;k<ins->dim;++k)
-    ogsGatherScatter(o_wrk+k*ins->fieldOffset*sizeof(dfloat),
+   for(int k=0;k<nrs->dim;++k)
+    ogsGatherScatter(o_wrk+k*nrs->fieldOffset*sizeof(dfloat),
                      ogsDfloat, ogsAdd, ogs);
  */
 
   // rescale
-  getBCFluxKernel(mesh->Nelements, bID, ins->fieldOffset, o_wrk,
+  getBCFluxKernel(mesh->Nelements, bID, nrs->fieldOffset, o_wrk,
                   mesh->o_vmapM, mesh->o_EToB, mesh->o_sgeo, o_area, o_flux);
 
   const int NfpTotal = mesh->Nelements * mesh->Nfaces * mesh->Nfp;
@@ -101,18 +101,18 @@ void velRecycling::copy()
 
   const dfloat scale = -wbar * sbuf[0] / sbuf[1];
   //printf("rescaling inflow: %f\n", scale);
-  scalarMultiplyKernel(ins->NVfields * ins->fieldOffset, scale, o_wrk);
+  scalarMultiplyKernel(nrs->NVfields * nrs->fieldOffset, scale, o_wrk);
 }
 
-void velRecycling::setup(ins_t* ins_, occa::memory o_wrk_, const hlong eOffset, const int bID_,
+void velRecycling::setup(nrs_t* nrs_, occa::memory o_wrk_, const hlong eOffset, const int bID_,
                          const dfloat wbar_)
 {
-  ins = ins_;
+  nrs = nrs_;
   o_wrk = o_wrk_;
   bID = bID_;
   wbar = wbar_;
 
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
 
   const dlong Ntotal = mesh->Np * mesh->Nelements;
   hlong* ids = (hlong*) calloc(Ntotal, sizeof(hlong));
@@ -137,7 +137,7 @@ void velRecycling::setup(ins_t* ins_, occa::memory o_wrk_, const hlong eOffset,
 
   const int NfpTotal = mesh->Nelements * mesh->Nfaces * mesh->Nfp;
 
-  Nblock = (NfpTotal + blockSize - 1) / blockSize;
+  Nblock = (NfpTotal + BLOCKSIZE - 1) / BLOCKSIZE;
   tmp1   = (dfloat*) calloc(Nblock, sizeof(dfloat));
   tmp2   = (dfloat*) calloc(Nblock, sizeof(dfloat));
 
diff --git a/src/plugins/velRecycling.hpp b/src/plugins/velRecycling.hpp
index 32d124110..8e8275623 100644
--- a/src/plugins/velRecycling.hpp
+++ b/src/plugins/velRecycling.hpp
@@ -6,13 +6,13 @@
          numbering which is only true for extruded meshes in z from nek!
  */
 
-#include <nekrs.hpp>
-#include <nekInterfaceAdapter.hpp>
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
 
 namespace velRecycling
 {
-void buildKernel(ins_t* ins);
+void buildKernel(nrs_t* nrs);
 void copy();
-void setup(ins_t* ins_, occa::memory o_wrk_, const hlong eOffset, const int bID_,
+void setup(nrs_t* nrs_, occa::memory o_wrk_, const hlong eOffset, const int bID_,
            const dfloat wbar_);
 }
diff --git a/src/timeStepper/runTime.cpp b/src/timeStepper/runTime.cpp
new file mode 100644
index 000000000..6198d8205
--- /dev/null
+++ b/src/timeStepper/runTime.cpp
@@ -0,0 +1,739 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "nrs.hpp"
+#include "nekInterfaceAdapter.hpp"
+#include "udf.hpp"
+#include "tombo.hpp"
+#include "cfl.hpp"
+
+void extbdfCoefficents(nrs_t* nrs, int order); // sets nrs->g0, extbdfA/extbdfB and ExplicitOrder for the given order
+// velocity-side helpers
+void makef(nrs_t* nrs, dfloat time, occa::memory o_FU, occa::memory o_BF);
+occa::memory velocityStrongSubCycle(nrs_t* nrs, dfloat time,
+                                    occa::memory o_U);
+void fluidSolve(nrs_t* nrs, dfloat time, occa::memory o_U);
+// scalar-side helpers
+void makeq(nrs_t* nrs, dfloat time, occa::memory o_FS, occa::memory o_BF);
+occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is,
+                                  occa::memory o_U, occa::memory o_S);
+void scalarSolve(nrs_t* nrs, dfloat time, occa::memory o_S);
+
+void qthermal(nrs_t* nrs, dfloat time, occa::memory o_div);
+// MPI_Wtime() seconds accumulated by runStep() over all calls
+double tElapsed = 0;
+
+void runStep(nrs_t* nrs, dfloat time, dfloat dt, int tstep)
+{ // Performs one step of size dt: coefficient update, extrapolation, scalar solve, user callbacks, fluid solve, diagnostics.
+  mesh_t* mesh = nrs->mesh;
+  cds_t* cds = nrs->cds;
+  // time is the time at the start of the step (time + dt is what gets reported); tstep also caps the scheme order below
+  mesh->device.finish();
+  MPI_Barrier(mesh->comm);
+  double tStart = MPI_Wtime(); // timed region starts after device.finish() and the barrier
+
+  nrs->dt[0] = dt;
+
+  nrs->idt = 1/nrs->dt[0];
+  if(nrs->Nscalar) cds->idt = 1/cds->dt[0]; // NOTE(review): cds->dt[0] is not written in this function — presumably aliases nrs->dt; confirm
+  extbdfCoefficents(nrs, mymin(tstep, nrs->temporalOrder)); // order is limited by the step index
+  // extrapolateKernel is given o_extbdfA, the field (o_U / o_S) and a target buffer (o_Ue / o_Se) — argument roles presumed from names
+  if(nrs->flow) 
+    nrs->extrapolateKernel(mesh->Nelements,
+                           nrs->NVfields,
+                           nrs->ExplicitOrder,
+                           nrs->fieldOffset,
+                           nrs->o_extbdfA,
+                           nrs->o_U,
+                           nrs->o_Ue);
+  if(nrs->Nscalar) 
+    nrs->extrapolateKernel(cds->mesh->Nelements,
+                           cds->NSfields,
+                           cds->ExplicitOrder,
+                           cds->fieldOffset,
+                           cds->o_extbdfA,
+                           cds->o_S,
+                           cds->o_Se);
+
+  if(nrs->Nscalar)
+    scalarSolve(nrs, time, cds->o_S);
+  // user property callback runs after the scalar solve and before the fluid solve
+  if(udf.properties) {
+    timer::tic("udfProperties", 1);
+    occa::memory o_S = nrs->o_wrk0; // o_wrk0 stands in for both scalar arguments when there are no scalars
+    occa::memory o_SProp = nrs->o_wrk0;
+    if(nrs->Nscalar) {
+      o_S = cds->o_S;
+      o_SProp = cds->o_prop;
+    }
+    udf.properties(nrs, time + nrs->dt[0], nrs->o_U, o_S, nrs->o_prop, o_SProp);
+    timer::toc("udfProperties");
+  }
+
+  if(udf.div) udf.div(nrs, time + nrs->dt[0], nrs->o_div);
+
+  if(nrs->flow) fluidSolve(nrs, time, nrs->o_U); 
+  // shift the step-size history for the next call
+  nrs->dt[2] = nrs->dt[1];
+  nrs->dt[1] = nrs->dt[0];
+
+  mesh->device.finish();
+  MPI_Barrier(mesh->comm);
+  const double tElapsedStep = MPI_Wtime() - tStart;
+  tElapsed += tElapsedStep;
+  timer::set("solve", tElapsed);
+
+  // print some diagnostics
+  const dfloat cfl = computeCFL(nrs);
+  if(mesh->rank == 0) {
+    printf("step= %d  t= %.8e  dt=%.1e  C= %.2f",
+           tstep, time + nrs->dt[0], nrs->dt[0], cfl);
+
+    if(nrs->flow) {
+      if(nrs->uvwSolver)
+        printf("  UVW: %d  P: %d", nrs->NiterU, nrs->NiterP);
+      else
+        printf("  U: %d  V: %d  W: %d  P: %d", nrs->NiterU, nrs->NiterV, nrs->NiterW, nrs->NiterP);
+    }
+
+    for(int is = 0; is < nrs->Nscalar; is++)
+      if(cds->compute[is]) printf("  S: %d", cds->Niter[is]);
+
+    printf("  eTime= %.2e, %.5e s\n", tElapsedStep, tElapsed);
+  }
+  // abort the run when the CFL number is NaN or larger than 30
+  if(cfl > 30 || std::isnan(cfl)) {
+    if(mesh->rank == 0) cout << "Unreasonable CFL! Dying ...\n" << endl;
+    ABORT(1);
+  }
+
+  if(tstep % 10 == 0) fflush(stdout); // flush the log every 10th step
+}
+
+void extbdfCoefficents(nrs_t* nrs, int order)
+{ // Sets nrs->g0/ig0, extbdfB, extbdfA and ExplicitOrder for the given order and copies both arrays into o_extbdfB/o_extbdfA.
+  if(order == 1) { // first order: fixed coefficients, no call into nek
+    nrs->g0 = 1.0;
+    nrs->extbdfB[0] = 1.0;
+    nrs->extbdfB[1] = 0.0;
+    nrs->extbdfB[2] = 0.0;
+    nrs->extbdfA[0] = 1.0;
+    nrs->extbdfA[1] = 0.0;
+    nrs->extbdfA[2] = 0.0;
+    nrs->ExplicitOrder = 1;
+  } else if(order == 2) { // coefficients come from nek using the nrs->dt history; the third slot is zeroed afterwards
+    nek_bdfCoeff(&nrs->g0, nrs->extbdfB, nrs->dt, order);
+    nrs->extbdfB[2] = 0.0;
+    nrs->ExplicitOrder = 2;
+    nek_extCoeff(nrs->extbdfA, nrs->dt, nrs->ExplicitOrder);
+    nrs->extbdfA[2] = 0.0;
+  } else if(order == 3) {
+    nek_bdfCoeff(&nrs->g0, nrs->extbdfB, nrs->dt, order);
+    nrs->ExplicitOrder = 3;
+    nek_extCoeff(nrs->extbdfA, nrs->dt, nrs->ExplicitOrder);
+  } // NOTE(review): any other order leaves g0, extbdfA/B and ExplicitOrder at their previous values
+
+  nrs->ig0 = 1.0 / nrs->g0;
+  nrs->o_extbdfB.copyFrom(nrs->extbdfB);
+  nrs->o_extbdfA.copyFrom(nrs->extbdfA);
+  // debug print of the step sizes and coefficients on rank 0 (compiled out)
+#if 0
+  if (nrs->mesh->rank == 0) {
+    cout << "DT:" << nrs->dt[0] << "," << nrs->dt[1] << "," << nrs->dt[2] << "\n";
+    cout << "BDF:" << nrs->g0 << "," << nrs->extbdfB[0] << "," << nrs->extbdfB[1] << "," << nrs->extbdfB[2] << "\n";
+    cout << "EXT:" << nrs->extbdfA[0] << "," << nrs->extbdfA[1] << "," << nrs->extbdfA[2] << "\n";
+  }
+#endif
+  // mirror the order and g0/ig0 into the scalar solver data
+  if (nrs->Nscalar) {
+    nrs->cds->ExplicitOrder = nrs->ExplicitOrder;
+    nrs->cds->g0 = nrs->g0;
+    nrs->cds->ig0 = nrs->ig0;
+  }
+}
+
+void makeq(nrs_t* nrs, dfloat time, occa::memory o_FS, occa::memory o_BF)
+{ // Per computed scalar field: user source, optional filter term, advection term, then sumMakefKernel with o_BF as its last argument.
+  cds_t* cds   = nrs->cds;
+  mesh_t* mesh = cds->mesh; // NOTE(review): shadowed by the per-field mesh inside the loop and not used outside it
+
+  if(udf.sEqnSource) {
+    timer::tic("udfSEqnSource", 1);
+    udf.sEqnSource(nrs, time, cds->o_S, o_FS);
+    timer::toc("udfSEqnSource");
+  }
+
+  for(int is = 0; is < cds->NSfields; is++) {
+    if(!cds->compute[is]) continue; // fields with compute[is] == 0 are skipped entirely
+
+    mesh_t* mesh;
+    (is) ? mesh = cds->meshV : mesh = cds->mesh; // field 0 uses cds->mesh, every other field cds->meshV
+    const dlong isOffset = is * cds->fieldOffset;
+    occa::memory o_adv = cds->o_wrk0;
+    // optional filter term, enabled per field through its options
+    if(cds->options[is].compareArgs("FILTER STABILIZATION", "RELAXATION"))
+      cds->filterRTKernel(
+        cds->meshV->Nelements,
+        nrs->o_filterMT,
+        nrs->filterS,
+        isOffset,
+        cds->o_rho,
+        cds->o_S,
+        o_FS);
+    // advection: sub-cycling returns its own buffer, otherwise the term is evaluated into o_wrk0
+    if(cds->options[is].compareArgs("ADVECTION", "TRUE")) {
+      if(cds->Nsubsteps) {
+        o_adv = scalarStrongSubCycle(cds, time, is, cds->o_U, cds->o_S);
+      } else {
+        if(cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE"))
+          cds->advectionStrongCubatureVolumeKernel(
+            cds->meshV->Nelements,
+            mesh->o_vgeo,
+            mesh->o_cubvgeo,
+            mesh->o_cubDiffInterpT,
+            mesh->o_cubInterpT,
+            mesh->o_cubProjectT,
+            cds->vFieldOffset,
+            isOffset,
+            cds->o_U,
+            cds->o_S,
+            cds->o_rho,
+            cds->o_wrk0);
+        else
+          cds->advectionStrongVolumeKernel(
+            cds->meshV->Nelements,
+            mesh->o_vgeo,
+            mesh->o_Dmatrices,
+            cds->vFieldOffset,
+            isOffset,
+            cds->o_U,
+            cds->o_S,
+            cds->o_rho,
+            cds->o_wrk0);
+        // presumably o_FS(isOffset) = -1.0 * o_wrk0 + 1.0 * o_FS(isOffset), judging by the argument order — confirm against the kernel
+        nrs->scaledAddKernel(
+          cds->meshV->Nelements * cds->meshV->Np,
+          -1.0,
+          0 * cds->fieldOffset,
+          cds->o_wrk0,
+          1.0,
+          isOffset,
+          o_FS);
+      }
+    } else {
+      cds->fillKernel(cds->fieldOffset * cds->NVfields, 0.0, o_adv); // advection disabled: o_adv is filled with 0.0
+    } 
+
+    cds->sumMakefKernel(
+      mesh->Nelements,
+      mesh->o_vgeo,
+      cds->idt,
+      cds->o_extbdfA,
+      cds->o_extbdfB,
+      cds->fieldOffset * cds->NSfields,
+      isOffset,
+      cds->o_S,
+      o_adv,
+      o_FS,
+      cds->o_rho,
+      o_BF);
+  }
+}
+
+void scalarSolve(nrs_t* nrs, dfloat time, occa::memory o_S)
+{ // Zeroes o_FS and runs makeq(), shifts the stored o_FS/o_S history, then calls cdsSolve() for every computed field.
+  cds_t* cds   = nrs->cds;
+
+  timer::tic("makeq", 1);
+  cds->fillKernel(cds->fieldOffset * cds->NSfields, 0.0, cds->o_FS);
+  makeq(nrs, time, cds->o_FS, cds->o_BF);
+  timer::toc("makeq");
+  // history shift within o_FS and o_S: block (s-2) is copied to block (s-1), assuming the offsets are (dest, src) — confirm
+  for (int s = cds->Nstages; s > 1; s--) {
+    const dlong Nbyte = cds->fieldOffset * cds->NSfields * sizeof(dfloat);
+    cds->o_FS.copyFrom(cds->o_FS, Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte);
+    cds->o_S.copyFrom (cds->o_S , Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte);
+  }
+
+  timer::tic("scalarSolve", 1);
+  for (int is = 0; is < cds->NSfields; is++) {
+    if(!cds->compute[is]) continue;
+
+    mesh_t* mesh;
+    (is) ? mesh = cds->meshV : mesh = cds->mesh; // NOTE(review): mesh is selected but not used in the rest of this loop body
+
+    cds->setEllipticCoeffKernel(
+      cds->Nlocal,
+      cds->g0 * cds->idt,
+      is * cds->fieldOffset,
+      cds->fieldOffset,
+      cds->o_diff,
+      cds->o_rho,
+      cds->o_ellipticCoeff);
+    // o_BFDiag is only allocated by some callers; when present it is passed to scaledAddKernel together with o_ellipticCoeff
+    if(cds->o_BFDiag.ptr())
+      cds->scaledAddKernel(
+        cds->Nlocal,
+        1.0,
+        is * cds->fieldOffset,
+        cds->o_BFDiag,
+        1.0,
+        cds->fieldOffset,
+        cds->o_ellipticCoeff);
+    // solve field `is` for time + dt[0] and copy the result into this field's slot of o_S
+    occa::memory o_Snew = cdsSolve(is, cds, time + cds->dt[0]);
+    o_Snew.copyTo(o_S, cds->Ntotal * sizeof(dfloat), is * cds->fieldOffset * sizeof(dfloat));
+  }
+  timer::toc("scalarSolve");
+}
+
+void makef(nrs_t* nrs, dfloat time, occa::memory o_FU, occa::memory o_BF)
+{ // Adds the optional user source, filter relaxation and advection contributions to o_FU, then calls sumMakefKernel with o_BF as its final argument.
+  mesh_t* mesh = nrs->mesh;
+
+  if(udf.uEqnSource) { // user source callback is optional
+    timer::tic("udfUEqnSource", 1);
+    udf.uEqnSource(nrs, time, nrs->o_U, o_FU);
+    timer::toc("udfUEqnSource");
+  }
+
+  if(nrs->options.compareArgs("FILTER STABILIZATION", "RELAXATION")) // filter term only for the RELAXATION option
+    nrs->filterRTKernel(
+      mesh->Nelements,
+      nrs->o_filterMT,
+      nrs->filterS,
+      nrs->fieldOffset,
+      nrs->o_U,
+      o_FU);
+
+  occa::memory o_adv = nrs->o_wrk0; // default handle passed to sumMakefKernel as the advection buffer
+  if(nrs->options.compareArgs("ADVECTION", "TRUE")) {
+    if(nrs->Nsubsteps) {
+      o_adv = velocityStrongSubCycle(nrs, time, nrs->o_U); // subcycling path: advection comes from the subcycle routine
+    } else {
+      if(nrs->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
+        nrs->advectionStrongCubatureVolumeKernel(
+          mesh->Nelements,
+          mesh->o_vgeo,
+          mesh->o_cubvgeo,
+          mesh->o_cubDiffInterpT,
+          mesh->o_cubInterpT,
+          mesh->o_cubProjectT,
+          nrs->fieldOffset,
+          nrs->o_U,
+          nrs->o_wrk0);
+      else
+        nrs->advectionStrongVolumeKernel(
+          mesh->Nelements,
+          mesh->o_vgeo,
+          mesh->o_Dmatrices,
+          nrs->fieldOffset,
+          nrs->o_U,
+          nrs->o_wrk0);
+ // the advection result in o_wrk0 is combined into o_FU with weight -1.0 below
+      nrs->scaledAddKernel(
+        nrs->NVfields * nrs->fieldOffset,
+        -1.0,
+        0,
+        nrs->o_wrk0,
+        1.0,
+        0,
+        o_FU); // NOTE(review): looks like o_FU = -1.0 * o_wrk0 + 1.0 * o_FU — confirm kernel semantics
+    }
+  } else {
+    if(nrs->Nsubsteps) nrs->fillKernel(nrs->fieldOffset * nrs->NVfields, 0.0, o_adv); // no advection: zero o_adv, but only when subcycling is enabled
+  }
+
+  nrs->sumMakefKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    nrs->idt,
+    nrs->o_extbdfA,
+    nrs->o_extbdfB,
+    nrs->fieldOffset,
+    nrs->o_U,
+    o_adv,
+    o_FU,
+    o_BF);
+}
+
+void fluidSolve(nrs_t* nrs, dfloat time, occa::memory o_U)
+{ // Builds the velocity right-hand side via makef, shifts the stored o_FU/o_U blocks by one slot, then runs the pressure solve followed by the velocity solve.
+  mesh_t* mesh = nrs->mesh; // not referenced again in this function
+
+  timer::tic("makef", 1);
+  nrs->fillKernel(nrs->fieldOffset * nrs->NVfields, 0.0, nrs->o_FU); // zero the leading fieldOffset * NVfields entries of o_FU before makef runs
+  makef(nrs, time, nrs->o_FU, nrs->o_BF);
+  timer::toc("makef");
+
+  for (int s = nrs->Nstages; s > 1; s--) { // highest block first, so each block is read before it is overwritten
+    const size_t Nbyte = sizeof(dfloat) * nrs->fieldOffset * nrs->NVfields; // size_t: an int (dlong) byte count and (s - 1)*Nbyte overflow on large fields
+    nrs->o_FU.copyFrom(nrs->o_FU, Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte); // args: src, bytes, destOffset, srcOffset -> block s-2 moves to block s-1
+    nrs->o_U.copyFrom (nrs->o_U , Nbyte, (s - 1)*Nbyte, (s - 2)*Nbyte); // same shift for the velocity history
+  }
+
+  timer::tic("pressureSolve", 1);
+  nrs->setEllipticCoeffPressureKernel(
+    nrs->Nlocal,
+    nrs->fieldOffset,
+    nrs->o_rho,
+    nrs->o_ellipticCoeff);
+  occa::memory o_Pnew = tombo::pressureSolve(nrs, time + nrs->dt[0]); // solve at the new time level
+  nrs->o_P.copyFrom(o_Pnew, nrs->Ntotal * sizeof(dfloat));
+  timer::toc("pressureSolve");
+
+  timer::tic("velocitySolve", 1);
+  nrs->setEllipticCoeffKernel( // o_ellipticCoeff is rewritten here after the pressure solve has used it
+    nrs->Nlocal,
+    nrs->g0 * nrs->idt,
+    0 * nrs->fieldOffset,
+    nrs->fieldOffset,
+    nrs->o_mue,
+    nrs->o_rho,
+    nrs->o_ellipticCoeff);
+
+  occa::memory o_Unew = tombo::velocitySolve(nrs, time + nrs->dt[0]);
+  o_U.copyFrom(o_Unew, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); // write all NVfields components back into the caller's o_U
+  timer::toc("velocitySolve");
+}
+
+occa::memory velocityStrongSubCycle(nrs_t* nrs, dfloat time, occa::memory o_U)
+{ // Runs the velocity subcycling loops (history level -> substep -> RK stage) and returns nrs->o_wrk0, which the RK update kernel receives as its last argument.
+  mesh_t* mesh = nrs->mesh;
+
+  // Solve for Each SubProblem
+  for (int torder = nrs->ExplicitOrder - 1; torder >= 0; torder--) { // oldest history level first
+    // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt)
+    dlong toffset = torder * nrs->NVfields * nrs->fieldOffset; // offset of history level torder inside o_U
+    const dfloat b = nrs->extbdfB[torder];
+    if (torder == nrs->ExplicitOrder - 1)
+      nrs->scaledAddKernel(nrs->NVfields * nrs->fieldOffset, b, toffset,
+                           o_U, 0.0, 0, nrs->o_wrk0); // first level: weight 0.0 on o_wrk0 (presumably overwrites it — confirm kernel semantics)
+    else
+      nrs->scaledAddKernel(nrs->NVfields * nrs->fieldOffset, b, toffset,
+                           o_U, 1.0, 0, nrs->o_wrk0); // later levels: weight 1.0 on o_wrk0 (presumably accumulates — confirm kernel semantics)
+
+    // Advance subproblem from here from t^(n-torder) to t^(n-torder+1)
+    dfloat tsub = time;
+    for (int i = torder; i > 0 ; i--) tsub -= nrs->dt[i]; // tsub = time - (dt[1] + ... + dt[torder])
+    const dfloat sdt = nrs->dt[torder]/nrs->Nsubsteps; // substep size
+
+    for(int ststep = 0; ststep < nrs->Nsubsteps; ++ststep) {
+      const dfloat tstage = tsub + ststep * sdt; // start time of this substep
+
+      nrs->o_wrk0.copyFrom(nrs->o_wrk0, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat),
+                           nrs->NVfields * nrs->fieldOffset * sizeof(dfloat),0); // copy the first NVfields*fieldOffset block of o_wrk0 into the block right after it
+
+      for(int rk = 0; rk < nrs->SNrk; ++rk) {
+        // Extrapolate velocity to subProblem stage time
+        const dfloat t   = tstage +  sdt * nrs->Srkc[rk];
+        const dfloat tn0 = time;
+        const dfloat tn1 = time - nrs->dt[1];
+        const dfloat tn2 = time - (nrs->dt[1] + nrs->dt[2]);
+        switch(nrs->ExplicitOrder) { // extC = Lagrange weights at t through nodes tn0, tn1, tn2; no default case, so other orders leave extC unchanged
+        case 1:
+          nrs->extC[0] = 1;
+          nrs->extC[1] = 0;
+          nrs->extC[2] = 0;
+          break;
+        case 2:
+          nrs->extC[0] = (t - tn1) / (tn0 - tn1);
+          nrs->extC[1] = (t - tn0) / (tn1 - tn0);
+          nrs->extC[2] = 0;
+          break;
+        case 3:
+          nrs->extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2));
+          nrs->extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2));
+          nrs->extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1));
+          break;
+        }
+        nrs->o_extC.copyFrom(nrs->extC); // upload the host weights to the device buffer
+
+        if(mesh->NglobalGatherElements) { // first pass: elements in the global-gather list
+          if(nrs->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
+            nrs->subCycleStrongCubatureVolumeKernel(
+              mesh->NglobalGatherElements,
+              mesh->o_globalGatherElementList,
+              mesh->o_vgeo,
+              mesh->o_cubvgeo,
+              mesh->o_cubDiffInterpT,
+              mesh->o_cubInterpT,
+              mesh->o_cubProjectT,
+              nrs->fieldOffset,
+              rk * nrs->NVfields * nrs->fieldOffset,
+              mesh->o_invLMM,
+              nrs->o_extC,
+              o_U,
+              nrs->o_wrk0,
+              nrs->o_wrk6);
+          else
+            nrs->subCycleStrongVolumeKernel(
+              mesh->NglobalGatherElements,
+              mesh->o_globalGatherElementList,
+              mesh->o_vgeo,
+              mesh->o_Dmatrices,
+              nrs->fieldOffset,
+              rk * nrs->NVfields * nrs->fieldOffset,
+              mesh->o_invLMM,
+              nrs->o_extC,
+              o_U,
+              nrs->o_wrk0,
+              nrs->o_wrk6);
+        }
+
+        occa::memory o_rhs; // stays default-constructed if rk > 3
+        if(rk == 0) o_rhs = nrs->o_wrk6;
+        if(rk == 1) o_rhs = nrs->o_wrk9; // NOTE(review): kernels are given o_wrk6 plus an rk offset; presumably o_wrk9/12/15 alias those offsets — confirm
+        if(rk == 2) o_rhs = nrs->o_wrk12;
+        if(rk == 3) o_rhs = nrs->o_wrk15;
+
+        oogs::start(o_rhs, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh);                     // started before the local-gather pass, finished after it
+
+        if(mesh->NlocalGatherElements) { // second pass: elements in the local-gather list
+          if(nrs->options.compareArgs("ADVECTION TYPE", "CUBATURE"))
+            nrs->subCycleStrongCubatureVolumeKernel(
+              mesh->NlocalGatherElements,
+              mesh->o_localGatherElementList,
+              mesh->o_vgeo,
+              mesh->o_cubvgeo,
+              mesh->o_cubDiffInterpT,
+              mesh->o_cubInterpT,
+              mesh->o_cubProjectT,
+              nrs->fieldOffset,
+              rk * nrs->NVfields * nrs->fieldOffset,
+              mesh->o_invLMM,
+              nrs->o_extC,
+              o_U,
+              nrs->o_wrk0,
+              nrs->o_wrk6);
+          else
+            nrs->subCycleStrongVolumeKernel(
+              mesh->NlocalGatherElements,
+              mesh->o_localGatherElementList,
+              mesh->o_vgeo,
+              mesh->o_Dmatrices,
+              nrs->fieldOffset,
+              rk * nrs->NVfields * nrs->fieldOffset,
+              mesh->o_invLMM,
+              nrs->o_extC,
+              o_U,
+              nrs->o_wrk0,
+              nrs->o_wrk6);
+        }
+
+        oogs::finish(o_rhs, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh);                     // completes the ogsAdd exchange started above
+
+        nrs->subCycleRKUpdateKernel(
+          mesh->Nelements,
+          rk,
+          sdt,
+          nrs->fieldOffset,
+          nrs->o_Srka,
+          nrs->o_Srkb,
+          nrs->o_wrk3,
+          nrs->o_wrk6,
+          nrs->o_wrk0);
+      }
+    }
+  }
+  return nrs->o_wrk0; // handle to the work buffer, not a fresh allocation
+}
+
+occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is,
+                                  occa::memory o_U, occa::memory o_S)
+{ // Runs the subcycling loops (history level -> substep -> RK stage) for scalar `is`, passing o_U to the volume kernels, and returns cds->o_wrk0.
+  mesh_t* mesh = cds->meshV; // always meshV here, regardless of `is`
+
+  // Solve for Each SubProblem
+  for (int torder = (cds->ExplicitOrder - 1); torder >= 0; torder--) { // oldest history level first
+    // Initialize the subproblem scalar from history level torder of o_S
+    const dlong toffset = is * cds->fieldOffset +
+                          torder * cds->NSfields * cds->fieldOffset; // slot of scalar `is` within history level torder
+    if (torder == cds->ExplicitOrder - 1)
+      cds->scaledAddKernel(cds->fieldOffset, cds->extbdfB[torder],
+                           toffset, o_S, 0.0, 0, cds->o_wrk0); // first level: weight 0.0 on o_wrk0 (presumably overwrites it — confirm kernel semantics)
+    else
+      cds->scaledAddKernel(cds->fieldOffset, cds->extbdfB[torder],
+                           toffset, o_S, 1.0, 0, cds->o_wrk0); // later levels: weight 1.0 on o_wrk0 (presumably accumulates — confirm kernel semantics)
+
+    // Advance SubProblem to t^(n-torder+1)
+    dfloat tsub = time;
+    for (int i = torder; i > 0 ; i--) tsub -= cds->dt[i]; // tsub = time - (dt[1] + ... + dt[torder])
+    const dfloat sdt = cds->dt[torder]/cds->Nsubsteps; // substep size
+
+    for(int ststep = 0; ststep < cds->Nsubsteps; ++ststep) {
+      const dfloat tstage = tsub + ststep * sdt; // start time of this substep
+
+      cds->o_wrk0.copyFrom(cds->o_wrk0, cds->fieldOffset * sizeof(dfloat),
+                           cds->fieldOffset * sizeof(dfloat), 0); // copy the first fieldOffset block of o_wrk0 into the block right after it
+
+      for(int rk = 0; rk < cds->SNrk; ++rk) {
+        // Extrapolate velocity to subProblem stage time
+        const dfloat t   = tstage +  sdt * cds->Srkc[rk];
+        const dfloat tn0 = time;
+        const dfloat tn1 = time - cds->dt[1];
+        const dfloat tn2 = time - (cds->dt[1] + cds->dt[2]);
+        switch(cds->ExplicitOrder) { // extC = Lagrange weights at t through nodes tn0, tn1, tn2; no default case, so other orders leave extC unchanged
+        case 1:
+          cds->extC[0] = 1;
+          cds->extC[1] = 0;
+          cds->extC[2] = 0;
+          break;
+        case 2:
+          cds->extC[0] = (t - tn1) / (tn0 - tn1);
+          cds->extC[1] = (t - tn0) / (tn1 - tn0);
+          cds->extC[2] = 0;
+          break;
+        case 3:
+          cds->extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2));
+          cds->extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2));
+          cds->extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1));
+          break;
+        }
+        cds->o_extC.copyFrom(cds->extC); // upload the host weights to the device buffer
+
+        if(mesh->NglobalGatherElements) { // first pass: elements in the global-gather list
+          if(cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) // option is looked up per scalar
+            cds->subCycleStrongCubatureVolumeKernel(
+              mesh->NglobalGatherElements,
+              mesh->o_globalGatherElementList,
+              cds->vFieldOffset,
+              rk * cds->fieldOffset,
+              mesh->o_vgeo,
+              mesh->o_cubvgeo,
+              mesh->o_cubDiffInterpT,
+              mesh->o_cubInterpT,
+              mesh->o_cubProjectT,
+              mesh->o_invLMM,
+              cds->o_extC,
+              o_U,
+              cds->o_wrk0,
+              cds->o_wrk2);
+          else
+            cds->subCycleStrongVolumeKernel(
+              mesh->NglobalGatherElements,
+              mesh->o_globalGatherElementList,
+              cds->vFieldOffset,
+              rk * cds->fieldOffset,
+              mesh->o_vgeo,
+              mesh->o_Dmatrices,
+              mesh->o_invLMM,
+              cds->o_extC,
+              o_U,
+              cds->o_wrk0,
+              cds->o_wrk2);
+        }
+
+        occa::memory o_rhs; // stays default-constructed if rk > 3
+        if(rk == 0) o_rhs = cds->o_wrk2;
+        if(rk == 1) o_rhs = cds->o_wrk3; // NOTE(review): kernels are given o_wrk2 plus an rk offset; presumably o_wrk3/4/5 alias those offsets — confirm
+        if(rk == 2) o_rhs = cds->o_wrk4;
+        if(rk == 3) o_rhs = cds->o_wrk5;
+
+        oogs::start(o_rhs, 1, cds->fieldOffset, ogsDfloat, ogsAdd, cds->gsh); // started before the local-gather pass, finished after it
+
+        if(mesh->NlocalGatherElements) { // second pass: elements in the local-gather list
+          if(cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE"))
+            cds->subCycleStrongCubatureVolumeKernel(
+              mesh->NlocalGatherElements,
+              mesh->o_localGatherElementList,
+              cds->vFieldOffset,
+              rk * cds->fieldOffset,
+              mesh->o_vgeo,
+              mesh->o_cubvgeo,
+              mesh->o_cubDiffInterpT,
+              mesh->o_cubInterpT,
+              mesh->o_cubProjectT,
+              mesh->o_invLMM,
+              cds->o_extC, 
+              o_U,
+              cds->o_wrk0,
+              cds->o_wrk2);
+          else
+            cds->subCycleStrongVolumeKernel(
+              mesh->NlocalGatherElements,
+              mesh->o_localGatherElementList,
+              cds->vFieldOffset,
+              rk * cds->fieldOffset,
+              mesh->o_vgeo,
+              mesh->o_Dmatrices,
+              mesh->o_invLMM,
+              cds->o_extC,
+              o_U,
+              cds->o_wrk0,
+              cds->o_wrk2);
+        }
+
+        oogs::finish(o_rhs, 1, cds->fieldOffset, ogsDfloat, ogsAdd, cds->gsh); // completes the ogsAdd exchange started above
+
+        cds->subCycleRKUpdateKernel(
+          mesh->Nelements,
+          rk,
+          sdt,
+          cds->fieldOffset,
+          cds->o_Srka,
+          cds->o_Srkb,
+          cds->o_wrk1,
+          cds->o_wrk2,
+          cds->o_wrk0);
+      }
+    }
+  }
+  return cds->o_wrk0; // handle to the work buffer, not a fresh allocation
+}
+
+// qtl = 1/(rho*cp*T) * (div[k*grad[T] ] + qvol)
+void qthermal(nrs_t* nrs, dfloat time, occa::memory o_div)
+{ // Writes the result into o_div: gradient of cds->o_S -> gather-scatter + inverse mass -> optional user source -> qtlKernel -> gather-scatter + inverse mass.
+  cds_t* cds = nrs->cds;
+  mesh_t* mesh = nrs->mesh;
+
+  nrs->gradientVolumeKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    cds->o_S,
+    cds->o_wrk0); // gradient of o_S goes into o_wrk0
+
+  oogs::startFinish(cds->o_wrk0, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh); // ogsAdd exchange over NVfields components
+
+  nrs->invMassMatrixKernel(
+    mesh->Nelements,
+    nrs->fieldOffset,
+    nrs->NVfields,
+    mesh->o_vgeo,
+    mesh->o_invLMM,
+    cds->o_wrk0);
+
+  if(udf.sEqnSource) { // user source callback is optional
+    timer::tic("udfSEqnSource", 1);
+    udf.sEqnSource(nrs, time, cds->o_S, cds->o_wrk3); // user source is written to o_wrk3
+    timer::toc("udfSEqnSource");
+  } else {
+    nrs->fillKernel(mesh->Nelements * mesh->Np, 0.0, cds->o_wrk3); // no user source: zero Nelements * Np entries of o_wrk3
+  }
+
+  nrs->qtlKernel(
+    mesh->Nelements,
+    mesh->o_vgeo,
+    mesh->o_Dmatrices,
+    nrs->fieldOffset,
+    cds->o_wrk0,
+    cds->o_S,
+    cds->o_diff,
+    cds->o_rho,
+    cds->o_wrk3,
+    o_div);
+
+  oogs::startFinish(o_div, 1, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); // single-component ogsAdd exchange on the result
+
+  nrs->invMassMatrixKernel(
+    mesh->Nelements,
+    nrs->fieldOffset,
+    1,
+    mesh->o_vgeo,
+    mesh->o_invLMM,
+    o_div);
+}
diff --git a/src/timeStepper/runTime.hpp b/src/timeStepper/runTime.hpp
new file mode 100644
index 000000000..eb4654cb0
--- /dev/null
+++ b/src/timeStepper/runTime.hpp
@@ -0,0 +1,7 @@
+#if !defined(nekrs_runtime_hpp_)
+#define nekrs_runtime_hpp_
+
+#include "nrs.hpp"
+void runStep(nrs_t* nrs, dfloat time, dfloat dt, int tstep); // declaration only; NOTE(review): presumably performs one time step of size dt — confirm against the definition
+
+#endif // nekrs_runtime_hpp_
diff --git a/src/udf/CMakeLists.txt b/src/udf/CMakeLists.txt
index d9530d498..ef907644f 100644
--- a/src/udf/CMakeLists.txt
+++ b/src/udf/CMakeLists.txt
@@ -14,27 +14,26 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} $ENV{NEKRS_LIBP_DEFINES}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} $ENV{NEKRS_LIBP_DEFINES}")
 
 set(NEKRS_INSTALL_DIR $ENV{NEKRS_INSTALL_DIR})
-set(HDRDIR      ${NEKRS_INSTALL_DIR}/libparanumal/include)
 set(OGSDIR      ${NEKRS_INSTALL_DIR}/gatherScatter)
 set(GSDIR       ${NEKRS_INSTALL_DIR}/gslib)
-set(ALMONDDIR   ${NEKRS_INSTALL_DIR}/parAlmond)
-set(ELLIPTICDIR ${NEKRS_INSTALL_DIR}/elliptic)
 set(CDSDIR      ${NEKRS_INSTALL_DIR}/cds)
-set(INSDIR      ${NEKRS_INSTALL_DIR}/ins)
 set(INCLUDE_DIRS
   ${CMAKE_CURRENT_SOURCE_DIR}/../..
-  ${INSDIR} 
   ${CDSDIR}
-  ${HDRDIR}
   ${OGSDIR}
   ${GSDIR}
-  ${ELLIPTICDIR}
-  ${ALMONDDIR}
   ${NEKRS_INSTALL_DIR}
   ${NEKRS_INSTALL_DIR}/include
+  ${NEKRS_INSTALL_DIR}/include/mesh
+  ${NEKRS_INSTALL_DIR}/include/io
   ${NEKRS_INSTALL_DIR}/include/core
+  ${NEKRS_INSTALL_DIR}/include/core/utils
+  ${NEKRS_INSTALL_DIR}/include/timeStepper
   ${NEKRS_INSTALL_DIR}/include/udf
+  ${NEKRS_INSTALL_DIR}/include/elliptic
+  ${NEKRS_INSTALL_DIR}/include/elliptic/parAlmond
   ${NEKRS_INSTALL_DIR}/include/nekInterface
+  ${NEKRS_INSTALL_DIR}/include/cds
   ${NEKRS_INSTALL_DIR}/include/linAlg
   ${NEKRS_INSTALL_DIR}/occa/include
 )
diff --git a/src/udf/udf.cpp b/src/udf/udf.cpp
index 3b00e2c30..dc4aab6f5 100644
--- a/src/udf/udf.cpp
+++ b/src/udf/udf.cpp
@@ -75,24 +75,24 @@ void udfLoad(void)
 {
   *(void**)(&udf.setup0) = udfLoadFunction("UDF_Setup0",0);
   *(void**)(&udf.setup) = udfLoadFunction("UDF_Setup",1);
-  *(void**)(&udf.loadKernels) = udfLoadFunction("UDF_LoadKernels",1);
-  *(void**)(&udf.executeStep) = udfLoadFunction("UDF_ExecuteStep",1);
+  *(void**)(&udf.loadKernels) = udfLoadFunction("UDF_LoadKernels",0);
+  *(void**)(&udf.executeStep) = udfLoadFunction("UDF_ExecuteStep",0);
 }
 
-occa::kernel udfBuildKernel(ins_t* ins, const char* function)
+occa::kernel udfBuildKernel(nrs_t* nrs, const char* function)
 {
   int rank;
-  mesh_t* mesh = ins->mesh;
+  mesh_t* mesh = nrs->mesh;
   MPI_Comm_rank(mesh->comm, &rank);
 
   string install_dir;
-  occa::properties kernelInfo = *ins->kernelInfo;
+  occa::properties kernelInfo = *nrs->kernelInfo;
   install_dir.assign(getenv("NEKRS_INSTALL_DIR"));
-  const string bcDataFile = install_dir + "/include/insBcData.h";
+  const string bcDataFile = install_dir + "/include/core/bcData.h";
   kernelInfo["includes"] += bcDataFile.c_str();
 
   string oudf;
-  ins->options.getArgs("DATA FILE", oudf);
+  nrs->options.getArgs("DATA FILE", oudf);
 
   occa::kernel k;
   for (int r = 0; r < 2; r++) {
diff --git a/src/udf/udf.hpp b/src/udf/udf.hpp
index c19085c0c..229a89d8f 100644
--- a/src/udf/udf.hpp
+++ b/src/udf/udf.hpp
@@ -1,27 +1,29 @@
 #if !defined(nekrs_udf_hpp_)
 #define nekrs_udf_hpp_
 
+#define ins_t nrs_t
+
 #include "nrs.hpp"
 #include "nekInterfaceAdapter.hpp"
 
 extern "C" {
 void UDF_Setup0(MPI_Comm comm, setupAide &options);
-void UDF_Setup(ins_t* ins);
-void UDF_LoadKernels(ins_t* ins);
-void UDF_ExecuteStep(ins_t* ins, dfloat time, int tstep);
+void UDF_Setup(nrs_t* nrs);
+void UDF_LoadKernels(nrs_t* nrs);
+void UDF_ExecuteStep(nrs_t* nrs, dfloat time, int tstep);
 };
 
 typedef void (* udfsetup0)(MPI_Comm comm, setupAide &options);
-typedef void (* udfsetup)(ins_t* ins);
-typedef void (* udfloadKernels)(ins_t* ins);
-typedef void (* udfexecuteStep)(ins_t* ins, dfloat time, int tstep);
+typedef void (* udfsetup)(nrs_t* nrs);
+typedef void (* udfloadKernels)(nrs_t* nrs);
+typedef void (* udfexecuteStep)(nrs_t* nrs, dfloat time, int tstep);
 
-typedef void (* udfuEqnSource)(ins_t* ins, dfloat time, occa::memory o_U, occa::memory o_FU);
-typedef void (* udfsEqnSource)(ins_t* ins, dfloat time, occa::memory o_S, occa::memory o_SU);
-typedef void (* udfproperties)(ins_t* ins, dfloat time, occa::memory o_U,
+typedef void (* udfuEqnSource)(nrs_t* nrs, dfloat time, occa::memory o_U, occa::memory o_FU);
+typedef void (* udfsEqnSource)(nrs_t* nrs, dfloat time, occa::memory o_S, occa::memory o_SU);
+typedef void (* udfproperties)(nrs_t* nrs, dfloat time, occa::memory o_U,
                                occa::memory o_S, occa::memory o_UProp,
                                occa::memory o_SProp);
-typedef void (* udfdiv)(ins_t* ins, dfloat time, occa::memory o_div);
+typedef void (* udfdiv)(nrs_t* nrs, dfloat time, occa::memory o_div);
 
 typedef struct
 {
@@ -40,6 +42,6 @@ extern UDF udf;
 void udfBuild(const char* udfFile);
 void udfLoad(void);
 void* udfLoadFunction(const char* fname, int errchk);
-occa::kernel udfBuildKernel(ins_t* ins, const char* function);
+occa::kernel udfBuildKernel(nrs_t* nrs, const char* function);
 
 #endif