From 3e2773698387869ef4884608ca71ca090d3a031f Mon Sep 17 00:00:00 2001 From: stgeke Date: Sat, 8 Aug 2020 09:20:30 +0200 Subject: [PATCH] Add oogs (#129) --- 3rd_party/gslib/ogs/include/ogsKernels.hpp | 6 +- 3rd_party/gslib/ogs/ogs.hpp | 20 +- 3rd_party/gslib/ogs/okl/gather.okl | 44 --- 3rd_party/gslib/ogs/okl/oogs.okl | 97 ++++++ 3rd_party/gslib/ogs/okl/scatterMany.okl | 2 +- 3rd_party/gslib/ogs/src/ogsGather.cpp | 4 - 3rd_party/gslib/ogs/src/ogsGatherMany.cpp | 7 - 3rd_party/gslib/ogs/src/ogsGatherScatter.cpp | 7 - .../gslib/ogs/src/ogsGatherScatterMany.cpp | 7 - .../gslib/ogs/src/ogsGatherScatterVec.cpp | 7 - 3rd_party/gslib/ogs/src/ogsGatherVec.cpp | 8 - 3rd_party/gslib/ogs/src/ogsHostSetup.c | 2 +- 3rd_party/gslib/ogs/src/ogsKernels.cpp | 315 +++++++++--------- 3rd_party/gslib/ogs/src/ogsScatterMany.cpp | 7 - 3rd_party/gslib/ogs/src/ogsScatterVec.cpp | 7 - 3rd_party/gslib/ogs/src/ogsSetup.cpp | 4 +- 3rd_party/gslib/ogs/src/oogs.cpp | 231 ++++++++----- RELEASE.md | 2 +- examples/conj_ht/conj_ht.usr | 12 +- examples/conj_ht/conj_ht_ci.h | 6 +- examples/ethier/ethier.usr | 16 +- examples/ethier/ethier_ci.h | 2 +- examples/lowMach/lowMach.par | 3 +- examples/lowMach/lowMach.usr | 14 +- examples/lowMach/lowMach_ci.h | 2 +- scripts/nrsqsub_lassen | 1 + scripts/nrsqsub_summit | 1 + src/core/cds.cpp | 11 +- src/core/cds.h | 2 + src/core/ins.h | 2 + src/core/insSetup.cpp | 16 + src/core/runTime.cpp | 11 +- src/core/tombo.cpp | 13 +- src/libP/solvers/elliptic/elliptic.h | 2 + .../elliptic/src/ellipticBuildJacobi.c | 15 +- .../src/ellipticBuildMultigridLevel.c | 19 ++ .../elliptic/src/ellipticMultiGridLevel.c | 2 +- .../elliptic/src/ellipticMultiGridSchwarz.c | 62 +--- .../solvers/elliptic/src/ellipticOperator.c | 72 +--- .../solvers/elliptic/src/ellipticSolveSetup.c | 47 ++- src/nekInterface/NEKINTF | 4 +- src/nekInterface/nekInterface.f | 24 +- src/nekInterface/nekInterfaceAdapter.cpp | 11 +- src/nekInterface/nekInterfaceAdapter.hpp | 3 +- 44 files changed, 622 insertions(+), 528 deletions(-) create mode 100644 3rd_party/gslib/ogs/okl/oogs.okl diff --git a/3rd_party/gslib/ogs/include/ogsKernels.hpp b/3rd_party/gslib/ogs/include/ogsKernels.hpp index 8b4bf09f9..f223e724b 100644 --- a/3rd_party/gslib/ogs/include/ogsKernels.hpp +++ b/3rd_party/gslib/ogs/include/ogsKernels.hpp @@ -107,13 +107,11 @@ namespace ogs { extern occa::kernel gatherKernel_floatAdd; - extern occa::kernel gatherKernel_floatAddSelf; extern occa::kernel gatherKernel_floatMul; extern occa::kernel gatherKernel_floatMin; extern occa::kernel gatherKernel_floatMax; extern occa::kernel gatherKernel_doubleAdd; - extern occa::kernel gatherKernel_doubleAddSelf; extern occa::kernel gatherKernel_doubleMul; extern occa::kernel gatherKernel_doubleMin; extern occa::kernel gatherKernel_doubleMax; @@ -129,7 +127,6 @@ namespace ogs { extern occa::kernel gatherKernel_longMax; - extern occa::kernel gatherVecKernel_floatAdd; extern occa::kernel gatherVecKernel_floatMul; extern occa::kernel gatherVecKernel_floatMin; @@ -151,7 +148,6 @@ namespace ogs { extern occa::kernel gatherVecKernel_longMax; - extern occa::kernel gatherManyKernel_floatAdd; extern occa::kernel gatherManyKernel_floatMul; extern occa::kernel gatherManyKernel_floatMin; @@ -193,6 +189,8 @@ namespace ogs { void initKernels(MPI_Comm comm, occa::device device); + extern occa::properties kernelInfo; + void freeKernels(); } diff --git a/3rd_party/gslib/ogs/ogs.hpp b/3rd_party/gslib/ogs/ogs.hpp index 7aee1fa79..0eac7fe3b 100644 --- a/3rd_party/gslib/ogs/ogs.hpp +++ b/3rd_party/gslib/ogs/ogs.hpp @@ -112,6 +112,7 @@ SOFTWARE. #ifndef OGS_HPP #define OGS_HPP 1 +#include #include #include #include @@ -119,7 +120,7 @@ SOFTWARE. #include "mpi.h" #include "types.h" -#define OGS_ENABLE_TIMER +//#define OGS_ENABLE_TIMER #ifdef OGS_ENABLE_TIMER #include "timer.hpp" #endif @@ -246,18 +247,23 @@ typedef struct { occa::memory o_scatterOffsets, o_gatherOffsets; occa::memory o_scatterIds, o_gatherIds; + occa::kernel packBufDoubleKernel, unpackBufDoubleKernel; + occa::kernel packBufFloatKernel, unpackBufFloatKernel; + oogs_mode mode; } oogs_t; namespace oogs{ -void gatherScatter(void *v, const char *type, const char *op, oogs_t *h); -void gatherScatter(occa::memory o_v, const char *type, const char *op, oogs_t *h); -void start(occa::memory o_v, const char *type, const char *op, oogs_t *h); -void finish(occa::memory o_v, const char *type, const char *op, oogs_t *h); -oogs_t *setup(dlong N, hlong *ids, const char *type, MPI_Comm &comm, - int verbose, occa::device device, oogs_mode mode); +void start(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h); +void finish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h); +void startFinish(void *v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h); +void startFinish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h); +oogs_t *setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::function callback, oogs_mode gsMode); +oogs_t *setup(dlong N, hlong *ids, const int k, const dlong stride, const char *type, MPI_Comm &comm, + int verbose, occa::device device, std::function callback, oogs_mode mode); +void destroy(oogs_t *h); } diff --git a/3rd_party/gslib/ogs/okl/gather.okl b/3rd_party/gslib/ogs/okl/gather.okl index 287f34eb5..e86a72ec1 100644 --- a/3rd_party/gslib/ogs/okl/gather.okl +++ b/3rd_party/gslib/ogs/okl/gather.okl @@ -47,28 +47,6 @@ SOFTWARE. } } -@kernel void gather_floatAddSelf(const dlong Ngather, - @restrict const dlong * gatherStarts, - @restrict const dlong * gatherIds, - @restrict const float * q, - @restrict float * gatherq){ - - for(dlong g=0;gNhaloGather*Nbytes; -#ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); -#endif // MPI based gather using libgs ogsHostGatherMany(H, k, type, op, ogs->haloGshNonSym); -#ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); -#endif // copy totally gather halo data back from HOST to DEVICE if (ogs->NownedHalo) diff --git a/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp b/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp index 92d87525c..3fb74eff2 100644 --- a/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp +++ b/3rd_party/gslib/ogs/src/ogsGatherScatter.cpp @@ -47,7 +47,6 @@ void ogsGatherScatterStart(occa::memory o_v, const char *type, const char *op, ogs_t *ogs){ - size_t Nbytes; if (!strcmp(type, "double")) @@ -104,13 +103,7 @@ void ogsGatherScatterFinish(occa::memory o_v, ogs->device.finish(); // MPI based gather scatter using libgs -#ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); -#endif ogsHostGatherScatter(ogs::haloBuf, type, op, ogs->haloGshSym); -#ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); -#endif // copy totally gather halo data back from HOST to DEVICE ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes, 0, "async: true"); diff --git a/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp b/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp index d66c092cb..16ef0305c 100644 --- a/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp +++ b/3rd_party/gslib/ogs/src/ogsGatherScatterMany.cpp @@ -67,7 +67,6 @@ void ogsGatherScatterManyStart(occa::memory o_v, const char *type, const char *op, ogs_t *ogs){ - size_t Nbytes; if (!strcmp(type, "float")) Nbytes = sizeof(float); @@ -127,14 +126,8 @@ void ogsGatherScatterManyFinish(occa::memory o_v, void* H[k]; for (int i=0;iNhaloGather*Nbytes; -#ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); -#endif // MPI based gather scatter using libgs ogsHostGatherScatterMany(H, k, type, op, ogs->haloGshSym); -#ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); -#endif // copy totally gather halo data back from HOST to DEVICE ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); diff --git a/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp b/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp index 2ee7073cb..50f99933c 100644 --- a/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp +++ b/3rd_party/gslib/ogs/src/ogsGatherScatterVec.cpp @@ -50,7 +50,6 @@ void ogsGatherScatterVecStart(occa::memory o_v, const char *type, const char *op, ogs_t *ogs){ - size_t Nbytes; if (!strcmp(type, "float")) Nbytes = sizeof(float); @@ -106,14 +105,8 @@ void ogsGatherScatterVecFinish(occa::memory o_v, ogs->device.setStream(ogs::dataStream); ogs->device.finish(); -#ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); -#endif // MPI based gather scatter using libgs ogsHostGatherScatterVec(ogs::haloBuf, k, type, op, ogs->haloGshSym); -#ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); -#endif // copy totally gather halo data back from HOST to DEVICE ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); diff --git a/3rd_party/gslib/ogs/src/ogsGatherVec.cpp b/3rd_party/gslib/ogs/src/ogsGatherVec.cpp index c5d393bc6..512feb6b9 100644 --- a/3rd_party/gslib/ogs/src/ogsGatherVec.cpp +++ b/3rd_party/gslib/ogs/src/ogsGatherVec.cpp @@ -51,7 +51,6 @@ void ogsGatherVecStart(occa::memory o_gv, const char *type, const char *op, ogs_t *ogs){ - size_t Nbytes; if (!strcmp(type, "float")) Nbytes = sizeof(float); @@ -80,7 +79,6 @@ void ogsGatherVecStart(occa::memory o_gv, ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); ogs->device.setStream(ogs::defaultStream); } - } @@ -108,14 +106,8 @@ void ogsGatherVecFinish(occa::memory o_gv, ogs->device.setStream(ogs::dataStream); ogs->device.finish(); -#ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); -#endif // MPI based gather using libgs ogsHostGatherVec(ogs::haloBuf, k, type, op, ogs->haloGshNonSym); -#ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); -#endif // copy totally gather halo data back from HOST to DEVICE if (ogs->NownedHalo) diff --git a/3rd_party/gslib/ogs/src/ogsHostSetup.c b/3rd_party/gslib/ogs/src/ogsHostSetup.c index f3f0e4d96..f48f6ae79 100644 --- a/3rd_party/gslib/ogs/src/ogsHostSetup.c +++ b/3rd_party/gslib/ogs/src/ogsHostSetup.c @@ -58,7 +58,7 @@ void *ogsHostSetup(MPI_Comm meshComm, id[n] = (slong) gatherGlobalNodes[n]; } - struct gs_data *gsh = gs_setup(id, NuniqueBases, &com, nonsymm, gs_pairwise, verbose); // gs_auto, gs_crystal_router, gs_pw + struct gs_data *gsh = gs_setup(id, NuniqueBases, &com, nonsymm, gs_pairwise, 0); // gs_auto, gs_crystal_router, gs_pw free(id); diff --git a/3rd_party/gslib/ogs/src/ogsKernels.cpp b/3rd_party/gslib/ogs/src/ogsKernels.cpp index d6a8b34e7..1ad2ac567 100644 --- a/3rd_party/gslib/ogs/src/ogsKernels.cpp +++ b/3rd_party/gslib/ogs/src/ogsKernels.cpp @@ -41,6 +41,8 @@ namespace ogs { occa::stream defaultStream; occa::stream dataStream; + occa::properties kernelInfo; + occa::kernel gatherScatterKernel_floatAdd; occa::kernel gatherScatterKernel_floatMul; occa::kernel gatherScatterKernel_floatMin; @@ -91,12 +93,10 @@ namespace ogs { occa::kernel gatherScatterManyKernel_longMax; occa::kernel gatherKernel_floatAdd; - occa::kernel gatherKernel_floatAddSelf; occa::kernel gatherKernel_floatMul; occa::kernel gatherKernel_floatMin; occa::kernel gatherKernel_floatMax; occa::kernel gatherKernel_doubleAdd; - occa::kernel gatherKernel_doubleAddSelf; occa::kernel gatherKernel_doubleMul; occa::kernel gatherKernel_doubleMin; occa::kernel gatherKernel_doubleMax; @@ -165,39 +165,38 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) { ogs::defaultStream = device.getStream(); ogs::dataStream = device.createStream(); - occa::properties kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); + ogs::kernelInfo["defines"].asObject(); + ogs::kernelInfo["includes"].asArray(); + ogs::kernelInfo["header"].asArray(); + ogs::kernelInfo["flags"].asObject(); if(sizeof(dlong)==4){ - kernelInfo["defines/" "dlong"]="int"; + ogs::kernelInfo["defines/" "dlong"]="int"; } if(sizeof(dlong)==8){ - kernelInfo["defines/" "dlong"]="long long int"; + ogs::kernelInfo["defines/" "dlong"]="long long int"; } if(sizeof(dfloat) == sizeof(double)){ - kernelInfo["defines/" "dfloat"]= "double"; - kernelInfo["defines/" "dfloat4"]= "double4"; + ogs::kernelInfo["defines/" "dfloat"]= "double"; + ogs::kernelInfo["defines/" "dfloat4"]= "double4"; } else if(sizeof(dfloat) == sizeof(float)){ - kernelInfo["defines/" "dfloat"]= "float"; - kernelInfo["defines/" "dfloat4"]= "float4"; + ogs::kernelInfo["defines/" "dfloat"]= "float"; + ogs::kernelInfo["defines/" "dfloat4"]= "float4"; } if(device.mode()=="OpenCL"){ - //kernelInfo["compiler_flags"] += "-cl-opt-disable"; + //ogs::kernelInfo["compiler_flags"] += "-cl-opt-disable"; } if(device.mode()=="CUDA"){ // add backend compiler optimization for CUDA - kernelInfo["compiler_flags"] += " --ftz=true "; - kernelInfo["compiler_flags"] += " --prec-div=false "; - kernelInfo["compiler_flags"] += " --prec-sqrt=false "; - kernelInfo["compiler_flags"] += " --use_fast_math "; - kernelInfo["compiler_flags"] += " --fmad=true "; // compiler option for cuda + ogs::kernelInfo["compiler_flags"] += " --ftz=true "; + ogs::kernelInfo["compiler_flags"] += " --prec-div=false "; + ogs::kernelInfo["compiler_flags"] += " --prec-sqrt=false "; + ogs::kernelInfo["compiler_flags"] += " --use_fast_math "; + ogs::kernelInfo["compiler_flags"] += " --fmad=true "; // compiler option for cuda } if (rank==0) printf("Compiling GatherScatter Kernels...");fflush(stdout); @@ -205,146 +204,144 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) { for (int r=0;r<2;r++){ if ((r==0 && rank==0) || (r==1 && rank>0)) { - ogs::gatherScatterKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatAdd", kernelInfo); - ogs::gatherScatterKernel_floatMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMul", kernelInfo); - ogs::gatherScatterKernel_floatMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMin", kernelInfo); - ogs::gatherScatterKernel_floatMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMax", kernelInfo); - - ogs::gatherScatterKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleAdd", kernelInfo); - ogs::gatherScatterKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleMul", kernelInfo); - ogs::gatherScatterKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleMin", kernelInfo); - ogs::gatherScatterKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleMax", kernelInfo); - - ogs::gatherScatterKernel_intAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intAdd", kernelInfo); - ogs::gatherScatterKernel_intMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intMul", kernelInfo); - ogs::gatherScatterKernel_intMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intMin", kernelInfo); - ogs::gatherScatterKernel_intMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intMax", kernelInfo); - - ogs::gatherScatterKernel_longAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longAdd", kernelInfo); - ogs::gatherScatterKernel_longMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longMul", kernelInfo); - ogs::gatherScatterKernel_longMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longMin", kernelInfo); - ogs::gatherScatterKernel_longMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longMax", kernelInfo); - - ogs::gatherScatterVecKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatAdd", kernelInfo); - ogs::gatherScatterVecKernel_floatMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatMul", kernelInfo); - ogs::gatherScatterVecKernel_floatMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatMin", kernelInfo); - ogs::gatherScatterVecKernel_floatMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatMax", kernelInfo); - - ogs::gatherScatterVecKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleAdd", kernelInfo); - ogs::gatherScatterVecKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleMul", kernelInfo); - ogs::gatherScatterVecKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleMin", kernelInfo); - ogs::gatherScatterVecKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleMax", kernelInfo); - - ogs::gatherScatterVecKernel_intAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intAdd", kernelInfo); - ogs::gatherScatterVecKernel_intMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intMul", kernelInfo); - ogs::gatherScatterVecKernel_intMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intMin", kernelInfo); - ogs::gatherScatterVecKernel_intMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intMax", kernelInfo); - - ogs::gatherScatterVecKernel_longAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longAdd", kernelInfo); - ogs::gatherScatterVecKernel_longMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longMul", kernelInfo); - ogs::gatherScatterVecKernel_longMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longMin", kernelInfo); - ogs::gatherScatterVecKernel_longMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longMax", kernelInfo); - - ogs::gatherScatterManyKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatAdd", kernelInfo); - ogs::gatherScatterManyKernel_floatMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatMul", kernelInfo); - ogs::gatherScatterManyKernel_floatMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatMin", kernelInfo); - ogs::gatherScatterManyKernel_floatMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatMax", kernelInfo); - - ogs::gatherScatterManyKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleAdd", kernelInfo); - ogs::gatherScatterManyKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleMul", kernelInfo); - ogs::gatherScatterManyKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleMin", kernelInfo); - ogs::gatherScatterManyKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleMax", kernelInfo); - - ogs::gatherScatterManyKernel_intAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intAdd", kernelInfo); - ogs::gatherScatterManyKernel_intMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intMul", kernelInfo); - ogs::gatherScatterManyKernel_intMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intMin", kernelInfo); - ogs::gatherScatterManyKernel_intMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intMax", kernelInfo); - - ogs::gatherScatterManyKernel_longAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longAdd", kernelInfo); - ogs::gatherScatterManyKernel_longMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMul", kernelInfo); - ogs::gatherScatterManyKernel_longMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMin", kernelInfo); - ogs::gatherScatterManyKernel_longMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMax", kernelInfo); - - - - ogs::gatherKernel_floatAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatAdd", kernelInfo); - ogs::gatherKernel_floatAddSelf = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatAddSelf", kernelInfo); - ogs::gatherKernel_floatMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatMul", kernelInfo); - ogs::gatherKernel_floatMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatMin", kernelInfo); - ogs::gatherKernel_floatMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatMax", kernelInfo); - - ogs::gatherKernel_doubleAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleAdd", kernelInfo); - ogs::gatherKernel_doubleAddSelf = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleAddSelf", kernelInfo); - ogs::gatherKernel_doubleMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleMul", kernelInfo); - ogs::gatherKernel_doubleMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleMin", kernelInfo); - ogs::gatherKernel_doubleMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleMax", kernelInfo); - - ogs::gatherKernel_intAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_intAdd", kernelInfo); - ogs::gatherKernel_intMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_intMul", kernelInfo); - ogs::gatherKernel_intMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_intMin", kernelInfo); - ogs::gatherKernel_intMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_intMax", kernelInfo); - - ogs::gatherKernel_longAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_longAdd", kernelInfo); - ogs::gatherKernel_longMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_longMul", kernelInfo); - ogs::gatherKernel_longMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_longMin", kernelInfo); - ogs::gatherKernel_longMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_longMax", kernelInfo); - - ogs::gatherVecKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatAdd", kernelInfo); - ogs::gatherVecKernel_floatMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatMul", kernelInfo); - ogs::gatherVecKernel_floatMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatMin", kernelInfo); - ogs::gatherVecKernel_floatMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatMax", kernelInfo); - - ogs::gatherVecKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleAdd", kernelInfo); - ogs::gatherVecKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleMul", kernelInfo); - ogs::gatherVecKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleMin", kernelInfo); - ogs::gatherVecKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleMax", kernelInfo); - - ogs::gatherVecKernel_intAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intAdd", kernelInfo); - ogs::gatherVecKernel_intMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intMul", kernelInfo); - ogs::gatherVecKernel_intMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intMin", kernelInfo); - ogs::gatherVecKernel_intMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intMax", kernelInfo); - - ogs::gatherVecKernel_longAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longAdd", kernelInfo); - ogs::gatherVecKernel_longMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longMul", kernelInfo); - ogs::gatherVecKernel_longMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longMin", kernelInfo); - ogs::gatherVecKernel_longMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longMax", kernelInfo); - - ogs::gatherManyKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatAdd", kernelInfo); - ogs::gatherManyKernel_floatMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatMul", kernelInfo); - ogs::gatherManyKernel_floatMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatMin", kernelInfo); - ogs::gatherManyKernel_floatMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatMax", kernelInfo); - - ogs::gatherManyKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleAdd", kernelInfo); - ogs::gatherManyKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleMul", kernelInfo); - ogs::gatherManyKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleMin", kernelInfo); - ogs::gatherManyKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleMax", kernelInfo); - - ogs::gatherManyKernel_intAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intAdd", kernelInfo); - ogs::gatherManyKernel_intMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intMul", kernelInfo); - ogs::gatherManyKernel_intMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intMin", kernelInfo); - ogs::gatherManyKernel_intMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intMax", kernelInfo); - - ogs::gatherManyKernel_longAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longAdd", kernelInfo); - ogs::gatherManyKernel_longMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMul", kernelInfo); - ogs::gatherManyKernel_longMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMin", kernelInfo); - ogs::gatherManyKernel_longMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMax", kernelInfo); - - - - ogs::scatterKernel_float = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_float", kernelInfo); - ogs::scatterKernel_double = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_double", kernelInfo); - ogs::scatterKernel_int = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_int", kernelInfo); - ogs::scatterKernel_long = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_long", kernelInfo); - - ogs::scatterVecKernel_float = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_float", kernelInfo); - ogs::scatterVecKernel_double = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_double", kernelInfo); - ogs::scatterVecKernel_int = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_int", kernelInfo); - ogs::scatterVecKernel_long = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_long", kernelInfo); - - ogs::scatterManyKernel_float = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_float", kernelInfo); - ogs::scatterManyKernel_double = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_double", kernelInfo); - ogs::scatterManyKernel_int = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_int", kernelInfo); - ogs::scatterManyKernel_long = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_long", kernelInfo); + ogs::gatherScatterKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatAdd", ogs::kernelInfo); + ogs::gatherScatterKernel_floatMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMul", ogs::kernelInfo); + ogs::gatherScatterKernel_floatMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMin", ogs::kernelInfo); + ogs::gatherScatterKernel_floatMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMax", ogs::kernelInfo); + + ogs::gatherScatterKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleAdd", ogs::kernelInfo); + ogs::gatherScatterKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleMul", ogs::kernelInfo); + ogs::gatherScatterKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleMin", ogs::kernelInfo); + ogs::gatherScatterKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_doubleMax", ogs::kernelInfo); + + ogs::gatherScatterKernel_intAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intAdd", ogs::kernelInfo); + ogs::gatherScatterKernel_intMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intMul", ogs::kernelInfo); + ogs::gatherScatterKernel_intMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intMin", ogs::kernelInfo); + ogs::gatherScatterKernel_intMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_intMax", ogs::kernelInfo); + + ogs::gatherScatterKernel_longAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longAdd", ogs::kernelInfo); + ogs::gatherScatterKernel_longMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longMul", ogs::kernelInfo); + ogs::gatherScatterKernel_longMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longMin", ogs::kernelInfo); + ogs::gatherScatterKernel_longMax = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_longMax", ogs::kernelInfo); + + ogs::gatherScatterVecKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatAdd", ogs::kernelInfo); + ogs::gatherScatterVecKernel_floatMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatMul", ogs::kernelInfo); + ogs::gatherScatterVecKernel_floatMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatMin", ogs::kernelInfo); + ogs::gatherScatterVecKernel_floatMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_floatMax", ogs::kernelInfo); + + ogs::gatherScatterVecKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleAdd", ogs::kernelInfo); + ogs::gatherScatterVecKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleMul", ogs::kernelInfo); + ogs::gatherScatterVecKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleMin", ogs::kernelInfo); + ogs::gatherScatterVecKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_doubleMax", ogs::kernelInfo); + + ogs::gatherScatterVecKernel_intAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intAdd", ogs::kernelInfo); + ogs::gatherScatterVecKernel_intMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intMul", ogs::kernelInfo); + ogs::gatherScatterVecKernel_intMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intMin", ogs::kernelInfo); + ogs::gatherScatterVecKernel_intMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_intMax", ogs::kernelInfo); + + ogs::gatherScatterVecKernel_longAdd = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longAdd", ogs::kernelInfo); + ogs::gatherScatterVecKernel_longMul = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longMul", ogs::kernelInfo); + ogs::gatherScatterVecKernel_longMin = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longMin", ogs::kernelInfo); + ogs::gatherScatterVecKernel_longMax = device.buildKernel(DOGS "/okl/gatherScatterVec.okl", "gatherScatterVec_longMax", ogs::kernelInfo); + + ogs::gatherScatterManyKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatAdd", ogs::kernelInfo); + ogs::gatherScatterManyKernel_floatMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatMul", ogs::kernelInfo); + ogs::gatherScatterManyKernel_floatMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatMin", ogs::kernelInfo); + ogs::gatherScatterManyKernel_floatMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_floatMax", ogs::kernelInfo); + + ogs::gatherScatterManyKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleAdd", ogs::kernelInfo); + ogs::gatherScatterManyKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleMul", ogs::kernelInfo); + ogs::gatherScatterManyKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleMin", ogs::kernelInfo); + ogs::gatherScatterManyKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_doubleMax", ogs::kernelInfo); + + ogs::gatherScatterManyKernel_intAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intAdd", ogs::kernelInfo); + ogs::gatherScatterManyKernel_intMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intMul", ogs::kernelInfo); + ogs::gatherScatterManyKernel_intMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intMin", ogs::kernelInfo); + ogs::gatherScatterManyKernel_intMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_intMax", ogs::kernelInfo); + + ogs::gatherScatterManyKernel_longAdd = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longAdd", ogs::kernelInfo); + ogs::gatherScatterManyKernel_longMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMul", ogs::kernelInfo); + ogs::gatherScatterManyKernel_longMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMin", ogs::kernelInfo); + ogs::gatherScatterManyKernel_longMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMax", ogs::kernelInfo); + + + + ogs::gatherKernel_floatAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatAdd", ogs::kernelInfo); + ogs::gatherKernel_floatMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatMul", ogs::kernelInfo); + ogs::gatherKernel_floatMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatMin", ogs::kernelInfo); + ogs::gatherKernel_floatMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatMax", ogs::kernelInfo); + + ogs::gatherKernel_doubleAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleAdd", ogs::kernelInfo); + ogs::gatherKernel_doubleMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleMul", ogs::kernelInfo); + ogs::gatherKernel_doubleMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleMin", ogs::kernelInfo); + ogs::gatherKernel_doubleMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_doubleMax", ogs::kernelInfo); + + ogs::gatherKernel_intAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_intAdd", ogs::kernelInfo); + ogs::gatherKernel_intMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_intMul", ogs::kernelInfo); + ogs::gatherKernel_intMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_intMin", ogs::kernelInfo); + ogs::gatherKernel_intMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_intMax", ogs::kernelInfo); + + ogs::gatherKernel_longAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_longAdd", ogs::kernelInfo); + ogs::gatherKernel_longMul = device.buildKernel(DOGS "/okl/gather.okl", "gather_longMul", ogs::kernelInfo); + ogs::gatherKernel_longMin = device.buildKernel(DOGS "/okl/gather.okl", "gather_longMin", ogs::kernelInfo); + ogs::gatherKernel_longMax = device.buildKernel(DOGS "/okl/gather.okl", "gather_longMax", ogs::kernelInfo); + + ogs::gatherVecKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatAdd", ogs::kernelInfo); + ogs::gatherVecKernel_floatMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatMul", ogs::kernelInfo); + ogs::gatherVecKernel_floatMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatMin", ogs::kernelInfo); + ogs::gatherVecKernel_floatMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_floatMax", ogs::kernelInfo); + + ogs::gatherVecKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleAdd", ogs::kernelInfo); + ogs::gatherVecKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleMul", ogs::kernelInfo); + ogs::gatherVecKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleMin", ogs::kernelInfo); + ogs::gatherVecKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_doubleMax", ogs::kernelInfo); + + ogs::gatherVecKernel_intAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intAdd", ogs::kernelInfo); + ogs::gatherVecKernel_intMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intMul", ogs::kernelInfo); + ogs::gatherVecKernel_intMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intMin", ogs::kernelInfo); + ogs::gatherVecKernel_intMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_intMax", ogs::kernelInfo); + + ogs::gatherVecKernel_longAdd = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longAdd", ogs::kernelInfo); + ogs::gatherVecKernel_longMul = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longMul", ogs::kernelInfo); + ogs::gatherVecKernel_longMin = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longMin", ogs::kernelInfo); + ogs::gatherVecKernel_longMax = device.buildKernel(DOGS "/okl/gatherVec.okl", "gatherVec_longMax", ogs::kernelInfo); + + ogs::gatherManyKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatAdd", ogs::kernelInfo); + ogs::gatherManyKernel_floatMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatMul", ogs::kernelInfo); + ogs::gatherManyKernel_floatMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatMin", ogs::kernelInfo); + ogs::gatherManyKernel_floatMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_floatMax", ogs::kernelInfo); + + ogs::gatherManyKernel_doubleAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleAdd", ogs::kernelInfo); + ogs::gatherManyKernel_doubleMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleMul", ogs::kernelInfo); + ogs::gatherManyKernel_doubleMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleMin", ogs::kernelInfo); + ogs::gatherManyKernel_doubleMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_doubleMax", ogs::kernelInfo); + + ogs::gatherManyKernel_intAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intAdd", ogs::kernelInfo); + ogs::gatherManyKernel_intMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intMul", ogs::kernelInfo); + ogs::gatherManyKernel_intMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intMin", ogs::kernelInfo); + ogs::gatherManyKernel_intMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_intMax", ogs::kernelInfo); + + ogs::gatherManyKernel_longAdd = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longAdd", ogs::kernelInfo); + ogs::gatherManyKernel_longMul = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMul", ogs::kernelInfo); + ogs::gatherManyKernel_longMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMin", ogs::kernelInfo); + ogs::gatherManyKernel_longMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMax", ogs::kernelInfo); + + + + ogs::scatterKernel_float = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_float", ogs::kernelInfo); + ogs::scatterKernel_double = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_double", ogs::kernelInfo); + ogs::scatterKernel_int = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_int", ogs::kernelInfo); + ogs::scatterKernel_long = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_long", ogs::kernelInfo); + + ogs::scatterVecKernel_float = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_float", ogs::kernelInfo); + ogs::scatterVecKernel_double = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_double", ogs::kernelInfo); + ogs::scatterVecKernel_int = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_int", ogs::kernelInfo); + ogs::scatterVecKernel_long = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_long", ogs::kernelInfo); + + ogs::scatterManyKernel_float = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_float", ogs::kernelInfo); + ogs::scatterManyKernel_double = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_double", ogs::kernelInfo); + ogs::scatterManyKernel_int = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_int", ogs::kernelInfo); + ogs::scatterManyKernel_long = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_long", ogs::kernelInfo); } MPI_Barrier(comm); } diff --git a/3rd_party/gslib/ogs/src/ogsScatterMany.cpp b/3rd_party/gslib/ogs/src/ogsScatterMany.cpp index 5ca1056b1..1adf28ab3 100644 --- a/3rd_party/gslib/ogs/src/ogsScatterMany.cpp +++ b/3rd_party/gslib/ogs/src/ogsScatterMany.cpp @@ -58,7 +58,6 @@ void ogsScatterManyStart(occa::memory o_sv, const char *type, const char *op, ogs_t *ogs){ - size_t Nbytes; if (!strcmp(type, "float")) Nbytes = sizeof(float); @@ -121,14 +120,8 @@ void ogsScatterManyFinish(occa::memory o_sv, void* H[k]; for (int i=0;iNhaloGather*Nbytes; -#ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); -#endif // MPI based scatter using gslib ogsHostScatterMany(H, k, type, op, ogs->haloGshNonSym); -#ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); -#endif // copy totally scattered halo data back from HOST to DEVICE ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); diff --git a/3rd_party/gslib/ogs/src/ogsScatterVec.cpp b/3rd_party/gslib/ogs/src/ogsScatterVec.cpp index 1636db9b1..f3de2cb76 100644 --- a/3rd_party/gslib/ogs/src/ogsScatterVec.cpp +++ b/3rd_party/gslib/ogs/src/ogsScatterVec.cpp @@ -48,7 +48,6 @@ void ogsScatterVecStart(occa::memory o_sv, const char *type, const char *op, ogs_t *ogs){ - size_t Nbytes; if (!strcmp(type, "float")) Nbytes = sizeof(float); @@ -104,14 +103,8 @@ void ogsScatterVecFinish(occa::memory o_sv, ogs->device.setStream(ogs::dataStream); ogs->device.finish(); -#ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); -#endif // MPI based scatter using gslib ogsHostScatterVec(ogs::haloBuf, k, type, op, ogs->haloGshNonSym); -#ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); -#endif // copy totally scattered halo data back from HOST to DEVICE ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); diff --git a/3rd_party/gslib/ogs/src/ogsSetup.cpp b/3rd_party/gslib/ogs/src/ogsSetup.cpp index d2ba1c694..6c8568b1b 100644 --- a/3rd_party/gslib/ogs/src/ogsSetup.cpp +++ b/3rd_party/gslib/ogs/src/ogsSetup.cpp @@ -105,7 +105,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, MPI_Comm_size(ogs->comm, &size); //make a host gs handle (calls gslib) - ogs->hostGsh = ogsHostSetup(comm, N, ids, 0, 0); + ogs->hostGsh = ogsHostSetup(comm, N, ids, 0, 1); //use the host gs to find what nodes are local to this rank int *minRank = (int *) calloc(N,sizeof(int)); @@ -384,7 +384,7 @@ void ogsFree(ogs_t *ogs) { ogs->o_gatherInvDegree.free(); } - free(ogs); + delete[] ogs; ogs::Nrefs--; if (!ogs::Nrefs) ogs::freeKernels(); diff --git a/3rd_party/gslib/ogs/src/oogs.cpp b/3rd_party/gslib/ogs/src/oogs.cpp index 265332c9a..0995a73e0 100644 --- a/3rd_party/gslib/ogs/src/oogs.cpp +++ b/3rd_party/gslib/ogs/src/oogs.cpp @@ -8,6 +8,7 @@ #include "ogs.hpp" #include "ogsKernels.hpp" #include "ogsInterface.h" +#include #ifdef __cplusplus extern "C" { @@ -78,9 +79,7 @@ static void convertPwMap(const uint *restrict map, } } -static void _ogsHostGatherScatter(occa::memory o_u, - const char *type, const char *op, - oogs_t *gs) +static void pairwiseExchange(occa::memory o_halo, int unit_size, oogs_t *gs) { ogs_t *ogs = gs->ogs; struct gs_data *hgs = (gs_data*) ogs->haloGshSym; @@ -93,75 +92,49 @@ static void _ogsHostGatherScatter(occa::memory o_u, const unsigned transpose = 0; const unsigned recv = 0^transpose, send = 1^transpose; - size_t unit_size; - if (!strcmp(type, "float")) - unit_size = sizeof(float); - else if (!strcmp(type, "double")) - unit_size = sizeof(double); - else if (!strcmp(type, "int")) - unit_size = sizeof(int); - else if (!strcmp(type, "long long int")) - unit_size = sizeof(long long int); - { // prepost recv comm_req *req = pwd->req; const struct pw_comm_data *c = &pwd->comm[recv]; const uint *p, *pe, *size=c->size; uint bufOffset = 0; for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; + const size_t len = *(size++); unsigned char *recvbuf = (unsigned char *)gs->bufRecv + bufOffset; if(gs->mode == OOGS_DEVICEMPI) recvbuf = (unsigned char*)gs->o_bufRecv.ptr() + bufOffset; - MPI_Irecv((void*)recvbuf,len,MPI_UNSIGNED_CHAR,*p,*p,comm->c,req++); - bufOffset += len; + MPI_Irecv((void*)recvbuf,len*unit_size,MPI_UNSIGNED_CHAR,*p,*p,comm->c,req++); + bufOffset += len*unit_size; } } - { // scatter - occaScatter(Nhalo, gs->o_scatterOffsets, gs->o_scatterIds, type, op, o_u, gs->o_bufSend); - if(gs->mode == OOGS_HOSTMPI) { - gs->o_bufSend.copyTo(gs->bufSend, pwd->comm[send].total*unit_size, 0, "async: true"); - } - } + if(gs->mode == OOGS_HOSTMPI) + gs->o_bufSend.copyTo(gs->bufSend, pwd->comm[send].total*unit_size, 0, "async: true"); { // pw exchange - ogs->device.finish(); // waiting for buffers to be ready - MPI_Barrier(comm->c); + if(gs->mode != OOGS_DEVICEMPI) ogs->device.finish(); // waiting for buffers to be ready comm_req *req = &pwd->req[pwd->comm[recv].n]; const struct pw_comm_data *c = &pwd->comm[send]; const uint *p, *pe, *size=c->size; uint bufOffset = 0; for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; + const size_t len = *(size++); unsigned char *sendbuf = (unsigned char*)gs->bufSend + bufOffset; if(gs->mode == OOGS_DEVICEMPI) sendbuf = (unsigned char*)gs->o_bufSend.ptr() + bufOffset; - MPI_Isend((void*)sendbuf,len,MPI_UNSIGNED_CHAR,*p,comm->id,comm->c,req++); - bufOffset += len; + MPI_Isend((void*)sendbuf,len*unit_size,MPI_UNSIGNED_CHAR,*p,comm->id,comm->c,req++); + bufOffset += len*unit_size; } MPI_Waitall(pwd->comm[send].n + pwd->comm[recv].n,pwd->req,MPI_STATUSES_IGNORE); } - { // gather - if(gs->mode == OOGS_HOSTMPI){ - gs->o_bufRecv.copyFrom(gs->bufRecv,pwd->comm[recv].total*unit_size, 0, "async: true"); - } - - // op hardwired for now!!! - occaGather(Nhalo, gs->o_gatherOffsets, gs->o_gatherIds, type, "add+self", gs->o_bufRecv, o_u); - } - + if(gs->mode == OOGS_HOSTMPI) + gs->o_bufRecv.copyFrom(gs->bufRecv,pwd->comm[recv].total*unit_size, 0, "async: true"); } -oogs_t* oogs::setup(dlong N, hlong *ids, const char *type, MPI_Comm &comm, - int verbose, occa::device device, oogs_mode gsMode) +oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::function callback, oogs_mode gsMode) { - int rank; - MPI_Comm_rank(comm, &rank); - oogs_t *gs = new oogs_t[1]; - gs->ogs = ogsSetup(N, ids, comm, verbose, device); - ogs_t *ogs = gs->ogs; + gs->ogs = ogs; + occa::device device = gs->ogs->device; const unsigned transpose = 0; struct gs_data *hgs = (gs_data*) ogs->haloGshSym; @@ -169,156 +142,234 @@ oogs_t* oogs::setup(dlong N, hlong *ids, const char *type, MPI_Comm &comm, const void* execdata = hgs->r.data; const struct pw_data *pwd = (pw_data*) execdata; const unsigned Nhalo = ogs->NhaloGather; - const unsigned unit_size = sizeof(double); // hardwire just need to be big enough + const unsigned unit_size = nVec*sizeof(double); // hardwire just need to be big enough + const struct comm *comm = &hgs->comm; + const int rank = comm->id; if(Nhalo == 0) return gs; + for (int r=0;r<2;r++) { + if ((r==0 && rank==0) || (r==1 && rank>0)) { + gs->packBufDoubleKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_double", ogs::kernelInfo); + gs->unpackBufDoubleKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_double", ogs::kernelInfo); + gs->packBufFloatKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_float", ogs::kernelInfo); + gs->unpackBufFloatKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_float", ogs::kernelInfo); + } + MPI_Barrier(comm->c); + } + occa::properties props; props["mapped"] = true; - gs->h_buffSend = ogs->device.malloc(pwd->comm[send].total*unit_size, props); gs->bufSend = (unsigned char*)gs->h_buffSend.ptr(props); - int *scatterOffsets = (int*) calloc(2*Nhalo,sizeof(int)); + int *scatterOffsets = (int*) calloc((Nhalo+1),sizeof(int)); int *scatterIds = (int*) calloc(pwd->comm[send].total,sizeof(int)); convertPwMap(pwd->map[send], scatterOffsets, scatterIds); gs->o_bufSend = ogs->device.malloc(pwd->comm[send].total*unit_size); - gs->o_scatterOffsets = ogs->device.malloc(2*Nhalo*sizeof(int), scatterOffsets); + gs->o_scatterOffsets = ogs->device.malloc((Nhalo+1)*sizeof(int), scatterOffsets); gs->o_scatterIds = ogs->device.malloc(pwd->comm[send].total*sizeof(int), scatterIds); free(scatterOffsets); free(scatterIds); gs->h_buffRecv = ogs->device.malloc(pwd->comm[recv].total*unit_size, props); gs->bufRecv = (unsigned char*)gs->h_buffRecv.ptr(props); - int* gatherOffsets = (int*) calloc(2*Nhalo,sizeof(int)); + int* gatherOffsets = (int*) calloc((Nhalo+1),sizeof(int)); int *gatherIds = (int*) calloc(pwd->comm[recv].total,sizeof(int)); convertPwMap(pwd->map[recv], gatherOffsets, gatherIds); gs->o_bufRecv = ogs->device.malloc(pwd->comm[recv].total*unit_size); - gs->o_gatherOffsets = ogs->device.malloc(2*Nhalo*sizeof(int), gatherOffsets); + gs->o_gatherOffsets = ogs->device.malloc((Nhalo+1)*sizeof(int), gatherOffsets); gs->o_gatherIds = ogs->device.malloc(pwd->comm[recv].total*sizeof(int), gatherIds); free(gatherOffsets); free(gatherIds); + + std::list oogs_mode_list; + oogs_mode_list.push_back(OOGS_DEFAULT); + oogs_mode_list.push_back(OOGS_HOSTMPI); + const char* env_val = std::getenv ("OGS_MPI_SUPPORT"); + if(env_val != NULL) { + if(std::stoi(env_val)) oogs_mode_list.push_back(OOGS_DEVICEMPI);; + } if(gsMode == OOGS_AUTO) { if(rank == 0) printf("timing gs modes: "); const int Ntests = 10; double elapsedLast = std::numeric_limits::max(); - oogs_mode fastestMode; - occa::memory o_q = device.malloc(N*unit_size); - for (auto const& mode : {OOGS_DEFAULT, OOGS_HOSTMPI}) + oogs_mode fastestMode; + occa::memory o_q; + if(!stride) + o_q = device.malloc(ogs->N*unit_size); + else + o_q = device.malloc(stride*unit_size); + + for (auto const& mode : oogs_mode_list) { gs->mode = mode; + // warum-up + oogs::start (o_q, nVec, stride, type, ogsAdd, gs); + if(callback) callback(); + oogs::finish(o_q, nVec, stride, type, ogsAdd, gs); device.finish(); - MPI_Barrier(comm); + MPI_Barrier(comm->c); const double tStart = MPI_Wtime(); for(int test=0;testc); const double elapsed = (MPI_Wtime() - tStart)/Ntests; if(rank == 0) printf("%gs ", elapsed); if(elapsed < elapsedLast) fastestMode = gs->mode; elapsedLast = elapsed; } - MPI_Bcast(&fastestMode, 1, MPI_INT, 0, comm); + MPI_Bcast(&fastestMode, 1, MPI_INT, 0, comm->c); gs->mode = fastestMode; o_q.free(); } else { gs->mode = gsMode; } - if(rank == 0) printf("\nused mode: %d\n", gs->mode); + if(rank == 0) printf("used oogs mode: %d\n", gs->mode); return gs; } -void oogs::start(occa::memory o_v, const char *type, const char *op, oogs_t *gs) +oogs_t* oogs::setup(dlong N, hlong *ids, int nVec, dlong stride, const char *type, MPI_Comm &comm, + int verbose, occa::device device, std::function callback, oogs_mode gsMode) +{ + ogs_t *ogs = ogsSetup(N, ids, comm, verbose, device); + return setup(ogs, nVec, stride, type, callback, gsMode); +} + +void oogs::start(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *gs) { size_t Nbytes; - if (!strcmp(type, "float")) + occa::kernel packBuf; + if (!strcmp(type, "float")) { Nbytes = sizeof(float); - else if (!strcmp(type, "double")) + packBuf = gs->packBufFloatKernel; + } else if (!strcmp(type, "double")) { Nbytes = sizeof(double); - else if (!strcmp(type, "int")) + packBuf = gs->packBufDoubleKernel; + } else if (!strcmp(type, "int")) { Nbytes = sizeof(int); - else if (!strcmp(type, "long long int")) + } else if (!strcmp(type, "long long int")) { Nbytes = sizeof(long long int); + } ogs_t *ogs = gs->ogs; if (ogs->NhaloGather) { - if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); - ogs::haloBuf = ogsHostMallocPinned(ogs->device, ogs->NhaloGather*Nbytes, NULL, ogs::o_haloBuf, ogs::h_haloBuf); + ogs::haloBuf = ogsHostMallocPinned(ogs->device, ogs->NhaloGather*Nbytes*k, NULL, ogs::o_haloBuf, ogs::h_haloBuf); } } if (ogs->NhaloGather) { - occaGather(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); - ogs->device.finish(); // just in case dataStream is non-blocking + occaGatherMany(ogs->NhaloGather, k, stride, ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); + if(gs->mode != OOGS_DEFAULT) packBuf(ogs->NhaloGather, k, gs->o_scatterOffsets, gs->o_scatterIds, ogs::o_haloBuf, gs->o_bufSend); + ogs->device.finish(); if(gs->mode == OOGS_DEFAULT) { ogs->device.setStream(ogs::dataStream); - ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes, 0, "async: true"); + ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); ogs->device.setStream(ogs::defaultStream); } } } -void oogs::finish(occa::memory o_v, const char *type, const char *op, oogs_t *gs) +void oogs::finish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *gs) { size_t Nbytes; - if (!strcmp(type, "float")) + occa::kernel unpackBuf; + if (!strcmp(type, "float")) { Nbytes = sizeof(float); - else if (!strcmp(type, "double")) + unpackBuf = gs->unpackBufFloatKernel; + } else if (!strcmp(type, "double")) { Nbytes = sizeof(double); - else if (!strcmp(type, "int")) - Nbytes = sizeof(int); - else if (!strcmp(type, "long long int")) - Nbytes = sizeof(long long int); + unpackBuf = gs->unpackBufDoubleKernel; + } else { + printf("oogs: unsupported datatype %s!\n", type); + exit(1); + } + + if (strcmp(op, "add")) { + printf("oogs: unsupported operation %s!\n", op); + exit(1); + } ogs_t *ogs = gs->ogs; if(ogs->NlocalGather) { - occaGatherScatter(ogs->NlocalGather, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v); + occaGatherScatterMany(ogs->NlocalGather, k, stride, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v); } if (ogs->NhaloGather) { ogs->device.setStream(ogs::dataStream); + if(gs->mode == OOGS_DEFAULT) ogs->device.finish(); // waiting for gs::haloBuf copy to finish + #ifdef OGS_ENABLE_TIMER - timer::tic("gsMPI",1); + timer::tic("gsMPI",1); #endif if(gs->mode == OOGS_DEFAULT) { - ogs->device.finish(); // waiting for gs::haloBuf copy to finish - ogsHostGatherScatter(ogs::haloBuf, type, op, ogs->haloGshSym); + void* H[10]; + for (int i=0;iNhaloGather*Nbytes; + ogsHostGatherScatterMany(H, k, type, op, ogs->haloGshSym); } else { - _ogsHostGatherScatter(ogs::o_haloBuf, type, op, gs); + pairwiseExchange(ogs::o_haloBuf, Nbytes*k, gs); } #ifdef OGS_ENABLE_TIMER - timer::toc("gsMPI"); + timer::toc("gsMPI"); #endif if(gs->mode == OOGS_DEFAULT) { - ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes, 0, "async: true"); + ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + } else { + unpackBuf(ogs->NhaloGather, k, gs->o_gatherOffsets, gs->o_gatherIds, gs->o_bufRecv, ogs::o_haloBuf); } ogs->device.finish(); ogs->device.setStream(ogs::defaultStream); - occaScatter(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_v); + occaScatterMany(ogs->NhaloGather, k, ogs->NhaloGather, stride, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_v); } } - -void oogs::gatherScatter(occa::memory o_v, const char *type, const char *op, oogs_t *gs){ - oogs::start(o_v, type, op, gs); - oogs::finish(o_v, type, op, gs); +void oogs::startFinish(void *v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h) +{ + ogsGatherScatterMany(v, k, stride, type, op, h->ogs); +} +void oogs::startFinish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, oogs_t *h) +{ + start(o_v, k, stride, type, op, h); + finish(o_v, k, stride, type, op, h); } -void oogs::gatherScatter(void *v, const char *type, const char *op, oogs_t *gs){ - ogsHostGatherScatter(v, type, op, gs->ogs->hostGsh); -} +void oogs::destroy(oogs_t *gs) +{ + //ogsFree(gs->ogs); + + gs->h_buffSend.free(); + gs->h_buffRecv.free(); + + gs->o_scatterIds.free(); + gs->o_gatherIds.free(); + + gs->o_scatterOffsets.free(); + gs->o_gatherOffsets.free(); + + gs->o_bufRecv.free(); + gs->o_bufSend.free(); + + gs->packBufDoubleKernel.free(); + gs->unpackBufDoubleKernel.free(); + gs->packBufFloatKernel.free(); + gs->unpackBufFloatKernel.free(); + + free(gs); +} diff --git a/RELEASE.md b/RELEASE.md index 96f0b9340..a01311505 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,7 +11,7 @@ ## What you may have to change to be compatible -* n/a +* common block SCRNS was replaced by pointer array NRSSCPTR (see ethier example) ## Known Bugs diff --git a/examples/conj_ht/conj_ht.usr b/examples/conj_ht/conj_ht.usr index 328050421..5c2856b24 100644 --- a/examples/conj_ht/conj_ht.usr +++ b/examples/conj_ht/conj_ht.usr @@ -18,9 +18,15 @@ c----------------------------------------------------------------------- include 'SIZE' include 'TOTAL' - common /SCNRS/ dnorm(2) + real err(2) + save err + COMMON /NRSSCPTR/ nrs_scptr(1) + integer*8 nrs_scptr + real wrk(lx1,ly1,lz1,lelt,3) + nrs_scptr(1) = loc(err(1)) + nv = nx1*ny1*nz1*nelv nt = nx1*ny1*nz1*nelt @@ -34,7 +40,7 @@ c call sub2(wrk(1,1,1,1,3), t,nt) err_inf = glamax(wrk(1,1,1,1,1),nv) err_l2 = glsc3(wrk(1,1,1,1,1),bm1,wrk(1,1,1,1,1),nv) - dnorm(1) = sqrt(err_l2) + err(1) = sqrt(err_l2) c if(nid.eq.0) write(6,*) 'inf/L2 error vx:', err_inf, sqrt(err_l2) c err_inf = glamax(wrk(1,1,1,1,2),nv) @@ -43,7 +49,7 @@ c if(nid.eq.0) write(6,*) 'inf/L2 error vy:', err_inf, sqrt(err_l2) err_inf = glamax(wrk(1,1,1,1,3),nt) err_l2 = glsc3(wrk(1,1,1,1,3),bm2,wrk(1,1,1,1,3),nt) - dnorm(2) = sqrt(err_l2) + err(2) = sqrt(err_l2) c if(nid.eq.0) write(6,*) 'inf/L2 error t:', err_inf, sqrt(err_l2) return diff --git a/examples/conj_ht/conj_ht_ci.h b/examples/conj_ht/conj_ht_ci.h index 039138d3b..088e0909e 100644 --- a/examples/conj_ht/conj_ht_ci.h +++ b/examples/conj_ht/conj_ht_ci.h @@ -31,13 +31,13 @@ void ciTestErrors(ins_t *ins, dfloat time, int tstep) nek_ocopyFrom(time, tstep); nek_userchk(); - double *norm = nekData.cbscnrs; + double *err = (double *) nek_scPtr(1); double vxErr, sErr; switch (ciMode) { // cross compare solution to nek5000 - case 1: vxErr = abs((norm[0] - 2.06559)/norm[0]); - sErr = abs((norm[1] - 28.3833)/norm[1]); + case 1: vxErr = abs((err[0] - 2.06559)/err[0]); + sErr = abs((err[1] - 28.3833)/err[1]); break; } diff --git a/examples/ethier/ethier.usr b/examples/ethier/ethier.usr index 1aab9be98..d7c47aec8 100644 --- a/examples/ethier/ethier.usr +++ b/examples/ethier/ethier.usr @@ -32,11 +32,17 @@ C----------------------------------------------------------------------- real pre (lx2,ly2,lz2,lelv) real prerr(lx2,ly2,lz2,lelv) + real err(4) + save err + common /SCRNS/ wo1(lx1,ly1,lz1,lelv) & ,wo2(lx1,ly1,lz1,lelv) & ,omg(lx1*ly1*lz1*lelv,ldim) - common /SCNRS/ wrk(4) + COMMON /NRSSCPTR/ nrs_scptr(1) + integer*8 nrs_scptr + + nrs_scptr(1) = loc(err) n = nelv*nx1*ny1*nz1 n2 = nelv*nx2*ny2*nz2 @@ -68,10 +74,10 @@ C----------------------------------------------------------------------- prerrl2 = glsc3(prerr,bm2,prerr,n2) prerrl2 = sqrt(prerrl2) - wrk(1) = uxerrl2 - wrk(2) = prerrl2 - wrk(3) = terrl2 - wrk(4) = serrl2 + err(1) = uxerrl2 + err(2) = prerrl2 + err(3) = terrl2 + err(4) = serrl2 if (nid.eq.nio) write(6,*) istep,time,uxerrl2,prerrl2, & terrl2,serrl2,' L2 err' diff --git a/examples/ethier/ethier_ci.h b/examples/ethier/ethier_ci.h index 811a69d4f..ce96906b6 100644 --- a/examples/ethier/ethier_ci.h +++ b/examples/ethier/ethier_ci.h @@ -43,7 +43,7 @@ void ciTestErrors(ins_t *ins, dfloat time, int tstep) nek_ocopyFrom(time, tstep); nek_userchk(); - double *err = nekData.cbscnrs; + double *err = (double *) nek_scPtr(1); const double vxErr = abs((err[0] - 1.19E-04)/err[0]); const double prErr = abs((err[1] - 6.49E-04)/err[1]); diff --git a/examples/lowMach/lowMach.par b/examples/lowMach/lowMach.par index cc4b9cf09..a143073aa 100644 --- a/examples/lowMach/lowMach.par +++ b/examples/lowMach/lowMach.par @@ -1,5 +1,5 @@ [OCCA] -backend = CUDA +backend = SERIAL deviceNumber = LOCAL-RANK [GENERAL] @@ -16,6 +16,7 @@ variableProperties = yes [PRESSURE] residualTol = 1e-10 +residualProj = false [VELOCITY] boundaryTypeMap = fixedValue diff --git a/examples/lowMach/lowMach.usr b/examples/lowMach/lowMach.usr index 6e744be5d..1108697fa 100644 --- a/examples/lowMach/lowMach.usr +++ b/examples/lowMach/lowMach.usr @@ -17,10 +17,16 @@ c----------------------------------------------------------------------- real pre,vT_err,VX_err,QTL_err, pr_err real sumqw,l2_err,exact,max_err,XD,sech - common /SCNRS/ wrk(3) + + real err(3) + save err + COMMON /NRSSCPTR/ nrs_scptr(1) + integer*8 nrs_scptr if (istep.lt.1) return + nrs_scptr(1) = loc(err(1)) + call printdiverr ntot = nx1*ny1*nz1*nelv @@ -43,7 +49,7 @@ c----------------------------------------------------------------------- 100 continue max_err = glmax(VX_err,ntot) - wrk(1) = max_err + err(1) = max_err l2_err = glsc3(VX_err,bm1,VX_err,ntot)/VOLVM1 if (l2_err.gt.0) l2_err = sqrt(l2_err) if(nid.eq.0) @@ -51,7 +57,7 @@ c----------------------------------------------------------------------- 1 format(i8,1p3e12.4,' ERROR VX: MAX/L2') max_err = glmax(T_err,ntot) - wrk(3) = max_err + err(3) = max_err l2_err = glsc3(T_err,bm1,T_err,ntot)/VOLVM1 if (l2_err.gt.0) l2_err = sqrt(l2_err) if(nid.eq.0) @@ -72,7 +78,7 @@ c 3 format(i8,1p3e12.4,' ERROR QTL: MAX/L2') call sub3(pr_err,PR,pre,ntot) max_err = glamax(pr_err,ntot) - wrk(2) = max_err + err(2) = max_err l2_err = glsc3(pr_err,bm1,pr_err,ntot)/VOLVM1 if (l2_err.gt.0) l2_err = sqrt(l2_err) if(nid.eq.0) diff --git a/examples/lowMach/lowMach_ci.h b/examples/lowMach/lowMach_ci.h index d65c74718..775fa7e8b 100644 --- a/examples/lowMach/lowMach_ci.h +++ b/examples/lowMach/lowMach_ci.h @@ -35,7 +35,7 @@ void ciTestErrors(ins_t *ins, dfloat time, int tstep) nek_ocopyFrom(time, tstep); nek_userchk(); - double *err = nekData.cbscnrs; + double *err = (double *) nek_scPtr(1); double vxErr, prErr, sErr; switch (ciMode) { diff --git a/scripts/nrsqsub_lassen b/scripts/nrsqsub_lassen index f6bc1719b..fc0b0ab45 100755 --- a/scripts/nrsqsub_lassen +++ b/scripts/nrsqsub_lassen @@ -10,6 +10,7 @@ XL_HOME="/usr/tce/packages/xl/xl-2020.03.18/xlC/16.1.1" export NEKRS_HOME export OCCA_CACHE_DIR export NEKRS_HYPRE_NUM_THREADS=1 +export OGS_MPI_SUPPORT=1 export OCCA_CXX="$XL_HOME/bin/xlc" export OCCA_CXXFLAGS="-O3 -qarch=pwr9 -qhot -DUSE_OCCA_MEM_BYTE_ALIGN=64" export OCCA_LDFLAGS="$XL_HOME/lib/libibmc++.a" diff --git a/scripts/nrsqsub_summit b/scripts/nrsqsub_summit index 510d726d0..cc9c864ec 100755 --- a/scripts/nrsqsub_summit +++ b/scripts/nrsqsub_summit @@ -11,6 +11,7 @@ XL_HOME="/sw/summit/xl/16.1.1-3/xlC/16.1.1" export NEKRS_HOME export OCCA_CACHE_DIR export NEKRS_HYPRE_NUM_THREADS=1 +export OGS_MPI_SUPPORT=1 export OCCA_CXX="$XL_HOME/bin/xlc" export OCCA_CXXFLAGS="-O3 -qarch=pwr9 -qhot -DUSE_OCCA_MEM_BYTE_ALIGN=64" export OCCA_LDFLAGS="$XL_HOME/lib/libibmc++.a" diff --git a/src/core/cds.cpp b/src/core/cds.cpp index 4d2fefa5a..bbe04214d 100644 --- a/src/core/cds.cpp +++ b/src/core/cds.cpp @@ -3,7 +3,14 @@ occa::memory cdsSolve(const int is, cds_t* cds, dfloat time) { mesh_t* mesh; - (is) ? mesh = cds->meshV : mesh = cds->mesh; + oogs_t* gsh; + if(is) { + mesh = cds->meshV; + gsh = cds->gsh; + } else { + mesh = cds->mesh; + gsh = cds->gshT; + } elliptic_t* solver = cds->solver[is]; cds->o_wrk1.copyFrom(cds->o_BF, cds->Ntotal * sizeof(dfloat), 0, @@ -29,7 +36,7 @@ occa::memory cdsSolve(const int is, cds_t* cds, dfloat time) *(cds->o_usrwrk), cds->o_wrk1); - ogsGatherScatter(cds->o_wrk1, ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(cds->o_wrk1, 1, cds->fieldOffset, ogsDfloat, ogsAdd, gsh); if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, cds->o_wrk1); //copy current solution fields as initial guess diff --git a/src/core/cds.h b/src/core/cds.h index 3490e0f28..c225b12f3 100644 --- a/src/core/cds.h +++ b/src/core/cds.h @@ -26,6 +26,8 @@ typedef struct setupAide options; + oogs_t *gsh, *gshT; + dlong vFieldOffset; dlong fieldOffset; dlong Nlocal, Ntotal; diff --git a/src/core/ins.h b/src/core/ins.h index f2ca73b70..02bcf93da 100644 --- a/src/core/ins.h +++ b/src/core/ins.h @@ -21,6 +21,8 @@ typedef struct elliptic_t* pSolver; cds_t* cds; + oogs_t *gsh; + dlong ellipticWrkOffset; int flow; diff --git a/src/core/insSetup.cpp b/src/core/insSetup.cpp index b326c6a60..7fe675782 100644 --- a/src/core/insSetup.cpp +++ b/src/core/insSetup.cpp @@ -290,6 +290,11 @@ ins_t* insSetup(MPI_Comm comm, occa::device device, setupAide &options, int buil const int nbrBIDs = bcMap::size(0); int NBCType = nbrBIDs + 1; + meshParallelGatherScatterSetup(mesh, ins->Nlocal, mesh->globalIds, mesh->comm, 0); + oogs_mode oogsMode = OOGS_AUTO; + if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT; + ins->gsh = oogs::setup(mesh->ogs, ins->NVfields, ins->fieldOffset, ogsDfloat, NULL, oogsMode); + if (ins->flow) { if (mesh->rank == 0) printf("==================VELOCITY SETUP=========================\n"); @@ -846,6 +851,17 @@ cds_t* cdsSetup(ins_t* ins, mesh_t* mesh, setupAide options, occa::properties &k cds->o_wrk5 = ins->o_wrk5; cds->o_wrk6 = ins->o_wrk6; + cds->gsh = ins->gsh; + + if(ins->cht) { + meshParallelGatherScatterSetup(mesh, cds->Nlocal, mesh->globalIds, mesh->comm, 0); + oogs_mode oogsMode = OOGS_AUTO; + if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT; + cds->gshT = oogs::setup(mesh->ogs, 1, cds->fieldOffset, ogsDfloat, NULL, oogsMode); + } else { + cds->gshT = cds->gsh; + } + // Solution storage at interpolation nodes cds->U = ins->U; // Point to INS side Velocity cds->S = diff --git a/src/core/runTime.cpp b/src/core/runTime.cpp index 99e87b465..6b1950baf 100644 --- a/src/core/runTime.cpp +++ b/src/core/runTime.cpp @@ -490,8 +490,7 @@ occa::memory velocityStrongSubCycle(ins_t* ins, dfloat time, occa::memory o_U) if(rk == 1) o_rhs = ins->o_wrk9; if(rk == 2) o_rhs = ins->o_wrk12; if(rk == 3) o_rhs = ins->o_wrk15; - ogsGatherScatterMany(o_rhs, ins->NVfields, ins->fieldOffset, - ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(o_rhs, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); ins->invMassMatrixKernel( mesh->Nelements, ins->fieldOffset, @@ -604,7 +603,7 @@ occa::memory scalarStrongSubCycle(cds_t* cds, dfloat time, int is, if(rk == 1) o_rhs = cds->o_wrk3; if(rk == 2) o_rhs = cds->o_wrk4; if(rk == 3) o_rhs = cds->o_wrk5; - ogsGatherScatter(o_rhs, ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(o_rhs, 1, cds->fieldOffset, ogsDfloat, ogsAdd, cds->gsh); cds->invMassMatrixKernel( mesh->Nelements, cds->fieldOffset, @@ -643,8 +642,7 @@ void qthermal(ins_t* ins, dfloat time, occa::memory o_div) cds->o_S, cds->o_wrk0); - ogsGatherScatterMany(cds->o_wrk0, ins->NVfields, ins->fieldOffset, - ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(cds->o_wrk0, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); ins->invMassMatrixKernel( mesh->Nelements, @@ -671,8 +669,7 @@ void qthermal(ins_t* ins, dfloat time, occa::memory o_div) cds->o_wrk3, o_div); - ogsGatherScatterMany(o_div, 1, ins->fieldOffset, - ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(o_div, 1, ins->fieldOffset, ogsDfloat, ogsAdd, ins->gsh); ins->invMassMatrixKernel( mesh->Nelements, diff --git a/src/core/tombo.cpp b/src/core/tombo.cpp index 16c60fa62..907252d2f 100644 --- a/src/core/tombo.cpp +++ b/src/core/tombo.cpp @@ -14,8 +14,7 @@ occa::memory pressureSolve(ins_t* ins, dfloat time) ins->o_Ue, ins->o_wrk0); - ogsGatherScatterMany(ins->o_wrk0, ins->NVfields, ins->fieldOffset, - ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(ins->o_wrk0, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); ins->invMassMatrixKernel( mesh->Nelements, @@ -63,8 +62,8 @@ occa::memory pressureSolve(ins_t* ins, dfloat time) ins->o_wrk0, ins->o_wrk6); - ogsGatherScatterMany(ins->o_wrk6, ins->NVfields, ins->fieldOffset, - ogsDfloat, ogsAdd, mesh->ogs); + + oogs::startFinish(ins->o_wrk6, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); ins->invMassMatrixKernel( mesh->Nelements, @@ -119,7 +118,8 @@ occa::memory pressureSolve(ins_t* ins, dfloat time) elliptic_t* solver = ins->pSolver; - ogsGatherScatter(ins->o_wrk3, ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(ins->o_wrk3, 1, 0, ogsDfloat, ogsAdd, ins->gsh); + if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_wrk3); ins->setScalarKernel(ins->Ntotal, 0.0, ins->o_PI); @@ -208,8 +208,7 @@ occa::memory velocitySolve(ins_t* ins, dfloat time) ins->o_ellipticCoeff, ins->o_wrk3); - ogsGatherScatterMany(ins->o_wrk3, ins->NVfields, ins->fieldOffset, - ogsDfloat, ogsAdd, mesh->ogs); + oogs::startFinish(ins->o_wrk3, ins->NVfields, ins->fieldOffset,ogsDfloat, ogsAdd, ins->gsh); // Use old velocity as initial condition ins->o_wrk0.copyFrom(ins->o_U, ins->NVfields * ins->fieldOffset * sizeof(dfloat)); diff --git a/src/libP/solvers/elliptic/elliptic.h b/src/libP/solvers/elliptic/elliptic.h index 8bd7c09b0..f48111edb 100644 --- a/src/libP/solvers/elliptic/elliptic.h +++ b/src/libP/solvers/elliptic/elliptic.h @@ -60,6 +60,8 @@ typedef struct precon_t* precon; ogs_t* ogs; + oogs_t* oogs; + oogs_t* oogsAx; setupAide options; diff --git a/src/libP/solvers/elliptic/src/ellipticBuildJacobi.c b/src/libP/solvers/elliptic/src/ellipticBuildJacobi.c index 0c57ec1f0..89728d4e4 100644 --- a/src/libP/solvers/elliptic/src/ellipticBuildJacobi.c +++ b/src/libP/solvers/elliptic/src/ellipticBuildJacobi.c @@ -83,19 +83,14 @@ void ellipticUpdateJacobi(elliptic_t* elliptic) elliptic->o_lambda, precon->o_invDiagA); + oogs::startFinish(precon->o_invDiagA, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs); + const dfloat one = 1.0; - if(elliptic->blockSolver) { - ogsGatherScatterMany(precon->o_invDiagA, - elliptic->Nfields, - elliptic->Ntotal, - ogsDfloat, - ogsAdd, - elliptic->ogs); + if(elliptic->blockSolver) elliptic->scalarDivideManyKernel(Nlocal, elliptic->Ntotal, one, precon->o_invDiagA); - }else { - ogsGatherScatter(precon->o_invDiagA, ogsDfloat, ogsAdd, elliptic->ogs); + else elliptic->scalarDivideKernel(Nlocal, one, precon->o_invDiagA); - } + } void ellipticBuildJacobi(elliptic_t* elliptic, dfloat** invDiagA) diff --git a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c b/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c index 99795d1a3..c6d83dbd1 100644 --- a/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c +++ b/src/libP/solvers/elliptic/src/ellipticBuildMultigridLevel.c @@ -989,5 +989,24 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf } } + oogs_mode oogsMode = OOGS_AUTO; + if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT; + if(options.compareArgs("THREAD MODEL", "OPENMP")) oogsMode = OOGS_DEFAULT; + auto callback = [&]() // hardwaire to FP64 const coeff + { + occa::kernel &partialAxKernel = elliptic->partialAxKernel; + partialAxKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_ggeo, + mesh->o_Dmatrices, + mesh->o_Smatrices, + mesh->o_MM, + elliptic->lambda[0], + elliptic->o_p, + elliptic->o_Ap); + }; + elliptic->oogsAx = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, callback, oogsMode); + elliptic->oogs = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, NULL, oogsMode); + return elliptic; } diff --git a/src/libP/solvers/elliptic/src/ellipticMultiGridLevel.c b/src/libP/solvers/elliptic/src/ellipticMultiGridLevel.c index 782dcb4ce..4ca7c6d58 100644 --- a/src/libP/solvers/elliptic/src/ellipticMultiGridLevel.c +++ b/src/libP/solvers/elliptic/src/ellipticMultiGridLevel.c @@ -52,7 +52,7 @@ void MGLevel::coarsen(occa::memory o_x, occa::memory o_Rx) elliptic->precon->coarsenKernel(mesh->Nelements, o_R, o_x, o_Rx); if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { - ogsGatherScatter(o_Rx, ogsDfloat, ogsAdd, (ogs_t*) elliptic->ogs); + oogs::startFinish(o_Rx, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs); if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Rx); } } diff --git a/src/libP/solvers/elliptic/src/ellipticMultiGridSchwarz.c b/src/libP/solvers/elliptic/src/ellipticMultiGridSchwarz.c index a7620c0bf..479e38505 100644 --- a/src/libP/solvers/elliptic/src/ellipticMultiGridSchwarz.c +++ b/src/libP/solvers/elliptic/src/ellipticMultiGridSchwarz.c @@ -714,20 +714,14 @@ void MGLevel::generate_weights() work2[i] = 1.0; } extrude(work2, 0, zero, work1, 0, one, elliptic->mesh); -#ifdef USE_OOGS - oogs::gatherScatter(work1, ogsPfloat, ogsAdd, (oogs_t*) extendedOgs); -#else - ogsGatherScatter(work1, ogsPfloat, ogsAdd, (ogs_t*) extendedOgs); -#endif + + oogs::startFinish(work1, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) extendedOgs); + extrude(work1, 0, one, work2, 0, onem, elliptic->mesh); extrude(work1, 2, one, work1, 0, one, elliptic->mesh); to_reg(wts, work1, elliptic->mesh); -#ifdef USE_OOGS - oogs::gatherScatter(wts, ogsPfloat, ogsAdd, (oogs_t*) ogs); -#else - ogsGatherScatter(wts, ogsPfloat, ogsAdd, (ogs_t*) ogs); -#endif + oogs::startFinish(wts, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) ogs); for(dlong i = 0; i < weightSize; ++i) wts[i] = 1.0 / wts[i]; @@ -775,20 +769,16 @@ void MGLevel::build( delete op; delete lengths; -#ifdef USE_OOGS - extendedOgs = (void*) oogs::setup(Nelements * Np_e, extendedMesh->maskedGlobalIds, ogsPfloat, - extendedMesh->comm, 1, extendedMesh->device, OOGS_AUTO); - ogs = (void*) oogs::setup(Nelements * Np, elliptic->mesh->maskedGlobalIds, ogsPfloat, - elliptic->mesh->comm, 1, elliptic->mesh->device, OOGS_AUTO); -#else - extendedOgs = (void*) ogsSetup(Nelements * Np_e, - extendedMesh->maskedGlobalIds, - extendedMesh->comm, - 0, - extendedMesh->device); - ogs = (void*) elliptic->ogs; -#endif - extendedMesh->ogs = nullptr; + oogs_mode oogsMode = OOGS_AUTO; + if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT; + + extendedOgs = (void*) oogs::setup(Nelements * Np_e, extendedMesh->maskedGlobalIds, elliptic->Nfields, + elliptic->Ntotal, ogsPfloat, extendedMesh->comm, 1, extendedMesh->device, + NULL, oogsMode); + ogs = (void*) oogs::setup(Nelements * Np, elliptic->mesh->maskedGlobalIds, elliptic->Nfields, + elliptic->Ntotal, ogsPfloat, elliptic->mesh->comm, 1, elliptic->mesh->device, + NULL, oogsMode); + meshFree(extendedMesh); const dlong weightSize = Np * Nelements; @@ -836,20 +826,12 @@ void MGLevel::smoothSchwarz(occa::memory& o_u, occa::memory& o_Su, bool xIsZero) const dlong Nelements = elliptic->mesh->Nelements; preFDMKernel(Nelements, o_u, o_work1); -#ifdef USE_OOGS - oogs::gatherScatter(o_work1, ogsPfloat, ogsAdd, (oogs_t*) extendedOgs); -#else - ogsGatherScatter(o_work1, ogsPfloat, ogsAdd, (ogs_t*) extendedOgs); -#endif + oogs::startFinish(o_work1, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) extendedOgs); if(options.compareArgs("MULTIGRID SMOOTHER","RAS")) { fusedFDMKernel(Nelements,o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); -#ifdef USE_OOGS - oogs::gatherScatter(o_work2, ogsPfloat, ogsAdd, (oogs_t*) ogs); -#else - ogsGatherScatter(o_work2, ogsPfloat, ogsAdd, (ogs_t*) ogs); -#endif + oogs::startFinish(o_work2, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) ogs); collocateKernel(elliptic->mesh->Nelements * elliptic->mesh->Np, elliptic->ogs->o_invDegree, @@ -858,19 +840,11 @@ void MGLevel::smoothSchwarz(occa::memory& o_u, occa::memory& o_Su, bool xIsZero) } else { fusedFDMKernel(Nelements,o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); -#ifdef USE_OOGS - oogs::gatherScatter(o_work2, ogsPfloat, ogsAdd, (oogs_t*) extendedOgs); -#else - ogsGatherScatter(o_work2, ogsPfloat, ogsAdd, (ogs_t*) extendedOgs); -#endif + oogs::startFinish(o_work2, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) extendedOgs); postFDMKernel(Nelements,o_work1,o_work2,o_work3); -#ifdef USE_OOGS - oogs::gatherScatter(o_work3, ogsPfloat, ogsAdd, (oogs_t*) ogs); -#else - ogsGatherScatter(o_work3, ogsPfloat, ogsAdd, (ogs_t*) ogs); -#endif + oogs::startFinish(o_work3, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) ogs); collocateKernel(elliptic->mesh->Nelements * elliptic->mesh->Np, o_wts, o_work3, o_Su); } diff --git a/src/libP/solvers/elliptic/src/ellipticOperator.c b/src/libP/solvers/elliptic/src/ellipticOperator.c index 3a8aebf8b..e6fb46df1 100644 --- a/src/libP/solvers/elliptic/src/ellipticOperator.c +++ b/src/libP/solvers/elliptic/src/ellipticOperator.c @@ -79,26 +79,15 @@ void ellipticSerialOperator(elliptic_t* elliptic, o_Aq); } -/* - ogs_t *ellipticOgs = elliptic->ogs; - if(elliptic->blockSolver){ - void *V[elliptic->Nfields]; - for(int fld=0; fldNfields; fld++) - V[fld] = (char*) o_Aq.ptr() + fld*elliptic->Ntotal*sizeof(dfloat); - ogsHostGatherScatterMany(V, elliptic->Nfields, dfloatString, "add", ellipticOgs->hostGsh); - }else{ - ogsHostGatherScatter(o_Aq.ptr(), dfloatString, "add", ellipticOgs->hostGsh); - } - */ - ogsGatherScatterMany(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, - elliptic->ogs); - - //post-mask + oogs::startFinish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs); if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Aq); + } else if(ipdg) { + printf("WARNING: DEBUGGING C0\n"); MPI_Finalize(); + exit(-1); } } @@ -130,14 +119,13 @@ void ellipticOperator(elliptic_t* elliptic, dfloat* tmp = elliptic->tmp; occa::memory &o_tmp = elliptic->o_tmp; - if(continuous) { - // TW: turned off for debugging - if(serial) { - ellipticSerialOperator(elliptic, o_q, o_Aq, precision); - return; - } + if(serial) { + ellipticSerialOperator(elliptic, o_q, o_Aq, precision); + return; + } - ogs_t* ogs = elliptic->ogs; + if(continuous) { + oogs_t* oogsAx = elliptic->oogsAx; int mapType = (elliptic->elementType == HEXAHEDRA && options.compareArgs("ELEMENT MAP", "TRILINEAR")) ? 1:0; @@ -232,24 +220,10 @@ void ellipticOperator(elliptic_t* elliptic, } } } - // else{ - // elliptic->partialCubatureAxKernel(mesh->NglobalGatherElements, - // mesh->o_globalGatherElementList, - // mesh->o_cubggeo, - // mesh->o_cubD, - // mesh->o_cubInterpT, - // lambda, o_q, o_Aq); - // } } - if(enableGatherScatters) { - // printf("1-----gather-scatter Aq \n"); - if(elliptic->blockSolver) - ogsGatherScatterManyStart(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, - ogs); - else - ogsGatherScatterStart(o_Aq, ogsDfloat, ogsAdd, ogs); - } + if(enableGatherScatters) + oogs::start(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, oogsAx); if(mesh->NlocalGatherElements) { if(integrationType == 0) { // GLL or non-hex @@ -335,29 +309,13 @@ void ellipticOperator(elliptic_t* elliptic, } } } -// else{ -// elliptic->partialCubatureAxKernel(mesh->NlocalGatherElements, -// mesh->o_localGatherElementList, -// mesh->o_cubggeo, -// mesh->o_cubD, -// mesh->o_cubInterpT, -// lambda, -// o_q, -// o_Aq); -// } } - // finalize gather using local and global contributions - if(enableGatherScatters) { - if(elliptic->blockSolver) - ogsGatherScatterManyFinish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, - ogs); - else - ogsGatherScatterFinish(o_Aq, ogsDfloat, ogsAdd, ogs); - } + if(enableGatherScatters) + oogs::finish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, oogsAx); - //post-mask if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Aq); + } } diff --git a/src/libP/solvers/elliptic/src/ellipticSolveSetup.c b/src/libP/solvers/elliptic/src/ellipticSolveSetup.c index f5ddf5c3c..d75f71c0e 100644 --- a/src/libP/solvers/elliptic/src/ellipticSolveSetup.c +++ b/src/libP/solvers/elliptic/src/ellipticSolveSetup.c @@ -359,7 +359,7 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties &kernelInfo) //setup an unmasked gs handle int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0; - meshParallelGatherScatterSetup(mesh, Nlocal, mesh->globalIds, mesh->comm, verbose); + if(mesh->ogs == NULL) meshParallelGatherScatterSetup(mesh, Nlocal, mesh->globalIds, mesh->comm, verbose); //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) const int mapSize = elliptic->blockSolver ? elliptic->Ntotal * elliptic->Nfields: Nlocal; @@ -377,15 +377,7 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties &kernelInfo) int BCFlag = elliptic->BCType[bc + elliptic->NBCType * fld]; int fid = mesh->faceNodes[n + f * mesh->Nfp]; elliptic->mapB[fid + e * mesh->Np + fld * elliptic->Ntotal] = mymin(BCFlag, - elliptic->mapB[fid + - e * - mesh - ->Np - + fld - * - elliptic - -> - Ntotal]); + elliptic->mapB[fid + e *mesh->Np + fld*elliptic->Ntotal]); } } } @@ -425,6 +417,7 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties &kernelInfo) elliptic->Nmasked * sizeof(dlong), elliptic->maskIds); + if(elliptic->blockSolver) { // Create a gs handle independent from BC handler elliptic->ogs = ogsSetup(Nlocal, mesh->globalIds, mesh->comm, verbose, mesh->device); // Create copy of invDegree so that we can accelerate vector form of masking!!!!!! @@ -1011,6 +1004,39 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties &kernelInfo) long long int pre = mesh->device.memoryAllocated(); + oogs_mode oogsMode = OOGS_AUTO; + if(options.compareArgs("THREAD MODEL", "SERIAL")) oogsMode = OOGS_DEFAULT; + if(options.compareArgs("THREAD MODEL", "OPENMP")) oogsMode = OOGS_DEFAULT; + auto callback = [&]() // hardwired to FP64 variable coeff + { + occa::kernel &partialAxKernel = elliptic->partialAxKernel; + if(elliptic->blockSolver) + partialAxKernel(mesh->NlocalGatherElements, + elliptic->Ntotal, + elliptic->loffset, + mesh->o_localGatherElementList, + mesh->o_ggeo, + mesh->o_Dmatrices, + mesh->o_Smatrices, + mesh->o_MM, + elliptic->o_lambda, + elliptic->o_p, + elliptic->o_Ap); + else + partialAxKernel(mesh->NlocalGatherElements, + elliptic->Ntotal, + mesh->o_localGatherElementList, + mesh->o_ggeo, + mesh->o_Dmatrices, + mesh->o_Smatrices, + mesh->o_MM, + elliptic->o_lambda, + elliptic->o_p, + elliptic->o_Ap); + }; + elliptic->oogsAx = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, callback, oogsMode); + elliptic->oogs = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, NULL, oogsMode); + ellipticPreconditionerSetup(elliptic, elliptic->ogs, kernelInfo); long long int usedBytes = mesh->device.memoryAllocated() - pre; @@ -1053,4 +1079,5 @@ void ellipticSolveSetup(elliptic_t* elliptic, occa::properties &kernelInfo) } elliptic->residualProjection = new ResidualProjection(* elliptic, nVecsProject, nStepsStart); } + } diff --git a/src/nekInterface/NEKINTF b/src/nekInterface/NEKINTF index 05059a692..95790538f 100644 --- a/src/nekInterface/NEKINTF +++ b/src/nekInterface/NEKINTF @@ -4,8 +4,8 @@ common /c_is1/ glo_num((2**ldim)*lelt) integer*8 glo_num - COMMON /SCNRS/ SC_NRS(LX1*LY1*LZ1*LELT*3) - real SC_NRS + COMMON /NRSSCPTR/ nrs_scptr(100) + integer*8 nrs_scptr common /INTNRS/ getu, getp, gett, getps, llelt integer getu, getp, gett, getps, llelt diff --git a/src/nekInterface/nekInterface.f b/src/nekInterface/nekInterface.f index e2114e22b..76e182004 100644 --- a/src/nekInterface/nekInterface.f +++ b/src/nekInterface/nekInterface.f @@ -9,8 +9,7 @@ subroutine nekf_ptr(ptr,id,len) integer len character*(len) id - integer i8 - integer*8 ptr + integer*8 i8 pointer(ptr,i8) include 'SIZE' @@ -36,8 +35,6 @@ subroutine nekf_ptr(ptr,id,len) ptr = loc(ndim) elseif (id .eq. 'nx1') then ptr = loc(nx1) - elseif (id .eq. 'cb_scnrs') then - ptr = loc(sc_nrs(1)) elseif (id .eq. 'glo_num') then ptr = loc(glo_num(1)) elseif (id .eq. 'xc') then @@ -755,3 +752,22 @@ subroutine nekf_gen_bcmap() return end +c----------------------------------------------------------------------- + subroutine nekf_scptr(id,ptr) + + implicit none + + integer id + integer*8 i8 + pointer(ptr,i8) + + include 'SIZE' + include 'TOTAL' + include 'NEKINTF' + + ptr = nrs_scptr(id) + + return + end +c----------------------------------------------------------------------- + diff --git a/src/nekInterface/nekInterfaceAdapter.cpp b/src/nekInterface/nekInterfaceAdapter.cpp index fa1202f43..5e452844a 100644 --- a/src/nekInterface/nekInterfaceAdapter.cpp +++ b/src/nekInterface/nekInterfaceAdapter.cpp @@ -23,6 +23,7 @@ static void (* userqtl_ptr)(void); static void (* usrsetvert_ptr)(void); static void (* nek_ptr_ptr)(void**, char*, int*); +static void (* nek_scptr_ptr)(int*, void*); static void (* nek_outfld_ptr)(char*); static void (* nek_resetio_ptr)(void); static void (* nek_setio_ptr)(double*, int*, int*, int*, int*, int*, int*); @@ -53,6 +54,13 @@ void* nek_ptr(const char* id) return ptr; } +void* nek_scPtr(int id) +{ + void* ptr; + (*nek_scptr_ptr)(&id, &ptr); + return ptr; +} + void nek_outfld() { const char suffix[] = " "; @@ -237,6 +245,8 @@ void set_function_handles(const char* session_in,int verbose) nek_ptr_ptr = (void (*)(void**, char*, int*))dlsym(handle, fname("nekf_ptr")); check_error(dlerror()); + nek_scptr_ptr = (void (*)(int*, void*))dlsym(handle, fname("nekf_scptr")); + check_error(dlerror()); nek_setup_ptr = (void (*)(int*, char*, char*, int*, int*, int*, int*, int, int))dlsym(handle, fname("nekf_setup")); check_error(dlerror()); @@ -561,7 +571,6 @@ int nek_setup(MPI_Comm c, setupAide &options_in, ins_t** ins_in) nekData.zc = (double*) nek_ptr("zc"); nekData.glo_num = (long long*) nek_ptr("glo_num"); - nekData.cbscnrs = (double*) nek_ptr("cb_scnrs"); nekData.cbc = (char*) nek_ptr("cbc"); nekData.boundaryID = (int*) nek_ptr("boundaryID"); diff --git a/src/nekInterface/nekInterfaceAdapter.hpp b/src/nekInterface/nekInterfaceAdapter.hpp index b0317cfdb..ba95901ab 100644 --- a/src/nekInterface/nekInterfaceAdapter.hpp +++ b/src/nekInterface/nekInterfaceAdapter.hpp @@ -37,8 +37,6 @@ typedef struct int* ifgetu, * ifgetp, * ifgett, * ifgetps; - double* cbscnrs; - /* global vertex ids */ long long* glo_num; @@ -91,6 +89,7 @@ DECLARE_USER_FUNC(userqtl) #endif void* nek_ptr(const char* id); +void* nek_scPtr(int id); void nek_outfld(void); void nek_outfld(const char* suffix); void nek_outfld(const char* suffix, dfloat t, int coords,