diff --git a/examples/ethier/ethier.udf b/examples/ethier/ethier.udf index 522d03ed5..2e6135817 100644 --- a/examples/ethier/ethier.udf +++ b/examples/ethier/ethier.udf @@ -36,6 +36,10 @@ void userq(nrs_t *nrs, dfloat time, occa::memory o_S, occa::memory o_FS) void UDF_LoadKernels(occa::properties& kernelInfo) { + // called from all ranks, so MPI collectives are O.K. + int maxRank = platform->comm.mpiRank; + MPI_Allreduce(MPI_IN_PLACE, &maxRank, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + setupAide &options = platform->options; dfloat mue, rho; diff --git a/src/core/compileKernels.cpp b/src/core/compileKernels.cpp index 6889d97f2..ebd552f16 100644 --- a/src/core/compileKernels.cpp +++ b/src/core/compileKernels.cpp @@ -78,6 +78,14 @@ void compileKernels() { platform->kernels.compile(); + // load platform related kernels + std::string kernelName; + kernelName = "copyDfloatToPfloat"; + platform->copyDfloatToPfloatKernel = platform->kernels.get(kernelName); + + kernelName = "copyPfloatToDfloat"; + platform->copyPfloatToDfloatKernel = platform->kernels.get(kernelName); + MPI_Barrier(platform->comm.mpiComm); const double loadTime = MPI_Wtime() - tStart; diff --git a/src/core/device.cpp b/src/core/device.cpp index 974ad1b7b..891af992b 100644 --- a/src/core/device.cpp +++ b/src/core/device.cpp @@ -59,7 +59,7 @@ occa::kernel device_t::buildKernel(const std::string &fileName, const std::string &kernelName, const occa::properties &props, - std::string suffix) const + const std::string& suffix) const { if(fileName.find(".okl") != std::string::npos){ @@ -80,6 +80,27 @@ device_t::buildKernel(const std::string &fileName, } } +occa::kernel +device_t::buildKernel(const std::string &fileName, + const std::string &kernelName, + const occa::properties &props) const +{ + + const std::string suffix(""); + const bool buildNodeLocal = useNodeLocalCache(); + const int rank = buildNodeLocal ? _comm.localRank : _comm.mpiRank; + MPI_Comm localCommunicator = buildNodeLocal ? _comm.mpiCommLocal : _comm.mpiComm; + occa::kernel constructedKernel; + for(int pass = 0; pass < 2; ++pass){ + if((pass == 0 && rank == 0) || (pass == 1 && rank != 0)){ + constructedKernel = this->buildKernel(fileName, kernelName, props, suffix); + } + MPI_Barrier(localCommunicator); + } + return constructedKernel; + +} + occa::kernel device_t::buildKernel(const std::string &fullPath, const occa::properties &props, diff --git a/src/core/device.hpp b/src/core/device.hpp index 238c036b1..f4d6602f5 100644 --- a/src/core/device.hpp +++ b/src/core/device.hpp @@ -24,11 +24,6 @@ class device_t { occa::device& occaDevice() { return _device; } void finish() { _device.finish(); } - occa::kernel buildKernel(const std::string &fullPath, - const occa::properties &props) const; - occa::kernel buildKernel(const std::string &fullPath, - const occa::properties &props, - const std::string& suffix) const; occa::kernel buildKernel(const std::string &fullPath, const occa::properties &props, const std::string& suffix, @@ -36,14 +31,27 @@ class device_t { occa::kernel buildKernel(const std::string &fullPath, const occa::properties &props, bool buildRank0) const; + + // collective + occa::kernel buildKernel(const std::string &fileName, + const std::string &kernelName, + const occa::properties &props) const; + bool deviceAtomic; private: - friend occa::kernel udfBuildKernel(occa::properties, const char*); + + // non-collective + occa::kernel buildKernel(const std::string &fullPath, + const occa::properties &props) const; + occa::kernel buildKernel(const std::string &fullPath, + const occa::properties &props, + const std::string& suffix) const; occa::kernel buildKernel(const std::string &fileName, const std::string &kernelName, const occa::properties &props, - std::string suffix = "") const; + const std::string& suffix) const; + occa::kernel buildNativeKernel(const std::string &fileName, const std::string &kernelName, const occa::properties &props) const; diff --git a/src/core/kernelRequestManager.cpp b/src/core/kernelRequestManager.cpp index 34d03725a..4e906d5ba 100644 --- a/src/core/kernelRequestManager.cpp +++ b/src/core/kernelRequestManager.cpp @@ -114,7 +114,9 @@ kernelRequestManager_t::compile() const std::string fileName = kernelRequest.fileName; const std::string suffix = kernelRequest.suffix; const occa::properties props = kernelRequest.props; - auto kernel = device.buildKernel(fileName, props, suffix); + + // MPI staging already handled + auto kernel = device.buildKernel(fileName, props, suffix, false); requestToKernel[requestName] = kernel; } } @@ -130,7 +132,9 @@ kernelRequestManager_t::compile() const std::string fileName = kernelRequest.fileName; const std::string suffix = kernelRequest.suffix; const occa::properties props = kernelRequest.props; - auto kernel = device.buildKernel(fileName, props, suffix); + + // MPI staging already handled + auto kernel = device.buildKernel(fileName, props, suffix, false); requestToKernel[requestName] = kernel; } } diff --git a/src/core/platform.cpp b/src/core/platform.cpp index 67bf1c167..3d89f560e 100644 --- a/src/core/platform.cpp +++ b/src/core/platform.cpp @@ -140,11 +140,11 @@ platform_t::platform_t(setupAide& _options, MPI_Comm _commg, MPI_Comm _comm) const std::string oklpath = installDir + "/okl/"; kernelName = "copyDfloatToPfloat"; fileName = installDir + "/okl/core/" + kernelName + extension; - this->copyDfloatToPfloatKernel = this->device.buildKernel(fileName, this->kernelInfo); + this->kernels.add(kernelName, fileName, this->kernelInfo); kernelName = "copyPfloatToDfloat"; fileName = installDir + "/okl/core/" + kernelName + extension; - this->copyPfloatToDfloatKernel = this->device.buildKernel(fileName, this->kernelInfo); + this->kernels.add(kernelName, fileName, this->kernelInfo); } void memPool_t::allocate(const dlong offset, const dlong fields) { diff --git a/src/plugins/RANSktau.cpp b/src/plugins/RANSktau.cpp index 284b84daa..00d7156a4 100644 --- a/src/plugins/RANSktau.cpp +++ b/src/plugins/RANSktau.cpp @@ -73,23 +73,23 @@ void RANSktau::buildKernel(occa::properties kernelInfo) { kernelName = "computeHex3D"; fileName = path + kernelName + extension; - computeKernel = platform->device.buildKernel(fileName, kernelInfo); + computeKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "SijOijHex3D"; fileName = path + kernelName + extension; - SijOijKernel = platform->device.buildKernel(fileName, kernelInfo); + SijOijKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "SijOijMag2"; fileName = path + kernelName + extension; - SijOijMag2Kernel = platform->device.buildKernel(fileName, kernelInfo); + SijOijMag2Kernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "limit"; fileName = path + kernelName + extension; - limitKernel = platform->device.buildKernel(fileName, kernelInfo); + limitKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "mue"; fileName = path + kernelName + extension; - mueKernel = platform->device.buildKernel(fileName, kernelInfo); + mueKernel = platform->device.buildKernel(fileName, kernelInfo, true); } int Nscalar; diff --git a/src/plugins/avg.cpp b/src/plugins/avg.cpp index b6e4dc6fa..4978e1d2e 100644 --- a/src/plugins/avg.cpp +++ b/src/plugins/avg.cpp @@ -57,15 +57,15 @@ void avg::buildKernel(occa::properties kernelInfo) { kernelName = "EX"; fileName = path + kernelName + extension; - EXKernel = platform->device.buildKernel(fileName, kernelInfo); + EXKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "EXX"; fileName = path + kernelName + extension; - EXXKernel = platform->device.buildKernel(fileName, kernelInfo); + EXXKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "EXY"; fileName = path + kernelName + extension; - EXYKernel = platform->device.buildKernel(fileName, kernelInfo); + EXYKernel = platform->device.buildKernel(fileName, kernelInfo, true); } buildKernelCalled = 1; } diff --git a/src/plugins/lowMach.cpp b/src/plugins/lowMach.cpp index 25b29fb98..7ed674432 100644 --- a/src/plugins/lowMach.cpp +++ b/src/plugins/lowMach.cpp @@ -32,15 +32,15 @@ void lowMach::buildKernel(occa::properties kernelInfo) { kernelName = "qtlHex3D"; fileName = path + kernelName + extension; - qtlKernel = platform->device.buildKernel(fileName, kernelInfo); + qtlKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "p0thHelper"; fileName = path + kernelName + extension; - p0thHelperKernel = platform->device.buildKernel(fileName, kernelInfo); + p0thHelperKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "surfaceFlux"; fileName = path + kernelName + extension; - surfaceFluxKernel = platform->device.buildKernel(fileName, kernelInfo); + surfaceFluxKernel = platform->device.buildKernel(fileName, kernelInfo, true); } } diff --git a/src/plugins/velRecycling.cpp b/src/plugins/velRecycling.cpp index 7f9210108..503ba9dcd 100644 --- a/src/plugins/velRecycling.cpp +++ b/src/plugins/velRecycling.cpp @@ -49,16 +49,16 @@ void velRecycling::buildKernel(occa::properties kernelInfo) { kernelName = "setBCVectorValue"; fileName = path + kernelName + extension; - setBCVectorValueKernel = platform->device.buildKernel(fileName, kernelInfo); + setBCVectorValueKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "getBCFlux"; fileName = path + kernelName + extension; - getBCFluxKernel = platform->device.buildKernel(fileName, kernelInfo); + getBCFluxKernel = platform->device.buildKernel(fileName, kernelInfo, true); kernelName = "sumReduction"; fileName = path + kernelName + extension; - sumReductionKernel = platform->device.buildKernel(fileName, kernelInfo); + sumReductionKernel = platform->device.buildKernel(fileName, kernelInfo, true); } } diff --git a/src/udf/compileUDFKernels.cpp b/src/udf/compileUDFKernels.cpp index 3758fcac2..c4f110e8f 100644 --- a/src/udf/compileUDFKernels.cpp +++ b/src/udf/compileUDFKernels.cpp @@ -16,37 +16,24 @@ occa::properties compileUDFKernels() kernelInfo["flags"].asObject(); kernelInfo["include_paths"].asArray(); - auto rank = buildNodeLocal ? platform->comm.localRank : platform->comm.mpiRank; - auto communicator = buildNodeLocal ? platform->comm.mpiCommLocal : platform->comm.mpiComm; - MPI_Barrier(platform->comm.mpiComm); const double tStart = MPI_Wtime(); if (platform->comm.mpiRank == 0) printf("loading udf kernels ... "); fflush(stdout); - occa::properties kernelInfoBC; - - for(int pass = 0; pass < 2; ++pass) - { - bool executePass = (pass == 0) && (rank == 0); - executePass |= (pass == 1) && (rank != 0); - if(executePass){ - kernelInfoBC = kernelInfo; - if (udf.loadKernels) { - // side-effect: kernelInfoBC will include any relevant user-defined kernel props - udf.loadKernels(kernelInfoBC); - } - const std::string bcDataFile = installDir + "/include/core/bcData.h"; - kernelInfoBC["includes"] += bcDataFile.c_str(); - std::string boundaryHeaderFileName; - platform->options.getArgs("DATA FILE", boundaryHeaderFileName); - kernelInfoBC["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL); - - kernelInfoBC += meshKernelProperties(N); - } - MPI_Barrier(communicator); + occa::properties kernelInfoBC = kernelInfo; + if (udf.loadKernels) { + // side-effect: kernelInfoBC will include any relevant user-defined kernel props + udf.loadKernels(kernelInfoBC); } + const std::string bcDataFile = installDir + "/include/core/bcData.h"; + kernelInfoBC["includes"] += bcDataFile.c_str(); + std::string boundaryHeaderFileName; + platform->options.getArgs("DATA FILE", boundaryHeaderFileName); + kernelInfoBC["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL); + + kernelInfoBC += meshKernelProperties(N); MPI_Barrier(platform->comm.mpiComm); const double loadTime = MPI_Wtime() - tStart;