From afaabee31efb2a4ad9ed6621591e8792f76a6b65 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Jun 2022 15:46:34 -0700 Subject: [PATCH 001/137] Add runtime hip and cuda data routines --- src/common/CudaDataUtils.hpp | 135 +++++++++++++++++++++++++++++++++++ src/common/DataUtils.cpp | 23 ++++++ src/common/DataUtils.hpp | 4 ++ src/common/HipDataUtils.hpp | 135 +++++++++++++++++++++++++++++++++++ 4 files changed, 297 insertions(+) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 5010f982e..d06401c8d 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -15,6 +15,7 @@ #define RAJAPerf_CudaDataUtils_HPP #include "RPTypes.hpp" +#include #if defined(RAJA_ENABLE_CUDA) @@ -135,6 +136,17 @@ void allocCudaDeviceData(T& dptr, int len) len * sizeof(typename std::remove_pointer::type) ) ); } +/*! + * \brief Allocate CUDA managed data array (mptr). + */ +template +void allocCudaManagedData(T& mptr, int len) +{ + cudaErrchk( cudaMallocManaged( (void**)&mptr, + len * sizeof(typename std::remove_pointer::type), + cudaMemAttachGlobal ) ); +} + /*! * \brief Allocate CUDA pinned data array (pptr). */ @@ -181,6 +193,16 @@ void deallocCudaDeviceData(T& dptr) dptr = nullptr; } +/*! + * \brief Free managed data array. + */ +template +void deallocCudaManagedData(T& mptr) +{ + cudaErrchk( cudaFree( mptr ) ); + mptr = nullptr; +} + /*! * \brief Free pinned data array. */ @@ -191,6 +213,119 @@ void deallocCudaPinnedData(T& pptr) pptr = nullptr; } + +enum struct CudaData : int +{ + host = 0, + pinned, + managed, + device +}; + +extern CudaData cudaDataType; + + +/*! + * \brief Copy given hptr (host) data to CUDA (cptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed. + */ +template +void initCudaData(T& cptr, const T hptr, int len) +{ + cudaErrchk( cudaMemcpy( cptr, hptr, + len * sizeof(typename std::remove_pointer::type), + cudaMemcpyDefault ) ); + + incDataInitCount(); +} + +/*! + * \brief Allocate CUDA data array (cptr). + */ +template +void allocCudaData(T& cptr, int len) +{ + switch (cudaDataType) { + case CudaData::host: + { + allocData(cptr, len); + } break; + case CudaData::pinned: + { + allocCudaPinnedData(cptr, len); + } break; + case CudaData::managed: + { + allocCudaManagedData(cptr, len); + } break; + case CudaData::device: + { + allocCudaDeviceData(cptr, len); + } break; + default: + { + throw std::invalid_argument("allocCudaData : Unknown memory type"); + } break; + } +} + +/*! + * \brief Allocate CUDA data array (cptr) and copy given hptr (host) + * data to CUDA array. + */ +template +void allocAndInitCudaData(T& cptr, const T hptr, int len) +{ + allocCudaData(cptr, len); + initCudaData(cptr, hptr, len); +} + +/*! + * \brief Free Cuda data array. + */ +template +void deallocCudaData(T& cptr) +{ + switch (cudaDataType) { + case CudaData::host: + { + deallocData(cptr); + } break; + case CudaData::pinned: + { + deallocCudaPinnedData(cptr); + } break; + case CudaData::managed: + { + deallocCudaManagedData(cptr); + } break; + case CudaData::device: + { + deallocCudaDeviceData(cptr); + } break; + default: + { + throw std::invalid_argument("deallocCudaData : Unknown memory type"); + } break; + } +} + +/*! + * \brief Copy given cptr (CUDA) data to host (hptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed.
+ */ +template +void getCudaData(T& hptr, const T cptr, int len) +{ + cudaErrchk( cudaMemcpy( hptr, cptr, + len * sizeof(typename std::remove_pointer::type), + cudaMemcpyDefault ) ); +} + } // closing brace for rajaperf namespace #endif // RAJA_ENABLE_CUDA diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 6856d1f6c..1e0ae2417 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -7,6 +7,8 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "DataUtils.hpp" +#include "CudaDataUtils.hpp" +#include "HipDataUtils.hpp" #include "RAJA/internal/MemUtils_CPU.hpp" @@ -16,6 +18,13 @@ namespace rajaperf { +#if defined(RAJA_ENABLE_CUDA) +CudaData cudaDataType = CudaData::device; +#endif +#if defined(RAJA_ENABLE_HIP) +HipData hipDataType = HipData::device; +#endif + static int data_init_count = 0; /* @@ -87,6 +96,12 @@ void allocData(Int_ptr& ptr, int len) // Should we do this differently for alignment?? If so, change dealloc() ptr = new Int_type[len]; } +/// +void allocData(Index_type*& ptr, int len) +{ + // Should we do this differently for alignment?? If so, change dealloc() + ptr = new Index_type[len]; +} void allocData(Real_ptr& ptr, int len) { @@ -113,6 +128,14 @@ void deallocData(Int_ptr& ptr) } } +void deallocData(Index_type*& ptr) +{ + if (ptr) { + delete [] ptr; + ptr = 0; + } +} + void deallocData(Real_ptr& ptr) { if (ptr) { diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 887f54f82..7c8f7e657 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -96,6 +96,8 @@ void allocAndInitData(Complex_ptr& ptr, int len, */ void allocData(Int_ptr& ptr, int len); /// +void allocData(Index_type*& ptr, int len); +/// void allocData(Real_ptr& ptr, int len); /// void allocData(Complex_ptr& ptr, int len); @@ -105,6 +107,8 @@ void allocData(Complex_ptr& ptr, int len); */ void deallocData(Int_ptr& ptr); /// +void deallocData(Index_type*& ptr); +/// void deallocData(Real_ptr& ptr); /// void deallocData(Complex_ptr& ptr); diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index a3871d31e..4dbd26f70 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -15,6 +15,7 @@ #define RAJAPerf_HipDataUtils_HPP #include "RPTypes.hpp" +#include #if defined(RAJA_ENABLE_HIP) @@ -122,6 +123,17 @@ void allocHipDeviceData(T& dptr, int len) len * sizeof(typename std::remove_pointer::type) ) ); } +/*! + * \brief Allocate HIP managed data array (mptr). + */ +template +void allocHipManagedData(T& mptr, int len) +{ + hipErrchk( hipMallocManaged( (void**)&mptr, + len * sizeof(typename std::remove_pointer::type), + hipMemAttachGlobal ) ); +} + /*! * \brief Allocate HIP pinned data array (pptr). */ @@ -168,6 +180,16 @@ void deallocHipDeviceData(T& dptr) dptr = nullptr; } +/*! + * \brief Free managed data array. + */ +template +void deallocHipManagedData(T& mptr) +{ + hipErrchk( hipFree( mptr ) ); + mptr = nullptr; +} + /*! * \brief Free pinned data array. */ @@ -179,6 +201,119 @@ void deallocHipPinnedData(T& pptr) } + +enum struct HipData : int +{ + host = 0, + pinned, + managed, + device +}; + +extern HipData hipDataType; + + +/*! + * \brief Copy given hptr (host) data to HIP (cptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed. 
+ */ +template +void initHipData(T& cptr, const T hptr, int len) +{ + hipErrchk( hipMemcpy( cptr, hptr, + len * sizeof(typename std::remove_pointer::type), + hipMemcpyDefault ) ); + + incDataInitCount(); +} + +/*! + * \brief Allocate HIP data array (cptr). + */ +template +void allocHipData(T& cptr, int len) +{ + switch (hipDataType) { + case HipData::host: + { + allocData(cptr, len); + } break; + case HipData::pinned: + { + allocHipPinnedData(cptr, len); + } break; + case HipData::managed: + { + allocHipManagedData(cptr, len); + } break; + case HipData::device: + { + allocHipDeviceData(cptr, len); + } break; + default: + { + throw std::invalid_argument("allocHipData : Unknown memory type"); + } break; + } +} + +/*! + * \brief Allocate HIP data array (cptr) and copy given hptr (host) + * data to HIP array. + */ +template +void allocAndInitHipData(T& cptr, const T hptr, int len) +{ + allocHipData(cptr, len); + initHipData(cptr, hptr, len); +} + +/*! + * \brief Free Hip data array. + */ +template +void deallocHipData(T& cptr) +{ + switch (hipDataType) { + case HipData::host: + { + deallocData(cptr); + } break; + case HipData::pinned: + { + deallocHipPinnedData(cptr); + } break; + case HipData::managed: + { + deallocHipManagedData(cptr); + } break; + case HipData::device: + { + deallocHipDeviceData(cptr); + } break; + default: + { + throw std::invalid_argument("deallocHipData : Unknown memory type"); + } break; + } +} + +/*! + * \brief Copy given cptr (HIP) data to host (hptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed. + */ +template +void getHipData(T& hptr, const T cptr, int len) +{ + hipErrchk( hipMemcpy( hptr, cptr, + len * sizeof(typename std::remove_pointer::type), + hipMemcpyDefault ) ); +} + } // closing brace for rajaperf namespace #endif // RAJA_ENABLE_HIP From c892a30a8faf833a324092d9e55a9f8c2cbced39 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Jun 2022 15:47:54 -0700 Subject: [PATCH 002/137] Use runtime data routines in basic kernels --- src/basic/DAXPY-Cuda.cpp | 10 +++++----- src/basic/DAXPY-Hip.cpp | 10 +++++----- src/basic/DAXPY_ATOMIC-Cuda.cpp | 10 +++++----- src/basic/DAXPY_ATOMIC-Hip.cpp | 10 +++++----- src/basic/IF_QUAD-Cuda.cpp | 24 ++++++++++++------------ src/basic/IF_QUAD-Hip.cpp | 24 ++++++++++++------------ src/basic/INDEXLIST-Cuda.cpp | 10 +++++----- src/basic/INDEXLIST-Hip.cpp | 10 +++++----- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 14 +++++++------- src/basic/INDEXLIST_3LOOP-Hip.cpp | 14 +++++++------- src/basic/INIT3-Cuda.cpp | 26 +++++++++++++------------- src/basic/INIT3-Hip.cpp | 26 +++++++++++++------------- src/basic/INIT_VIEW1D-Cuda.cpp | 6 +++--- src/basic/INIT_VIEW1D-Hip.cpp | 6 +++--- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 6 +++--- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 6 +++--- src/basic/MAT_MAT_SHARED-Cuda.cpp | 18 +++++++++--------- src/basic/MAT_MAT_SHARED-Hip.cpp | 18 +++++++++--------- src/basic/MULADDSUB-Cuda.cpp | 26 +++++++++++++------------- src/basic/MULADDSUB-Hip.cpp | 26 +++++++++++++------------- src/basic/NESTED_INIT-Cuda.cpp | 6 +++--- src/basic/NESTED_INIT-Hip.cpp | 6 +++--- src/basic/REDUCE3_INT-Cuda.cpp | 4 ++-- src/basic/REDUCE3_INT-Hip.cpp | 4 ++-- src/basic/REDUCE_STRUCT-Cuda.cpp | 8 ++++---- src/basic/REDUCE_STRUCT-Hip.cpp | 8 ++++---- 26 files changed, 168 insertions(+), 168 deletions(-) diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index a87421c4f..ea85f3c50 100644 --- a/src/basic/DAXPY-Cuda.cpp +++
b/src/basic/DAXPY-Cuda.cpp @@ -22,13 +22,13 @@ namespace basic { #define DAXPY_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); + allocAndInitCudaData(x, m_x, iend); \ + allocAndInitCudaData(y, m_y, iend); #define DAXPY_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_y, y, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); + getCudaData(m_y, y, iend); \ + deallocCudaData(x); \ + deallocCudaData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 25810c19e..313b9a570 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -22,13 +22,13 @@ namespace basic { #define DAXPY_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(y, m_y, iend); + allocAndInitHipData(x, m_x, iend); \ + allocAndInitHipData(y, m_y, iend); #define DAXPY_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); + getHipData(m_y, y, iend); \ + deallocHipData(x); \ + deallocHipData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 1e8210bd2..b6630877c 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -22,13 +22,13 @@ namespace basic { #define DAXPY_ATOMIC_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); + allocAndInitCudaData(x, m_x, iend); \ + allocAndInitCudaData(y, m_y, iend); #define DAXPY_ATOMIC_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_y, y, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); + getCudaData(m_y, y, iend); \ + deallocCudaData(x); \ + deallocCudaData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index a1e7a6465..033f86628 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -22,13 +22,13 @@ namespace basic { #define DAXPY_ATOMIC_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(y, m_y, iend); + allocAndInitHipData(x, m_x, iend); \ + allocAndInitHipData(y, m_y, iend); #define DAXPY_ATOMIC_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); + getHipData(m_y, y, iend); \ + deallocHipData(x); \ + deallocHipData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 66146371c..abe8c2edd 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -22,20 +22,20 @@ namespace basic { #define IF_QUAD_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); \ - allocAndInitCudaDeviceData(x1, m_x1, iend); \ - allocAndInitCudaDeviceData(x2, m_x2, iend); + allocAndInitCudaData(a, m_a, iend); \ + allocAndInitCudaData(b, m_b, iend); \ + allocAndInitCudaData(c, m_c, iend); \ + allocAndInitCudaData(x1, m_x1, iend); \ + allocAndInitCudaData(x2, m_x2, iend); #define IF_QUAD_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x1, x1, iend); \ - getCudaDeviceData(m_x2, x2, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c); \ - deallocCudaDeviceData(x1); \ - deallocCudaDeviceData(x2); + getCudaData(m_x1, x1, 
iend); \ + getCudaData(m_x2, x2, iend); \ + deallocCudaData(a); \ + deallocCudaData(b); \ + deallocCudaData(c); \ + deallocCudaData(x1); \ + deallocCudaData(x2); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 6ded209a9..1cbedff25 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -22,20 +22,20 @@ namespace basic { #define IF_QUAD_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); \ - allocAndInitHipDeviceData(b, m_b, iend); \ - allocAndInitHipDeviceData(c, m_c, iend); \ - allocAndInitHipDeviceData(x1, m_x1, iend); \ - allocAndInitHipDeviceData(x2, m_x2, iend); + allocAndInitHipData(a, m_a, iend); \ + allocAndInitHipData(b, m_b, iend); \ + allocAndInitHipData(c, m_c, iend); \ + allocAndInitHipData(x1, m_x1, iend); \ + allocAndInitHipData(x2, m_x2, iend); #define IF_QUAD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x1, x1, iend); \ - getHipDeviceData(m_x2, x2, iend); \ - deallocHipDeviceData(a); \ - deallocHipDeviceData(b); \ - deallocHipDeviceData(c); \ - deallocHipDeviceData(x1); \ - deallocHipDeviceData(x2); + getHipData(m_x1, x1, iend); \ + getHipData(m_x2, x2, iend); \ + deallocHipData(a); \ + deallocHipData(b); \ + deallocHipData(c); \ + deallocHipData(x1); \ + deallocHipData(x2); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index bfb5399e7..6a8b05642 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -34,13 +34,13 @@ namespace basic #define INDEXLIST_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(list, m_list, iend); + allocAndInitCudaData(x, m_x, iend); \ + allocAndInitCudaData(list, m_list, iend); #define INDEXLIST_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_list, list, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(list); + getCudaData(m_list, list, iend); \ + deallocCudaData(x); \ + deallocCudaData(list); // perform a grid scan on val and returns the result at each thread diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 9707f635d..e38332046 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -34,13 +34,13 @@ namespace basic #define INDEXLIST_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(list, m_list, iend); + allocAndInitHipData(x, m_x, iend); \ + allocAndInitHipData(list, m_list, iend); #define INDEXLIST_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_list, list, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(list); + getHipData(m_list, list, iend); \ + deallocHipData(x); \ + deallocHipData(list); // perform a grid scan on val and returns the result at each thread diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 264000ba1..92bb08f9e 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -23,15 +23,15 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_CUDA \ Index_type* counts; \ - allocCudaDeviceData(counts, iend+1); \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(list, m_list, iend); + allocCudaData(counts, iend+1); \ + allocAndInitCudaData(x, m_x, iend); \ + allocAndInitCudaData(list, m_list, iend); #define INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(counts); \ - getCudaDeviceData(m_list, list, iend); \ - deallocCudaDeviceData(x); \ - 
deallocCudaDeviceData(list); + deallocCudaData(counts); \ + getCudaData(m_list, list, iend); \ + deallocCudaData(x); \ + deallocCudaData(list); template < size_t block_size > diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 3ec769058..73f554853 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -23,15 +23,15 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_HIP \ Index_type* counts; \ - allocHipDeviceData(counts, iend+1); \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(list, m_list, iend); + allocHipData(counts, iend+1); \ + allocAndInitHipData(x, m_x, iend); \ + allocAndInitHipData(list, m_list, iend); #define INDEXLIST_3LOOP_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(counts); \ - getHipDeviceData(m_list, list, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(list); + deallocHipData(counts); \ + getHipData(m_list, list, iend); \ + deallocHipData(x); \ + deallocHipData(list); template < size_t block_size > diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index 212a1e3a2..1ca4f769e 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -22,21 +22,21 @@ namespace basic { #define INIT3_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(out1, m_out1, iend); \ - allocAndInitCudaDeviceData(out2, m_out2, iend); \ - allocAndInitCudaDeviceData(out3, m_out3, iend); \ - allocAndInitCudaDeviceData(in1, m_in1, iend); \ - allocAndInitCudaDeviceData(in2, m_in2, iend); + allocAndInitCudaData(out1, m_out1, iend); \ + allocAndInitCudaData(out2, m_out2, iend); \ + allocAndInitCudaData(out3, m_out3, iend); \ + allocAndInitCudaData(in1, m_in1, iend); \ + allocAndInitCudaData(in2, m_in2, iend); #define INIT3_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out1, out1, iend); \ - getCudaDeviceData(m_out2, out2, iend); \ - getCudaDeviceData(m_out3, out3, iend); \ - deallocCudaDeviceData(out1); \ - deallocCudaDeviceData(out2); \ - deallocCudaDeviceData(out3); \ - deallocCudaDeviceData(in1); \ - deallocCudaDeviceData(in2); + getCudaData(m_out1, out1, iend); \ + getCudaData(m_out2, out2, iend); \ + getCudaData(m_out3, out3, iend); \ + deallocCudaData(out1); \ + deallocCudaData(out2); \ + deallocCudaData(out3); \ + deallocCudaData(in1); \ + deallocCudaData(in2); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index af3276a7d..3b29cc701 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -22,21 +22,21 @@ namespace basic { #define INIT3_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(out1, m_out1, iend); \ - allocAndInitHipDeviceData(out2, m_out2, iend); \ - allocAndInitHipDeviceData(out3, m_out3, iend); \ - allocAndInitHipDeviceData(in1, m_in1, iend); \ - allocAndInitHipDeviceData(in2, m_in2, iend); + allocAndInitHipData(out1, m_out1, iend); \ + allocAndInitHipData(out2, m_out2, iend); \ + allocAndInitHipData(out3, m_out3, iend); \ + allocAndInitHipData(in1, m_in1, iend); \ + allocAndInitHipData(in2, m_in2, iend); #define INIT3_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_out1, out1, iend); \ - getHipDeviceData(m_out2, out2, iend); \ - getHipDeviceData(m_out3, out3, iend); \ - deallocHipDeviceData(out1); \ - deallocHipDeviceData(out2); \ - deallocHipDeviceData(out3); \ - deallocHipDeviceData(in1); \ - deallocHipDeviceData(in2); + getHipData(m_out1, out1, iend); \ + getHipData(m_out2, out2, iend); \ + getHipData(m_out3, out3, iend); \ + deallocHipData(out1); \ + deallocHipData(out2); \ + 
deallocHipData(out3); \ + deallocHipData(in1); \ + deallocHipData(in2); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index be7a0bf97..5d9aec41b 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -22,11 +22,11 @@ namespace basic { #define INIT_VIEW1D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, getActualProblemSize()); + allocAndInitCudaData(a, m_a, getActualProblemSize()); #define INIT_VIEW1D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_a, a, getActualProblemSize()); \ - deallocCudaDeviceData(a); + getCudaData(m_a, a, getActualProblemSize()); \ + deallocCudaData(a); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 6f9d41924..28b3dc884 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -22,11 +22,11 @@ namespace basic { #define INIT_VIEW1D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); + allocAndInitHipData(a, m_a, iend); #define INIT_VIEW1D_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_a, a, iend); \ - deallocHipDeviceData(a); + getHipData(m_a, a, iend); \ + deallocHipData(a); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 2f7f6d34a..b908afbd2 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -22,11 +22,11 @@ namespace basic { #define INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, getActualProblemSize()); + allocAndInitCudaData(a, m_a, getActualProblemSize()); #define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_a, a, getActualProblemSize()); \ - deallocCudaDeviceData(a); + getCudaData(m_a, a, getActualProblemSize()); \ + deallocCudaData(a); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index ae98f56ab..f534dc81c 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -22,11 +22,11 @@ namespace basic { #define INIT_VIEW1D_OFFSET_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, getActualProblemSize()); + allocAndInitHipData(a, m_a, getActualProblemSize()); #define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_a, a, getActualProblemSize()); \ - deallocHipDeviceData(a); + getHipData(m_a, a, getActualProblemSize()); \ + deallocHipData(a); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index 0f702b83f..91b8d94be 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -21,17 +21,17 @@ namespace basic { #define MAT_MAT_SHARED_DATA_SETUP_CUDA \ const Index_type NN = m_N * m_N; \ - allocAndInitCudaDeviceData(A, m_A, NN); \ - allocAndInitCudaDeviceData(B, m_B, NN); \ - allocAndInitCudaDeviceData(C, m_C, NN); + allocAndInitCudaData(A, m_A, NN); \ + allocAndInitCudaData(B, m_B, NN); \ + allocAndInitCudaData(C, m_C, NN); #define MAT_MAT_SHARED_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_A, A, NN); \ - getCudaDeviceData(m_B, B, NN); \ - getCudaDeviceData(m_C, C, NN); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(C); + getCudaData(m_A, A, NN); \ + getCudaData(m_B, B, NN); \ + getCudaData(m_C, C, NN); \ + 
deallocCudaData(A); \ + deallocCudaData(B); \ + deallocCudaData(C); template < Index_type tile_size > __launch_bounds__(tile_size*tile_size) diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index ac1b4fb7f..15407bbf3 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -21,17 +21,17 @@ namespace basic { #define MAT_MAT_SHARED_DATA_SETUP_HIP \ const Index_type NN = m_N * m_N; \ - allocAndInitHipDeviceData(A, m_A, NN); \ - allocAndInitHipDeviceData(B, m_B, NN); \ - allocAndInitHipDeviceData(C, m_C, NN); + allocAndInitHipData(A, m_A, NN); \ + allocAndInitHipData(B, m_B, NN); \ + allocAndInitHipData(C, m_C, NN); #define MAT_MAT_SHARED_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, NN); \ - getHipDeviceData(m_B, B, NN); \ - getHipDeviceData(m_C, C, NN); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); \ - deallocHipDeviceData(C); + getHipData(m_A, A, NN); \ + getHipData(m_B, B, NN); \ + getHipData(m_C, C, NN); \ + deallocHipData(A); \ + deallocHipData(B); \ + deallocHipData(C); template < Index_type tile_size > __launch_bounds__(tile_size*tile_size) diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 3d8254c07..ea7d0bdb4 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -22,21 +22,21 @@ namespace basic { #define MULADDSUB_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(out1, m_out1, iend); \ - allocAndInitCudaDeviceData(out2, m_out2, iend); \ - allocAndInitCudaDeviceData(out3, m_out3, iend); \ - allocAndInitCudaDeviceData(in1, m_in1, iend); \ - allocAndInitCudaDeviceData(in2, m_in2, iend); + allocAndInitCudaData(out1, m_out1, iend); \ + allocAndInitCudaData(out2, m_out2, iend); \ + allocAndInitCudaData(out3, m_out3, iend); \ + allocAndInitCudaData(in1, m_in1, iend); \ + allocAndInitCudaData(in2, m_in2, iend); #define MULADDSUB_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out1, out1, iend); \ - getCudaDeviceData(m_out2, out2, iend); \ - getCudaDeviceData(m_out3, out3, iend); \ - deallocCudaDeviceData(out1); \ - deallocCudaDeviceData(out2); \ - deallocCudaDeviceData(out3); \ - deallocCudaDeviceData(in1); \ - deallocCudaDeviceData(in2); + getCudaData(m_out1, out1, iend); \ + getCudaData(m_out2, out2, iend); \ + getCudaData(m_out3, out3, iend); \ + deallocCudaData(out1); \ + deallocCudaData(out2); \ + deallocCudaData(out3); \ + deallocCudaData(in1); \ + deallocCudaData(in2); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index cb9076b38..fa22743bf 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -22,21 +22,21 @@ namespace basic { #define MULADDSUB_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(out1, m_out1, iend); \ - allocAndInitHipDeviceData(out2, m_out2, iend); \ - allocAndInitHipDeviceData(out3, m_out3, iend); \ - allocAndInitHipDeviceData(in1, m_in1, iend); \ - allocAndInitHipDeviceData(in2, m_in2, iend); + allocAndInitHipData(out1, m_out1, iend); \ + allocAndInitHipData(out2, m_out2, iend); \ + allocAndInitHipData(out3, m_out3, iend); \ + allocAndInitHipData(in1, m_in1, iend); \ + allocAndInitHipData(in2, m_in2, iend); #define MULADDSUB_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_out1, out1, iend); \ - getHipDeviceData(m_out2, out2, iend); \ - getHipDeviceData(m_out3, out3, iend); \ - deallocHipDeviceData(out1); \ - deallocHipDeviceData(out2); \ - deallocHipDeviceData(out3); \ - deallocHipDeviceData(in1); \ - deallocHipDeviceData(in2); + getHipData(m_out1, 
out1, iend); \ + getHipData(m_out2, out2, iend); \ + getHipData(m_out3, out3, iend); \ + deallocHipData(out1); \ + deallocHipData(out2); \ + deallocHipData(out3); \ + deallocHipData(in1); \ + deallocHipData(in2); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 7528c5cec..442124ab4 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -42,11 +42,11 @@ namespace basic #define NESTED_INIT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(array, m_array, m_array_length); + allocAndInitCudaData(array, m_array, m_array_length); #define NESTED_INIT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_array, array, m_array_length); \ - deallocCudaDeviceData(array); + getCudaData(m_array, array, m_array_length); \ + deallocCudaData(array); template< size_t i_block_size, size_t j_block_size, size_t k_block_size > __launch_bounds__(i_block_size*j_block_size*k_block_size) diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 49c050f6f..94c4c3c8e 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -42,11 +42,11 @@ namespace basic #define NESTED_INIT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(array, m_array, m_array_length); + allocAndInitHipData(array, m_array, m_array_length); #define NESTED_INIT_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_array, array, m_array_length); \ - deallocHipDeviceData(array); + getHipData(m_array, array, m_array_length); \ + deallocHipData(array); template< size_t i_block_size, size_t j_block_size, size_t k_block_size > __launch_bounds__(i_block_size*j_block_size*k_block_size) diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 0e7c645e7..25c36a65f 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -22,10 +22,10 @@ namespace basic { #define REDUCE3_INT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(vec, m_vec, iend); + allocAndInitCudaData(vec, m_vec, iend); #define REDUCE3_INT_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(vec); + deallocCudaData(vec); template < size_t block_size > diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 8e92cb123..62d1a2a56 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -22,10 +22,10 @@ namespace basic { #define REDUCE3_INT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(vec, m_vec, iend); + allocAndInitHipData(vec, m_vec, iend); #define REDUCE3_INT_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(vec); + deallocHipData(vec); template < size_t block_size > diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 52bec116d..25f1f4803 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -23,13 +23,13 @@ namespace basic #define REDUCE_STRUCT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(points.x, m_x, points.N); \ - allocAndInitCudaDeviceData(points.y, m_y, points.N); \ + allocAndInitCudaData(points.x, m_x, points.N); \ + allocAndInitCudaData(points.y, m_y, points.N); \ #define REDUCE_STRUCT_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(points.x); \ - deallocCudaDeviceData(points.y); + deallocCudaData(points.x); \ + deallocCudaData(points.y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index f72306107..668ea9db7 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -23,12 
+23,12 @@ namespace basic #define REDUCE_STRUCT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(points.x, m_x, points.N); \ - allocAndInitHipDeviceData(points.y, m_y, points.N); \ + allocAndInitHipData(points.x, m_x, points.N); \ + allocAndInitHipData(points.y, m_y, points.N); \ #define REDUCE_STRUCT_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(points.x); \ - deallocHipDeviceData(points.y); + deallocHipData(points.x); \ + deallocHipData(points.y); template < size_t block_size > __launch_bounds__(block_size) From 75563a44dbf1197b4816da240fe49fc90e03830b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Jun 2022 08:23:18 -0700 Subject: [PATCH 003/137] Move DataSpace into RAJAPerfSuite headers This will allow its use in RunParams --- src/common/CudaDataUtils.hpp | 32 +++++--------- src/common/DataUtils.cpp | 7 --- src/common/HipDataUtils.hpp | 33 +++++---------- src/common/RAJAPerfSuite.cpp | 82 ++++++++++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.hpp | 72 +++++++++++++++++++++++++++++++ 5 files changed, 176 insertions(+), 50 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index d06401c8d..6873630c8 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -19,6 +19,7 @@ #if defined(RAJA_ENABLE_CUDA) +#include "common/RAJAPerfSuite.hpp" #include "common/GPUUtils.hpp" #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" @@ -214,17 +215,6 @@ void deallocCudaPinnedData(T& pptr) } -enum struct CudaData : int -{ - host = 0, - pinned, - managed, - device -}; - -extern CudaData cudaDataType; - - /*! * \brief Copy given hptr (host) data to CUDA (cptr). * @@ -247,20 +237,20 @@ void initCudaData(T& cptr, const T hptr, int len) template void allocCudaData(T& cptr, int len) { - switch (cudaDataType) { - case CudaData::host: + switch (cudaDataSpace) { + case CudaData::Host: { allocData(cptr, len); } break; - case CudaData::pinned: + case CudaData::Pinned: { allocCudaPinnedData(cptr, len); } break; - case CudaData::managed: + case CudaData::Managed: { allocCudaManagedData(cptr, len); } break; - case CudaData::device: + case CudaData::Device: { allocCudaDeviceData(cptr, len); } break; @@ -288,20 +278,20 @@ void allocAndInitCudaData(T& cptr, const T hptr, int len) template void deallocCudaData(T& cptr) { - switch (cudaDataType) { - case CudaData::host: + switch (cudaDataSpace) { + case CudaData::Host: { deallocData(cptr); } break; - case CudaData::pinned: + case CudaData::Pinned: { deallocCudaPinnedData(cptr); } break; - case CudaData::managed: + case CudaData::Managed: { deallocCudaManagedData(cptr); } break; - case CudaData::device: + case CudaData::Device: { deallocCudaDeviceData(cptr); } break; diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 1e0ae2417..7e3710357 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -18,13 +18,6 @@ namespace rajaperf { -#if defined(RAJA_ENABLE_CUDA) -CudaData cudaDataType = CudaData::device; -#endif -#if defined(RAJA_ENABLE_HIP) -HipData hipDataType = HipData::device; -#endif - static int data_init_count = 0; /* diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 4dbd26f70..da29f5375 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -19,6 +19,7 @@ #if defined(RAJA_ENABLE_HIP) +#include "common/RAJAPerfSuite.hpp" #include "common/GPUUtils.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" @@ -201,18 +202,6 @@ void deallocHipPinnedData(T& pptr) } - -enum struct HipData : int -{ - host = 0, - pinned, - managed, - 
device -}; - -extern HipData hipDataType; - - /*! * \brief Copy given hptr (host) data to HIP (cptr). * @@ -235,20 +224,20 @@ void initHipData(T& cptr, const T hptr, int len) template void allocHipData(T& cptr, int len) { - switch (hipDataType) { - case HipData::host: + switch (hipDataSpace) { + case HipData::Host: { allocData(cptr, len); } break; - case HipData::pinned: + case HipData::Pinned: { allocHipPinnedData(cptr, len); } break; - case HipData::managed: + case HipData::Managed: { allocHipManagedData(cptr, len); } break; - case HipData::device: + case HipData::Device: { allocHipDeviceData(cptr, len); } break; @@ -276,20 +265,20 @@ void allocAndInitHipData(T& cptr, const T hptr, int len) template void deallocHipData(T& cptr) { - switch (hipDataType) { - case HipData::host: + switch (hipDataSpace) { + case HipData::Host: { deallocData(cptr); } break; - case HipData::pinned: + case HipData::Pinned: { deallocHipPinnedData(cptr); } break; - case HipData::managed: + case HipData::Managed: { deallocHipManagedData(cptr); } break; - case HipData::device: + case HipData::Device: { deallocHipDeviceData(cptr); } break; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 7578f5afd..0a48c8c39 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -318,6 +318,60 @@ static const std::string FeatureNames [] = }; // END FeatureNames +/*! + ******************************************************************************* + * + * \brief Array of names for each Cuda Memory Space in suite. + * + * IMPORTANT: This is only modified when a new memory space is added to the suite. + * + * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH + * ENUM OF CUDADATA IDS IN HEADER FILE!!! + * + ******************************************************************************* + */ +static const std::string CudaDataNames [] = +{ + + std::string("Host"), + std::string("Pinned"), + std::string("Managed"), + std::string("Device"), + + std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... + +}; // END VariantNames + +CudaData cudaDataSpace = CudaData::Device; + + +/*! + ******************************************************************************* + * + * \brief Array of names for each Hip Memory Space in suite. + * + * IMPORTANT: This is only modified when a new memory space is added to the suite. + * + * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH + * ENUM OF HIPDATA IDS IN HEADER FILE!!! + * + ******************************************************************************* + */ +static const std::string HipDataNames [] = +{ + + std::string("Host"), + std::string("Pinned"), + std::string("Managed"), + std::string("Device"), + + std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... + +}; // END VariantNames + +HipData hipDataSpace = HipData::Device; + + /* ******************************************************************************* * @@ -371,6 +425,7 @@ const std::string& getVariantName(VariantID vid) return VariantNames[vid]; } + /*! ******************************************************************************* * @@ -506,6 +561,33 @@ const std::string& getFeatureName(FeatureID fid) return FeatureNames[fid]; } + +/* + ******************************************************************************* + * + * Return memory space name associated with CudaData enum value. 
+ * + ******************************************************************************* + */ +const std::string& getCudaDataName(CudaData cd) +{ + return CudaDataNames[static_cast(cd)]; +} + + +/* + ******************************************************************************* + * + * Return memory space name associated with HipData enum value. + * + ******************************************************************************* + */ +const std::string& getHipDataName(HipData hd) +{ + return HipDataNames[static_cast(hd)]; +} + + /* ******************************************************************************* * diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 07feeede9..ca0734cb9 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -233,6 +233,60 @@ enum FeatureID { }; +/*! + ******************************************************************************* + * + * \brief Enumeration defining unique id for each (CUDA) Data memory space + * used in suite. + * + * IMPORTANT: This is only modified when a new memory space is used in suite. + * + * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH + * ARRAY OF MEMORY SPACE NAMES IN IMPLEMENTATION FILE!!! + * + ******************************************************************************* + */ +enum struct CudaData { + + Host = 0, + Pinned, + Managed, + Device, + + NumSpaces // Keep this one last and NEVER comment out (!!) + +}; + +extern CudaData cudaDataSpace; + + +/*! + ******************************************************************************* + * + * \brief Enumeration defining unique id for each (HIP) Data memory space + * used in suite. + * + * IMPORTANT: This is only modified when a new memory space is used in suite. + * + * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH + * ARRAY OF MEMORY SPACE NAMES IN IMPLEMENTATION FILE!!! + * + ******************************************************************************* + */ +enum struct HipData { + + Host = 0, + Pinned, + Managed, + Device, + + NumSpaces // Keep this one last and NEVER comment out (!!) + +}; + +extern HipData hipDataSpace; + + /*! ******************************************************************************* * @@ -302,6 +356,24 @@ bool isVariantGPU(VariantID vid); */ const std::string& getFeatureName(FeatureID vid); +/*! + ******************************************************************************* + * + * \brief Return memory space name associated with CudaData enum value. + * + ******************************************************************************* + */ +const std::string& getCudaDataName(CudaData cd); + +/*! + ******************************************************************************* + * + * \brief Return memory space name associated with HipData enum value. + * + ******************************************************************************* + */ +const std::string& getHipDataName(HipData hd); + /*! 
******************************************************************************* * From 415f1df4764e4c2573bcc0d9a3b1a0bdb6db25ec Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Jun 2022 09:58:49 -0700 Subject: [PATCH 004/137] Add run params for cuda/hip memory spaces --- src/common/RunParams.cpp | 105 +++++++++++++++++++++++++++++++++++ src/common/RunParams.hpp | 2 + 2 files changed, 107 insertions(+) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 58e6ba086..2af4c6460 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -108,6 +108,9 @@ void RunParams::print(std::ostream& str) const str << "\n outdir = " << outdir; str << "\n outfile_prefix = " << outfile_prefix; + str << "\n cuda memory space = " << getCudaDataName(cudaDataSpace); + str << "\n hip memory space = " << getHipDataName(hipDataSpace); + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; } @@ -205,6 +208,18 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) printVariantNames(getCout()); input_state = InfoRequest; + } else if ( opt == std::string("--print-cuda_memory_spaces") || + opt == std::string("-pcms") ) { + + printCudaDataNames(getCout()); + input_state = InfoRequest; + + } else if ( opt == std::string("--print-hip_memory_spaces") || + opt == std::string("-phms") ) { + + printHipDataNames(getCout()); + input_state = InfoRequest; + } else if ( opt == std::string("--print-features") || opt == std::string("-pf") ) { @@ -424,6 +439,58 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } + } else if ( opt == std::string("--cuda_memory_space") || + opt == std::string("-cms") ) { + + bool got_something = false; + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + for (int cms = 0; cms < static_cast(CudaData::NumSpaces); ++cms) { + if (getCudaDataName(static_cast(cms)) == opt) { + cudaDataSpace = static_cast(cms); + got_something = true; + break; + } + } + if (!got_something) { + getCout() << "\nBad input:" + << " must give --cuda_memory_space a valid cuda memory space" + << std::endl; + input_state = BadInput; + } + } + } + + } else if ( opt == std::string("--hip_memory_space") || + opt == std::string("-hms") ) { + + bool got_something = false; + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + for (int hms = 0; hms < static_cast(HipData::NumSpaces); ++hms) { + if (getHipDataName(static_cast(hms)) == opt) { + hipDataSpace = static_cast(hms); + got_something = true; + break; + } + } + if (!got_something) { + getCout() << "\nBad input:" + << " must give --hip_memory_space a valid hip memory space" + << std::endl; + input_state = BadInput; + } + } + } } else if ( std::string(argv[i]) == std::string("--features") || std::string(argv[i]) == std::string("-f") ) { @@ -554,6 +621,10 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --print-variants, -pv (print names of available variants to run)\n\n"; + str << "\t --print-cuda_memory_spaces, -pcms (print names of cuda memory spaces)\n\n"; + + str << "\t --print-hip_memory_spaces, -phms (print names of hip memory spaces)\n\n"; + str << "\t --print-features, -pf (print names of RAJA features exercised in Suite)\n\n"; str << "\t --print-feature-kernels, -pfk \n" << "\t (print names of kernels that use each feature)\n\n"; @@ -628,6 +699,18 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t --exclude-variants RAJA_CUDA (exclude all RAJA_CUDA kernel variants)\n"
<< "\t\t -ev Base_Seq RAJA_CUDA (exclude Base_Seq and RAJA_CUDA variants)\n\n"; + str << "\t --cuda_memory_space, -cms [Default is Device]\n" + << "\t (names of memory space to use)\n"; + str << "\t\t Examples...\n" + << "\t\t --cuda_memory_space Managed (run CUDA kernels with Managed memory)\n" + << "\t\t -cms Pinned (run CUDA kernels with Pinned memory)\n\n"; + + str << "\t --hip_memory_space, -hms [Default is Device]\n" + << "\t (names of memory space to use)\n"; + str << "\t\t Examples...\n" + << "\t\t --hip_memory_space Managed (run HIP kernels with Managed memory)\n" + << "\t\t -hms Pinned (run HIP kernels with Pinned memory)\n\n"; + str << "\t --features, -f [Default is run all]\n" << "\t (names of features to run)\n"; str << "\t\t Examples...\n" @@ -708,6 +791,28 @@ void RunParams::printVariantNames(std::ostream& str) const } +void RunParams::printCudaDataNames(std::ostream& str) const +{ + str << "\nAvailable cuda memory spaces:"; + str << "\n-------------------\n"; + for (int cms = 0; cms < static_cast(CudaData::NumSpaces); ++cms) { + str << getCudaDataName(static_cast(cms)) << std::endl; + } + str.flush(); +} + + +void RunParams::printHipDataNames(std::ostream& str) const +{ + str << "\nAvailable hip memory spaces:"; + str << "\n-------------------\n"; + for (int hms = 0; hms < static_cast(HipData::NumSpaces); ++hms) { + str << getHipDataName(static_cast(hms)) << std::endl; + } + str.flush(); +} + + void RunParams::printGroupNames(std::ostream& str) const { str << "\nAvailable groups:"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index d0e7d81bf..10a95d8ed 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -212,6 +212,8 @@ class RunParams { void printFullKernelNames(std::ostream& str) const; void printKernelNames(std::ostream& str) const; void printVariantNames(std::ostream& str) const; + void printCudaDataNames(std::ostream& str) const; + void printHipDataNames(std::ostream& str) const; void printGroupNames(std::ostream& str) const; void printFeatureNames(std::ostream& str) const; void printFeatureKernels(std::ostream& str) const; From 52ea38e7db182224251a613c74cf0e04486b8e17 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Jun 2022 13:35:21 -0700 Subject: [PATCH 005/137] Use runtime gpu allocators in algorithm --- src/algorithm/MEMCPY-Cuda.cpp | 10 +++++----- src/algorithm/MEMCPY-Hip.cpp | 10 +++++----- src/algorithm/MEMSET-Cuda.cpp | 6 +++--- src/algorithm/MEMSET-Hip.cpp | 6 +++--- src/algorithm/REDUCE_SUM-Cuda.cpp | 4 ++-- src/algorithm/REDUCE_SUM-Hip.cpp | 4 ++-- src/algorithm/SCAN-Cuda.cpp | 10 +++++----- src/algorithm/SCAN-Hip.cpp | 10 +++++----- src/algorithm/SORT-Cuda.cpp | 6 +++--- src/algorithm/SORT-Hip.cpp | 6 +++--- src/algorithm/SORTPAIRS-Cuda.cpp | 12 ++++++------ src/algorithm/SORTPAIRS-Hip.cpp | 12 ++++++------ 12 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp index 8862a02df..c33e16cec 100644 --- a/src/algorithm/MEMCPY-Cuda.cpp +++ b/src/algorithm/MEMCPY-Cuda.cpp @@ -22,13 +22,13 @@ namespace algorithm { #define MEMCPY_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); + allocAndInitCudaData(x, m_x, iend); \ + allocAndInitCudaData(y, m_y, iend); #define MEMCPY_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_y, y, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); + getCudaData(m_y, y, iend); \ + deallocCudaData(x); \ + deallocCudaData(y); template < size_t block_size 
> __launch_bounds__(block_size) diff --git a/src/algorithm/MEMCPY-Hip.cpp b/src/algorithm/MEMCPY-Hip.cpp index 742d77cf0..4beb858e8 100644 --- a/src/algorithm/MEMCPY-Hip.cpp +++ b/src/algorithm/MEMCPY-Hip.cpp @@ -22,13 +22,13 @@ namespace algorithm { #define MEMCPY_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(y, m_y, iend); + allocAndInitHipData(x, m_x, iend); \ + allocAndInitHipData(y, m_y, iend); #define MEMCPY_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); + getHipData(m_y, y, iend); \ + deallocHipData(x); \ + deallocHipData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp index 639169563..9e13b2446 100644 --- a/src/algorithm/MEMSET-Cuda.cpp +++ b/src/algorithm/MEMSET-Cuda.cpp @@ -22,11 +22,11 @@ namespace algorithm { #define MEMSET_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); + allocAndInitCudaData(x, m_x, iend); #define MEMSET_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, iend); \ - deallocCudaDeviceData(x); + getCudaData(m_x, x, iend); \ + deallocCudaData(x); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp index f67498171..ca91c5755 100644 --- a/src/algorithm/MEMSET-Hip.cpp +++ b/src/algorithm/MEMSET-Hip.cpp @@ -22,11 +22,11 @@ namespace algorithm { #define MEMSET_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); + allocAndInitHipData(x, m_x, iend); #define MEMSET_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, iend); \ - deallocHipDeviceData(x); + getHipData(m_x, x, iend); \ + deallocHipData(x); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index c8b4bb8e4..6812743d7 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -25,10 +25,10 @@ namespace algorithm { #define REDUCE_SUM_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); + allocAndInitCudaData(x, m_x, iend); #define REDUCE_SUM_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(x); + deallocCudaData(x); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 691db7fae..05a800852 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -30,10 +30,10 @@ namespace algorithm { #define REDUCE_SUM_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); + allocAndInitHipData(x, m_x, iend); #define REDUCE_SUM_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(x); + deallocHipData(x); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 34a9c1f1e..8448dd70f 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -25,13 +25,13 @@ namespace algorithm { #define SCAN_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); + allocAndInitCudaData(x, m_x, iend); \ + allocAndInitCudaData(y, m_y, iend); #define SCAN_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_y, y, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); + getCudaData(m_y, y, iend); \ + deallocCudaData(x); \ + deallocCudaData(y); void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SCAN-Hip.cpp 
b/src/algorithm/SCAN-Hip.cpp index 14a0f647b..fcf644971 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -30,13 +30,13 @@ namespace algorithm { #define SCAN_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(y, m_y, iend); + allocAndInitHipData(x, m_x, iend); \ + allocAndInitHipData(y, m_y, iend); #define SCAN_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); + getHipData(m_y, y, iend); \ + deallocHipData(x); \ + deallocHipData(y); void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 599a9f246..7599722bb 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -22,11 +22,11 @@ namespace algorithm { #define SORT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend*run_reps); + allocAndInitCudaData(x, m_x, iend*run_reps); #define SORT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, iend*run_reps); \ - deallocCudaDeviceData(x); + getCudaData(m_x, x, iend*run_reps); \ + deallocCudaData(x); void SORT::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index 6c80fbc97..4df222862 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -22,11 +22,11 @@ namespace algorithm { #define SORT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend*run_reps); + allocAndInitHipData(x, m_x, iend*run_reps); #define SORT_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, iend*run_reps); \ - deallocHipDeviceData(x); + getHipData(m_x, x, iend*run_reps); \ + deallocHipData(x); void SORT::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 0c09bfe1f..5c3e6b535 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -22,14 +22,14 @@ namespace algorithm { #define SORTPAIRS_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend*run_reps); \ - allocAndInitCudaDeviceData(i, m_i, iend*run_reps); + allocAndInitCudaData(x, m_x, iend*run_reps); \ + allocAndInitCudaData(i, m_i, iend*run_reps); #define SORTPAIRS_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, iend*run_reps); \ - getCudaDeviceData(m_i, i, iend*run_reps); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(i); + getCudaData(m_x, x, iend*run_reps); \ + getCudaData(m_i, i, iend*run_reps); \ + deallocCudaData(x); \ + deallocCudaData(i); void SORTPAIRS::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index fbdbc660d..7fb106e24 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -22,14 +22,14 @@ namespace algorithm { #define SORTPAIRS_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend*run_reps); \ - allocAndInitHipDeviceData(i, m_i, iend*run_reps); + allocAndInitHipData(x, m_x, iend*run_reps); \ + allocAndInitHipData(i, m_i, iend*run_reps); #define SORTPAIRS_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, iend*run_reps); \ - getHipDeviceData(m_i, i, iend*run_reps); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(i); + getHipData(m_x, x, iend*run_reps); \ + getHipData(m_i, i, iend*run_reps); \ + deallocHipData(x); \ + deallocHipData(i); void SORTPAIRS::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) From 
cca0b14ed09e90626d340be8337c954fd74e5ba4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Jun 2022 13:35:49 -0700 Subject: [PATCH 006/137] Use runtime GPU allocation in stream --- src/stream/ADD-Cuda.cpp | 14 +++++++------- src/stream/ADD-Hip.cpp | 14 +++++++------- src/stream/COPY-Cuda.cpp | 10 +++++----- src/stream/COPY-Hip.cpp | 10 +++++----- src/stream/DOT-Cuda.cpp | 8 ++++---- src/stream/DOT-Hip.cpp | 8 ++++---- src/stream/MUL-Cuda.cpp | 10 +++++----- src/stream/MUL-Hip.cpp | 10 +++++----- src/stream/TRIAD-Cuda.cpp | 14 +++++++------- src/stream/TRIAD-Hip.cpp | 14 +++++++------- 10 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 102774a13..e50a5bf91 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -22,15 +22,15 @@ namespace stream { #define ADD_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); + allocAndInitCudaData(a, m_a, iend); \ + allocAndInitCudaData(b, m_b, iend); \ + allocAndInitCudaData(c, m_c, iend); #define ADD_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_c, c, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c); + getCudaData(m_c, c, iend); \ + deallocCudaData(a); \ + deallocCudaData(b); \ + deallocCudaData(c); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 5e53500c8..e427bacaa 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -22,15 +22,15 @@ namespace stream { #define ADD_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); \ - allocAndInitHipDeviceData(b, m_b, iend); \ - allocAndInitHipDeviceData(c, m_c, iend); + allocAndInitHipData(a, m_a, iend); \ + allocAndInitHipData(b, m_b, iend); \ + allocAndInitHipData(c, m_c, iend); #define ADD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_c, c, iend); \ - deallocHipDeviceData(a); \ - deallocHipDeviceData(b); \ - deallocHipDeviceData(c); + getHipData(m_c, c, iend); \ + deallocHipData(a); \ + deallocHipData(b); \ + deallocHipData(c); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index cddf986ac..4061dea92 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -22,13 +22,13 @@ namespace stream { #define COPY_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); + allocAndInitCudaData(a, m_a, iend); \ + allocAndInitCudaData(c, m_c, iend); #define COPY_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_c, c, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(c); + getCudaData(m_c, c, iend); \ + deallocCudaData(a); \ + deallocCudaData(c); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index fe302a7fc..7bf934dd3 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -22,13 +22,13 @@ namespace stream { #define COPY_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); \ - allocAndInitHipDeviceData(c, m_c, iend); + allocAndInitHipData(a, m_a, iend); \ + allocAndInitHipData(c, m_c, iend); #define COPY_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_c, c, iend); \ - deallocHipDeviceData(a); \ - deallocHipDeviceData(c); + getHipData(m_c, c, iend); \ + deallocHipData(a); \ + deallocHipData(c); template < size_t block_size > __launch_bounds__(block_size) diff 
--git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index de23c290b..924b45ea7 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -23,12 +23,12 @@ namespace stream { #define DOT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); + allocAndInitCudaData(a, m_a, iend); \ + allocAndInitCudaData(b, m_b, iend); #define DOT_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); + deallocCudaData(a); \ + deallocCudaData(b); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 3e75e64ef..d030a563d 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -23,12 +23,12 @@ namespace stream { #define DOT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); \ - allocAndInitHipDeviceData(b, m_b, iend); + allocAndInitHipData(a, m_a, iend); \ + allocAndInitHipData(b, m_b, iend); #define DOT_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(a); \ - deallocHipDeviceData(b); + deallocHipData(a); \ + deallocHipData(b); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 8db12d087..33a97f168 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -22,13 +22,13 @@ namespace stream { #define MUL_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); + allocAndInitCudaData(b, m_b, iend); \ + allocAndInitCudaData(c, m_c, iend); #define MUL_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_b, b, iend); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c) + getCudaData(m_b, b, iend); \ + deallocCudaData(b); \ + deallocCudaData(c) template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 3e5e3f9f0..00e1cc2c2 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -22,13 +22,13 @@ namespace stream { #define MUL_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(b, m_b, iend); \ - allocAndInitHipDeviceData(c, m_c, iend); + allocAndInitHipData(b, m_b, iend); \ + allocAndInitHipData(c, m_c, iend); #define MUL_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_b, b, iend); \ - deallocHipDeviceData(b); \ - deallocHipDeviceData(c) + getHipData(m_b, b, iend); \ + deallocHipData(b); \ + deallocHipData(c) template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 234683493..10e63132a 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -22,15 +22,15 @@ namespace stream { #define TRIAD_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); + allocAndInitCudaData(a, m_a, iend); \ + allocAndInitCudaData(b, m_b, iend); \ + allocAndInitCudaData(c, m_c, iend); #define TRIAD_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_a, a, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c); + getCudaData(m_a, a, iend); \ + deallocCudaData(a); \ + deallocCudaData(b); \ + deallocCudaData(c); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 740727530..25b798ad5 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -22,15 +22,15 @@ namespace stream { #define TRIAD_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); \ - 
allocAndInitHipDeviceData(b, m_b, iend); \ - allocAndInitHipDeviceData(c, m_c, iend); + allocAndInitHipData(a, m_a, iend); \ + allocAndInitHipData(b, m_b, iend); \ + allocAndInitHipData(c, m_c, iend); #define TRIAD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_a, a, iend); \ - deallocHipDeviceData(a); \ - deallocHipDeviceData(b); \ - deallocHipDeviceData(c); + getHipData(m_a, a, iend); \ + deallocHipData(a); \ + deallocHipData(b); \ + deallocHipData(c); template < size_t block_size > __launch_bounds__(block_size) From 3dc84017052cb6db766f701bfe7712023db25b2c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Jun 2022 13:40:50 -0700 Subject: [PATCH 007/137] Use runtime allocators in polybench --- src/polybench/POLYBENCH_2MM-Cuda.cpp | 22 +++++------ src/polybench/POLYBENCH_2MM-Hip.cpp | 22 +++++------ src/polybench/POLYBENCH_3MM-Cuda.cpp | 30 +++++++-------- src/polybench/POLYBENCH_3MM-Hip.cpp | 30 +++++++-------- src/polybench/POLYBENCH_ADI-Cuda.cpp | 18 ++++----- src/polybench/POLYBENCH_ADI-Hip.cpp | 18 ++++----- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 18 ++++----- src/polybench/POLYBENCH_ATAX-Hip.cpp | 18 ++++----- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 16 ++++---- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 16 ++++---- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 10 ++--- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 10 ++--- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 14 +++---- src/polybench/POLYBENCH_GEMM-Hip.cpp | 14 +++---- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 38 +++++++++---------- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 38 +++++++++---------- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 18 ++++----- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 18 ++++----- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 12 +++--- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 12 +++--- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 12 +++--- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 12 +++--- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 12 +++--- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 12 +++--- src/polybench/POLYBENCH_MVT-Cuda.cpp | 24 ++++++------ src/polybench/POLYBENCH_MVT-Hip.cpp | 24 ++++++------ 26 files changed, 244 insertions(+), 244 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 40b1f5ca3..19cbfe59b 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -45,20 +45,20 @@ namespace polybench #define POLYBENCH_2MM_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(tmp, m_tmp, m_ni * m_nj); \ - allocAndInitCudaDeviceData(A, m_A, m_ni * m_nk); \ - allocAndInitCudaDeviceData(B, m_B, m_nk * m_nj); \ - allocAndInitCudaDeviceData(C, m_C, m_nj * m_nl); \ - allocAndInitCudaDeviceData(D, m_D, m_ni * m_nl); + allocAndInitCudaData(tmp, m_tmp, m_ni * m_nj); \ + allocAndInitCudaData(A, m_A, m_ni * m_nk); \ + allocAndInitCudaData(B, m_B, m_nk * m_nj); \ + allocAndInitCudaData(C, m_C, m_nj * m_nl); \ + allocAndInitCudaData(D, m_D, m_ni * m_nl); #define POLYBENCH_2MM_TEARDOWN_CUDA \ - getCudaDeviceData(m_D, D, m_ni * m_nl); \ - deallocCudaDeviceData(tmp); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(C); \ - deallocCudaDeviceData(D); + getCudaData(m_D, D, m_ni * m_nl); \ + deallocCudaData(tmp); \ + deallocCudaData(A); \ + deallocCudaData(B); \ + deallocCudaData(C); \ + deallocCudaData(D); template < size_t in_block_size, size_t out_block_size > diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 15ffa80df..8ce7ed4bc 100644 --- 
a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -45,20 +45,20 @@ namespace polybench #define POLYBENCH_2MM_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(tmp, m_tmp, m_ni * m_nj); \ - allocAndInitHipDeviceData(A, m_A, m_ni * m_nk); \ - allocAndInitHipDeviceData(B, m_B, m_nk * m_nj); \ - allocAndInitHipDeviceData(C, m_C, m_nj * m_nl); \ - allocAndInitHipDeviceData(D, m_D, m_ni * m_nl); + allocAndInitHipData(tmp, m_tmp, m_ni * m_nj); \ + allocAndInitHipData(A, m_A, m_ni * m_nk); \ + allocAndInitHipData(B, m_B, m_nk * m_nj); \ + allocAndInitHipData(C, m_C, m_nj * m_nl); \ + allocAndInitHipData(D, m_D, m_ni * m_nl); #define POLYBENCH_2MM_TEARDOWN_HIP \ - getHipDeviceData(m_D, D, m_ni * m_nl); \ - deallocHipDeviceData(tmp); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); \ - deallocHipDeviceData(C); \ - deallocHipDeviceData(D); + getHipData(m_D, D, m_ni * m_nl); \ + deallocHipData(tmp); \ + deallocHipData(A); \ + deallocHipData(B); \ + deallocHipData(C); \ + deallocHipData(D); template < size_t in_block_size, size_t out_block_size > __launch_bounds__(in_block_size*out_block_size) diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index f9b151ebf..0294e8a53 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -50,24 +50,24 @@ namespace polybench #define POLYBENCH_3MM_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(A, m_A, m_ni * m_nk); \ - allocAndInitCudaDeviceData(B, m_B, m_nk * m_nj); \ - allocAndInitCudaDeviceData(C, m_C, m_nj * m_nm); \ - allocAndInitCudaDeviceData(D, m_D, m_nm * m_nl); \ - allocAndInitCudaDeviceData(E, m_E, m_ni * m_nj); \ - allocAndInitCudaDeviceData(F, m_F, m_nj * m_nl); \ - allocAndInitCudaDeviceData(G, m_G, m_ni * m_nl); + allocAndInitCudaData(A, m_A, m_ni * m_nk); \ + allocAndInitCudaData(B, m_B, m_nk * m_nj); \ + allocAndInitCudaData(C, m_C, m_nj * m_nm); \ + allocAndInitCudaData(D, m_D, m_nm * m_nl); \ + allocAndInitCudaData(E, m_E, m_ni * m_nj); \ + allocAndInitCudaData(F, m_F, m_nj * m_nl); \ + allocAndInitCudaData(G, m_G, m_ni * m_nl); #define POLYBENCH_3MM_TEARDOWN_CUDA \ - getCudaDeviceData(m_G, G, m_ni * m_nl); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(C); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(E); \ - deallocCudaDeviceData(F); \ - deallocCudaDeviceData(G); + getCudaData(m_G, G, m_ni * m_nl); \ + deallocCudaData(A); \ + deallocCudaData(B); \ + deallocCudaData(C); \ + deallocCudaData(D); \ + deallocCudaData(E); \ + deallocCudaData(F); \ + deallocCudaData(G); template < size_t in_block_size, size_t out_block_size > __launch_bounds__(in_block_size*out_block_size) diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 4199f0c44..3f84ad5bc 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -50,24 +50,24 @@ namespace polybench #define POLYBENCH_3MM_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(A, m_A, m_ni * m_nk); \ - allocAndInitHipDeviceData(B, m_B, m_nk * m_nj); \ - allocAndInitHipDeviceData(C, m_C, m_nj * m_nm); \ - allocAndInitHipDeviceData(D, m_D, m_nm * m_nl); \ - allocAndInitHipDeviceData(E, m_E, m_ni * m_nj); \ - allocAndInitHipDeviceData(F, m_F, m_nj * m_nl); \ - allocAndInitHipDeviceData(G, m_G, m_ni * m_nl); + allocAndInitHipData(A, m_A, m_ni * m_nk); \ + allocAndInitHipData(B, m_B, m_nk * m_nj); \ + allocAndInitHipData(C, m_C, m_nj * m_nm); \ + allocAndInitHipData(D, m_D, m_nm * m_nl); \ + 
allocAndInitHipData(E, m_E, m_ni * m_nj); \ + allocAndInitHipData(F, m_F, m_nj * m_nl); \ + allocAndInitHipData(G, m_G, m_ni * m_nl); #define POLYBENCH_3MM_TEARDOWN_HIP \ - getHipDeviceData(m_G, G, m_ni * m_nl); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); \ - deallocHipDeviceData(C); \ - deallocHipDeviceData(D); \ - deallocHipDeviceData(E); \ - deallocHipDeviceData(F); \ - deallocHipDeviceData(G); + getHipData(m_G, G, m_ni * m_nl); \ + deallocHipData(A); \ + deallocHipData(B); \ + deallocHipData(C); \ + deallocHipData(D); \ + deallocHipData(E); \ + deallocHipData(F); \ + deallocHipData(G); template < size_t in_block_size, size_t out_block_size > __launch_bounds__(in_block_size*out_block_size) diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index a4f92f213..02a572412 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -22,17 +22,17 @@ namespace polybench { #define POLYBENCH_ADI_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(U, m_U, m_n * m_n); \ - allocAndInitCudaDeviceData(V, m_V, m_n * m_n); \ - allocAndInitCudaDeviceData(P, m_P, m_n * m_n); \ - allocAndInitCudaDeviceData(Q, m_Q, m_n * m_n); + allocAndInitCudaData(U, m_U, m_n * m_n); \ + allocAndInitCudaData(V, m_V, m_n * m_n); \ + allocAndInitCudaData(P, m_P, m_n * m_n); \ + allocAndInitCudaData(Q, m_Q, m_n * m_n); #define POLYBENCH_ADI_TEARDOWN_CUDA \ - getCudaDeviceData(m_U, U, m_n * m_n); \ - deallocCudaDeviceData(U); \ - deallocCudaDeviceData(V); \ - deallocCudaDeviceData(P); \ - deallocCudaDeviceData(Q); + getCudaData(m_U, U, m_n * m_n); \ + deallocCudaData(U); \ + deallocCudaData(V); \ + deallocCudaData(P); \ + deallocCudaData(Q); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index f87ec84f1..2b99237b9 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -22,18 +22,18 @@ namespace polybench { #define POLYBENCH_ADI_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(U, m_U, m_n * m_n); \ - allocAndInitHipDeviceData(V, m_V, m_n * m_n); \ - allocAndInitHipDeviceData(P, m_P, m_n * m_n); \ - allocAndInitHipDeviceData(Q, m_Q, m_n * m_n); + allocAndInitHipData(U, m_U, m_n * m_n); \ + allocAndInitHipData(V, m_V, m_n * m_n); \ + allocAndInitHipData(P, m_P, m_n * m_n); \ + allocAndInitHipData(Q, m_Q, m_n * m_n); #define POLYBENCH_ADI_TEARDOWN_HIP \ - getHipDeviceData(m_U, U, m_n * m_n); \ - deallocHipDeviceData(U); \ - deallocHipDeviceData(V); \ - deallocHipDeviceData(P); \ - deallocHipDeviceData(Q); + getHipData(m_U, U, m_n * m_n); \ + deallocHipData(U); \ + deallocHipData(V); \ + deallocHipData(P); \ + deallocHipData(Q); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 66b0d3218..f82017b06 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -22,18 +22,18 @@ namespace polybench { #define POLYBENCH_ATAX_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(tmp, m_tmp, N); \ - allocAndInitCudaDeviceData(y, m_y, N); \ - allocAndInitCudaDeviceData(x, m_x, N); \ - allocAndInitCudaDeviceData(A, m_A, N * N); + allocAndInitCudaData(tmp, m_tmp, N); \ + allocAndInitCudaData(y, m_y, N); \ + allocAndInitCudaData(x, m_x, N); \ + allocAndInitCudaData(A, m_A, N * N); #define POLYBENCH_ATAX_TEARDOWN_CUDA \ - getCudaDeviceData(m_y, y, N); \ - deallocCudaDeviceData(tmp); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(x); \ - 
deallocCudaDeviceData(A); + getCudaData(m_y, y, N); \ + deallocCudaData(tmp); \ + deallocCudaData(y); \ + deallocCudaData(x); \ + deallocCudaData(A); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 8e1078c89..5c02bc553 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -22,18 +22,18 @@ namespace polybench { #define POLYBENCH_ATAX_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(tmp, m_tmp, N); \ - allocAndInitHipDeviceData(y, m_y, N); \ - allocAndInitHipDeviceData(x, m_x, N); \ - allocAndInitHipDeviceData(A, m_A, N * N); + allocAndInitHipData(tmp, m_tmp, N); \ + allocAndInitHipData(y, m_y, N); \ + allocAndInitHipData(x, m_x, N); \ + allocAndInitHipData(A, m_A, N * N); #define POLYBENCH_ATAX_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, N); \ - deallocHipDeviceData(tmp); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(A); + getHipData(m_y, y, N); \ + deallocHipData(tmp); \ + deallocHipData(y); \ + deallocHipData(x); \ + deallocHipData(A); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 6b4e8c636..5d8001eb9 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -40,17 +40,17 @@ namespace polybench #define POLYBENCH_FDTD_2D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(hz, m_hz, m_nx * m_ny); \ - allocAndInitCudaDeviceData(ex, m_ex, m_nx * m_ny); \ - allocAndInitCudaDeviceData(ey, m_ey, m_nx * m_ny); \ - allocAndInitCudaDeviceData(fict, m_fict, m_tsteps); + allocAndInitCudaData(hz, m_hz, m_nx * m_ny); \ + allocAndInitCudaData(ex, m_ex, m_nx * m_ny); \ + allocAndInitCudaData(ey, m_ey, m_nx * m_ny); \ + allocAndInitCudaData(fict, m_fict, m_tsteps); #define POLYBENCH_FDTD_2D_TEARDOWN_CUDA \ - getCudaDeviceData(m_hz, hz, m_nx * m_ny); \ - deallocCudaDeviceData(ex); \ - deallocCudaDeviceData(ey); \ - deallocCudaDeviceData(fict); + getCudaData(m_hz, hz, m_nx * m_ny); \ + deallocCudaData(ex); \ + deallocCudaData(ey); \ + deallocCudaData(fict); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index 0ca25f1e0..7bc1df7bf 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -39,17 +39,17 @@ namespace polybench static_cast(1)); #define POLYBENCH_FDTD_2D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(hz, m_hz, m_nx * m_ny); \ - allocAndInitHipDeviceData(ex, m_ex, m_nx * m_ny); \ - allocAndInitHipDeviceData(ey, m_ey, m_nx * m_ny); \ - allocAndInitHipDeviceData(fict, m_fict, m_tsteps); + allocAndInitHipData(hz, m_hz, m_nx * m_ny); \ + allocAndInitHipData(ex, m_ex, m_nx * m_ny); \ + allocAndInitHipData(ey, m_ey, m_nx * m_ny); \ + allocAndInitHipData(fict, m_fict, m_tsteps); #define POLYBENCH_FDTD_2D_TEARDOWN_HIP \ - getHipDeviceData(m_hz, hz, m_nx * m_ny); \ - deallocHipDeviceData(ex); \ - deallocHipDeviceData(ey); \ - deallocHipDeviceData(fict); + getHipData(m_hz, hz, m_nx * m_ny); \ + deallocHipData(ex); \ + deallocHipData(ey); \ + deallocHipData(fict); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 30e9a54b4..2b48e03ca 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -38,14 +38,14 @@ namespace polybench #define 
POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(pin, m_pin, m_N * m_N); \ - allocAndInitCudaDeviceData(pout, m_pout, m_N * m_N); + allocAndInitCudaData(pin, m_pin, m_N * m_N); \ + allocAndInitCudaData(pout, m_pout, m_N * m_N); #define POLYBENCH_FLOYD_WARSHALL_TEARDOWN_CUDA \ - getCudaDeviceData(m_pout, pout, m_N * m_N); \ - deallocCudaDeviceData(pin); \ - deallocCudaDeviceData(pout); + getCudaData(m_pout, pout, m_N * m_N); \ + deallocCudaData(pin); \ + deallocCudaData(pout); template < size_t j_block_size, size_t i_block_size > diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 99b8ea303..4f455f3ed 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -38,14 +38,14 @@ namespace polybench #define POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(pin, m_pin, m_N * m_N); \ - allocAndInitHipDeviceData(pout, m_pout, m_N * m_N); + allocAndInitHipData(pin, m_pin, m_N * m_N); \ + allocAndInitHipData(pout, m_pout, m_N * m_N); #define POLYBENCH_FLOYD_WARSHALL_TEARDOWN_HIP \ - getHipDeviceData(m_pout, pout, m_N * m_N); \ - deallocHipDeviceData(pin); \ - deallocHipDeviceData(pout); + getHipData(m_pout, pout, m_N * m_N); \ + deallocHipData(pin); \ + deallocHipData(pout); template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index 5101ebc00..7c2b8fef7 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -40,16 +40,16 @@ namespace polybench #define POLYBENCH_GEMM_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(A, m_A, ni*nk); \ - allocAndInitCudaDeviceData(B, m_B, nk*nj); \ - allocAndInitCudaDeviceData(C, m_C, ni*nj); + allocAndInitCudaData(A, m_A, ni*nk); \ + allocAndInitCudaData(B, m_B, nk*nj); \ + allocAndInitCudaData(C, m_C, ni*nj); #define POLYBENCH_GEMM_TEARDOWN_CUDA \ - getCudaDeviceData(m_C, C, ni*nj); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(C); + getCudaData(m_C, C, ni*nj); \ + deallocCudaData(A); \ + deallocCudaData(B); \ + deallocCudaData(C); template < size_t j_block_size, size_t i_block_size > diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index ed2c7fcff..6d7f53289 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -40,16 +40,16 @@ namespace polybench #define POLYBENCH_GEMM_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(A, m_A, ni*nk); \ - allocAndInitHipDeviceData(B, m_B, nk*nj); \ - allocAndInitHipDeviceData(C, m_C, ni*nj); + allocAndInitHipData(A, m_A, ni*nk); \ + allocAndInitHipData(B, m_B, nk*nj); \ + allocAndInitHipData(C, m_C, ni*nj); #define POLYBENCH_GEMM_TEARDOWN_HIP \ - getHipDeviceData(m_C, C, ni*nj); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); \ - deallocHipDeviceData(C); + getHipData(m_C, C, ni*nj); \ + deallocHipData(A); \ + deallocHipData(B); \ + deallocHipData(C); template < size_t j_block_size, size_t i_block_size > diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 652bbf761..9524f8c3a 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -40,28 +40,28 @@ namespace polybench #define POLYBENCH_GEMVER_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(A, m_A, m_n * m_n); \ - 
allocAndInitCudaDeviceData(u1, m_u1, m_n); \ - allocAndInitCudaDeviceData(v1, m_v1, m_n); \ - allocAndInitCudaDeviceData(u2, m_u2, m_n); \ - allocAndInitCudaDeviceData(v2, m_v2, m_n); \ - allocAndInitCudaDeviceData(w, m_w, m_n); \ - allocAndInitCudaDeviceData(x, m_x, m_n); \ - allocAndInitCudaDeviceData(y, m_y, m_n); \ - allocAndInitCudaDeviceData(z, m_z, m_n); + allocAndInitCudaData(A, m_A, m_n * m_n); \ + allocAndInitCudaData(u1, m_u1, m_n); \ + allocAndInitCudaData(v1, m_v1, m_n); \ + allocAndInitCudaData(u2, m_u2, m_n); \ + allocAndInitCudaData(v2, m_v2, m_n); \ + allocAndInitCudaData(w, m_w, m_n); \ + allocAndInitCudaData(x, m_x, m_n); \ + allocAndInitCudaData(y, m_y, m_n); \ + allocAndInitCudaData(z, m_z, m_n); #define POLYBENCH_GEMVER_TEARDOWN_CUDA \ - getCudaDeviceData(m_w, w, m_n); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(u1); \ - deallocCudaDeviceData(v1); \ - deallocCudaDeviceData(u2); \ - deallocCudaDeviceData(v2); \ - deallocCudaDeviceData(w); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); + getCudaData(m_w, w, m_n); \ + deallocCudaData(A); \ + deallocCudaData(u1); \ + deallocCudaData(v1); \ + deallocCudaData(u2); \ + deallocCudaData(v2); \ + deallocCudaData(w); \ + deallocCudaData(x); \ + deallocCudaData(y); \ + deallocCudaData(z); template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 943958e31..d2573f147 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -40,28 +40,28 @@ namespace polybench #define POLYBENCH_GEMVER_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(A, m_A, m_n * m_n); \ - allocAndInitHipDeviceData(u1, m_u1, m_n); \ - allocAndInitHipDeviceData(v1, m_v1, m_n); \ - allocAndInitHipDeviceData(u2, m_u2, m_n); \ - allocAndInitHipDeviceData(v2, m_v2, m_n); \ - allocAndInitHipDeviceData(w, m_w, m_n); \ - allocAndInitHipDeviceData(x, m_x, m_n); \ - allocAndInitHipDeviceData(y, m_y, m_n); \ - allocAndInitHipDeviceData(z, m_z, m_n); + allocAndInitHipData(A, m_A, m_n * m_n); \ + allocAndInitHipData(u1, m_u1, m_n); \ + allocAndInitHipData(v1, m_v1, m_n); \ + allocAndInitHipData(u2, m_u2, m_n); \ + allocAndInitHipData(v2, m_v2, m_n); \ + allocAndInitHipData(w, m_w, m_n); \ + allocAndInitHipData(x, m_x, m_n); \ + allocAndInitHipData(y, m_y, m_n); \ + allocAndInitHipData(z, m_z, m_n); #define POLYBENCH_GEMVER_TEARDOWN_HIP \ - getHipDeviceData(m_w, w, m_n); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(u1); \ - deallocHipDeviceData(v1); \ - deallocHipDeviceData(u2); \ - deallocHipDeviceData(v2); \ - deallocHipDeviceData(w); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(z); + getHipData(m_w, w, m_n); \ + deallocHipData(A); \ + deallocHipData(u1); \ + deallocHipData(v1); \ + deallocHipData(u2); \ + deallocHipData(v2); \ + deallocHipData(w); \ + deallocHipData(x); \ + deallocHipData(y); \ + deallocHipData(z); template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 535e24efa..05ccc18c4 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -22,18 +22,18 @@ namespace polybench { #define POLYBENCH_GESUMMV_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, N); \ - allocAndInitCudaDeviceData(y, m_y, N); 
\ - allocAndInitCudaDeviceData(A, m_A, N*N); \ - allocAndInitCudaDeviceData(B, m_B, N*N); + allocAndInitCudaData(x, m_x, N); \ + allocAndInitCudaData(y, m_y, N); \ + allocAndInitCudaData(A, m_A, N*N); \ + allocAndInitCudaData(B, m_B, N*N); #define POLYBENCH_GESUMMV_TEARDOWN_CUDA \ - getCudaDeviceData(m_y, y, N); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); + getCudaData(m_y, y, N); \ + deallocCudaData(x); \ + deallocCudaData(y); \ + deallocCudaData(A); \ + deallocCudaData(B); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index ee39f9c6e..4e2db368a 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -22,18 +22,18 @@ namespace polybench { #define POLYBENCH_GESUMMV_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, N); \ - allocAndInitHipDeviceData(y, m_y, N); \ - allocAndInitHipDeviceData(A, m_A, N*N); \ - allocAndInitHipDeviceData(B, m_B, N*N); + allocAndInitHipData(x, m_x, N); \ + allocAndInitHipData(y, m_y, N); \ + allocAndInitHipData(A, m_A, N*N); \ + allocAndInitHipData(B, m_B, N*N); #define POLYBENCH_GESUMMV_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, N); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); + getHipData(m_y, y, N); \ + deallocHipData(x); \ + deallocHipData(y); \ + deallocHipData(A); \ + deallocHipData(B); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index ce6e7769e..7b48b13c8 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -41,16 +41,16 @@ namespace polybench #define POLYBENCH_HEAT_3D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitCudaDeviceData(B, m_Binit, m_N*m_N*m_N); \ + allocAndInitCudaData(A, m_Ainit, m_N*m_N*m_N); \ + allocAndInitCudaData(B, m_Binit, m_N*m_N*m_N); \ static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); #define POLYBENCH_HEAT_3D_TEARDOWN_CUDA \ - getCudaDeviceData(m_A, A, m_N*m_N*m_N); \ - getCudaDeviceData(m_B, B, m_N*m_N*m_N); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); + getCudaData(m_A, A, m_N*m_N*m_N); \ + getCudaData(m_B, B, m_N*m_N*m_N); \ + deallocCudaData(A); \ + deallocCudaData(B); template < size_t k_block_size, size_t j_block_size, size_t i_block_size > diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 00e68aebd..df9e28361 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -41,16 +41,16 @@ namespace polybench #define POLYBENCH_HEAT_3D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitHipDeviceData(B, m_Binit, m_N*m_N*m_N); \ + allocAndInitHipData(A, m_Ainit, m_N*m_N*m_N); \ + allocAndInitHipData(B, m_Binit, m_N*m_N*m_N); \ static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); #define POLYBENCH_HEAT_3D_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, m_N*m_N*m_N); \ - getHipDeviceData(m_B, B, m_N*m_N*m_N); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); + getHipData(m_A, A, m_N*m_N*m_N); \ + getHipData(m_B, B, m_N*m_N*m_N); \ + deallocHipData(A); \ + deallocHipData(B); template < size_t k_block_size, size_t j_block_size, size_t i_block_size > diff --git 
a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index a48e70a84..d6e69fb55 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -22,15 +22,15 @@ namespace polybench { #define POLYBENCH_JACOBI_1D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(A, m_Ainit, m_N); \ - allocAndInitCudaDeviceData(B, m_Binit, m_N); + allocAndInitCudaData(A, m_Ainit, m_N); \ + allocAndInitCudaData(B, m_Binit, m_N); #define POLYBENCH_JACOBI_1D_TEARDOWN_CUDA \ - getCudaDeviceData(m_A, A, m_N); \ - getCudaDeviceData(m_B, B, m_N); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); + getCudaData(m_A, A, m_N); \ + getCudaData(m_B, B, m_N); \ + deallocCudaData(A); \ + deallocCudaData(B); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index a5ff60dfc..e7374e6d2 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -22,15 +22,15 @@ namespace polybench { #define POLYBENCH_JACOBI_1D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(A, m_Ainit, m_N); \ - allocAndInitHipDeviceData(B, m_Binit, m_N); + allocAndInitHipData(A, m_Ainit, m_N); \ + allocAndInitHipData(B, m_Binit, m_N); #define POLYBENCH_JACOBI_1D_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, m_N); \ - getHipDeviceData(m_B, B, m_N); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); + getHipData(m_A, A, m_N); \ + getHipData(m_B, B, m_N); \ + deallocHipData(A); \ + deallocHipData(B); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index ca6a485ec..1a0ab19f7 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -40,15 +40,15 @@ namespace polybench #define POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(A, m_Ainit, m_N*m_N); \ - allocAndInitCudaDeviceData(B, m_Binit, m_N*m_N); + allocAndInitCudaData(A, m_Ainit, m_N*m_N); \ + allocAndInitCudaData(B, m_Binit, m_N*m_N); #define POLYBENCH_JACOBI_2D_TEARDOWN_CUDA \ - getCudaDeviceData(m_A, A, m_N*m_N); \ - getCudaDeviceData(m_B, B, m_N*m_N); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); + getCudaData(m_A, A, m_N*m_N); \ + getCudaData(m_B, B, m_N*m_N); \ + deallocCudaData(A); \ + deallocCudaData(B); template < size_t j_block_size, size_t i_block_size > diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index bf03f9b86..f5e7255ac 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -40,15 +40,15 @@ namespace polybench #define POLYBENCH_JACOBI_2D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(A, m_Ainit, m_N*m_N); \ - allocAndInitHipDeviceData(B, m_Binit, m_N*m_N); + allocAndInitHipData(A, m_Ainit, m_N*m_N); \ + allocAndInitHipData(B, m_Binit, m_N*m_N); #define POLYBENCH_JACOBI_2D_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, m_N*m_N); \ - getHipDeviceData(m_B, B, m_N*m_N); \ - deallocHipDeviceData(A); \ - deallocHipDeviceData(B); + getHipData(m_A, A, m_N*m_N); \ + getHipData(m_B, B, m_N*m_N); \ + deallocHipData(A); \ + deallocHipData(B); template < size_t j_block_size, size_t i_block_size > diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 2795cadbb..b7761e59b 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -22,21 +22,21 @@ namespace 
polybench { #define POLYBENCH_MVT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x1, m_x1, N); \ - allocAndInitCudaDeviceData(x2, m_x2, N); \ - allocAndInitCudaDeviceData(y1, m_y1, N); \ - allocAndInitCudaDeviceData(y2, m_y2, N); \ - allocAndInitCudaDeviceData(A, m_A, N * N); + allocAndInitCudaData(x1, m_x1, N); \ + allocAndInitCudaData(x2, m_x2, N); \ + allocAndInitCudaData(y1, m_y1, N); \ + allocAndInitCudaData(y2, m_y2, N); \ + allocAndInitCudaData(A, m_A, N * N); #define POLYBENCH_MVT_TEARDOWN_CUDA \ - getCudaDeviceData(m_x1, x1, N); \ - getCudaDeviceData(m_x2, x2, N); \ - deallocCudaDeviceData(x1); \ - deallocCudaDeviceData(x2); \ - deallocCudaDeviceData(y1); \ - deallocCudaDeviceData(y2); \ - deallocCudaDeviceData(A); + getCudaData(m_x1, x1, N); \ + getCudaData(m_x2, x2, N); \ + deallocCudaData(x1); \ + deallocCudaData(x2); \ + deallocCudaData(y1); \ + deallocCudaData(y2); \ + deallocCudaData(A); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 176c41710..efd21d65e 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -22,21 +22,21 @@ namespace polybench { #define POLYBENCH_MVT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x1, m_x1, N); \ - allocAndInitHipDeviceData(x2, m_x2, N); \ - allocAndInitHipDeviceData(y1, m_y1, N); \ - allocAndInitHipDeviceData(y2, m_y2, N); \ - allocAndInitHipDeviceData(A, m_A, N * N); + allocAndInitHipData(x1, m_x1, N); \ + allocAndInitHipData(x2, m_x2, N); \ + allocAndInitHipData(y1, m_y1, N); \ + allocAndInitHipData(y2, m_y2, N); \ + allocAndInitHipData(A, m_A, N * N); #define POLYBENCH_MVT_TEARDOWN_HIP \ - getHipDeviceData(m_x1, x1, N); \ - getHipDeviceData(m_x2, x2, N); \ - deallocHipDeviceData(x1); \ - deallocHipDeviceData(x2); \ - deallocHipDeviceData(y1); \ - deallocHipDeviceData(y2); \ - deallocHipDeviceData(A); + getHipData(m_x1, x1, N); \ + getHipData(m_x2, x2, N); \ + deallocHipData(x1); \ + deallocHipData(x2); \ + deallocHipData(y1); \ + deallocHipData(y2); \ + deallocHipData(A); template < size_t block_size > From 57533047b4a7a0cc1cf902b119aac88bf615dfec Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Jun 2022 13:41:41 -0700 Subject: [PATCH 008/137] Use runtime allocators in lcals --- src/lcals/DIFF_PREDICT-Cuda.cpp | 10 +++---- src/lcals/DIFF_PREDICT-Hip.cpp | 10 +++---- src/lcals/EOS-Cuda.cpp | 18 ++++++------ src/lcals/EOS-Hip.cpp | 18 ++++++------ src/lcals/FIRST_DIFF-Cuda.cpp | 10 +++---- src/lcals/FIRST_DIFF-Hip.cpp | 10 +++---- src/lcals/FIRST_MIN-Cuda.cpp | 4 +-- src/lcals/FIRST_MIN-Hip.cpp | 4 +-- src/lcals/FIRST_SUM-Cuda.cpp | 10 +++---- src/lcals/FIRST_SUM-Hip.cpp | 10 +++---- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 18 ++++++------ src/lcals/GEN_LIN_RECUR-Hip.cpp | 18 ++++++------ src/lcals/HYDRO_1D-Cuda.cpp | 14 +++++----- src/lcals/HYDRO_1D-Hip.cpp | 14 +++++----- src/lcals/HYDRO_2D-Cuda.cpp | 48 ++++++++++++++++---------------- src/lcals/HYDRO_2D-Hip.cpp | 48 ++++++++++++++++---------------- src/lcals/INT_PREDICT-Cuda.cpp | 6 ++-- src/lcals/INT_PREDICT-Hip.cpp | 6 ++-- src/lcals/PLANCKIAN-Cuda.cpp | 22 +++++++-------- src/lcals/PLANCKIAN-Hip.cpp | 22 +++++++-------- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 18 ++++++------ src/lcals/TRIDIAG_ELIM-Hip.cpp | 18 ++++++------ 22 files changed, 178 insertions(+), 178 deletions(-) diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 0ef286507..ba053974d 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -22,13 
+22,13 @@ namespace lcals { #define DIFF_PREDICT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(px, m_px, m_array_length); \ - allocAndInitCudaDeviceData(cx, m_cx, m_array_length); + allocAndInitCudaData(px, m_px, m_array_length); \ + allocAndInitCudaData(cx, m_cx, m_array_length); #define DIFF_PREDICT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_px, px, m_array_length); \ - deallocCudaDeviceData(px); \ - deallocCudaDeviceData(cx); + getCudaData(m_px, px, m_array_length); \ + deallocCudaData(px); \ + deallocCudaData(cx); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 4f076157d..14332f026 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -22,13 +22,13 @@ namespace lcals { #define DIFF_PREDICT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(px, m_px, m_array_length); \ - allocAndInitHipDeviceData(cx, m_cx, m_array_length); + allocAndInitHipData(px, m_px, m_array_length); \ + allocAndInitHipData(cx, m_cx, m_array_length); #define DIFF_PREDICT_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_px, px, m_array_length); \ - deallocHipDeviceData(px); \ - deallocHipDeviceData(cx); + getHipData(m_px, px, m_array_length); \ + deallocHipData(px); \ + deallocHipData(cx); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index f99828d46..29fef6981 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -22,17 +22,17 @@ namespace lcals { #define EOS_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_array_length); \ - allocAndInitCudaDeviceData(y, m_y, m_array_length); \ - allocAndInitCudaDeviceData(z, m_z, m_array_length); \ - allocAndInitCudaDeviceData(u, m_u, m_array_length); + allocAndInitCudaData(x, m_x, m_array_length); \ + allocAndInitCudaData(y, m_y, m_array_length); \ + allocAndInitCudaData(z, m_z, m_array_length); \ + allocAndInitCudaData(u, m_u, m_array_length); #define EOS_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, m_array_length); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); \ - deallocCudaDeviceData(u); + getCudaData(m_x, x, m_array_length); \ + deallocCudaData(x); \ + deallocCudaData(y); \ + deallocCudaData(z); \ + deallocCudaData(u); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 0912ce5ce..3b68cb691 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -22,17 +22,17 @@ namespace lcals { #define EOS_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, m_array_length); \ - allocAndInitHipDeviceData(y, m_y, m_array_length); \ - allocAndInitHipDeviceData(z, m_z, m_array_length); \ - allocAndInitHipDeviceData(u, m_u, m_array_length); + allocAndInitHipData(x, m_x, m_array_length); \ + allocAndInitHipData(y, m_y, m_array_length); \ + allocAndInitHipData(z, m_z, m_array_length); \ + allocAndInitHipData(u, m_u, m_array_length); #define EOS_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, m_array_length); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(z); \ - deallocHipDeviceData(u); + getHipData(m_x, x, m_array_length); \ + deallocHipData(x); \ + deallocHipData(y); \ + deallocHipData(z); \ + deallocHipData(u); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index b195c0d46..4cf6b658c 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ 
b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -22,13 +22,13 @@ namespace lcals { #define FIRST_DIFF_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_N); \ - allocAndInitCudaDeviceData(y, m_y, m_N); + allocAndInitCudaData(x, m_x, m_N); \ + allocAndInitCudaData(y, m_y, m_N); #define FIRST_DIFF_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, m_N); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); + getCudaData(m_x, x, m_N); \ + deallocCudaData(x); \ + deallocCudaData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 382fa107e..fa53b02bb 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -22,13 +22,13 @@ namespace lcals { #define FIRST_DIFF_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, m_N); \ - allocAndInitHipDeviceData(y, m_y, m_N); + allocAndInitHipData(x, m_x, m_N); \ + allocAndInitHipData(y, m_y, m_N); #define FIRST_DIFF_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, m_N); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); + getHipData(m_x, x, m_N); \ + deallocHipData(x); \ + deallocHipData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index f98982860..b879a9a65 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -22,10 +22,10 @@ namespace lcals { #define FIRST_MIN_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_N); + allocAndInitCudaData(x, m_x, m_N); #define FIRST_MIN_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(x); + deallocCudaData(x); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index e2b2763cf..4ad8878a3 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -22,10 +22,10 @@ namespace lcals { #define FIRST_MIN_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, m_N); + allocAndInitHipData(x, m_x, m_N); #define FIRST_MIN_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(x); + deallocHipData(x); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 85db3d39d..ad5ff10d4 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -22,13 +22,13 @@ namespace lcals { #define FIRST_SUM_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_N); \ - allocAndInitCudaDeviceData(y, m_y, m_N); + allocAndInitCudaData(x, m_x, m_N); \ + allocAndInitCudaData(y, m_y, m_N); #define FIRST_SUM_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, m_N); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); + getCudaData(m_x, x, m_N); \ + deallocCudaData(x); \ + deallocCudaData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 1a03619e5..7b287555f 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -22,13 +22,13 @@ namespace lcals { #define FIRST_SUM_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, m_N); \ - allocAndInitHipDeviceData(y, m_y, m_N); + allocAndInitHipData(x, m_x, m_N); \ + allocAndInitHipData(y, m_y, m_N); #define FIRST_SUM_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, m_N); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); + getHipData(m_x, x, m_N); \ + deallocHipData(x); \ + deallocHipData(y); template < size_t block_size > __launch_bounds__(block_size) diff --git 
a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 76f840294..6cd45b9d6 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -22,17 +22,17 @@ namespace lcals { #define GEN_LIN_RECUR_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(b5, m_b5, m_N); \ - allocAndInitCudaDeviceData(stb5, m_stb5, m_N); \ - allocAndInitCudaDeviceData(sa, m_sa, m_N); \ - allocAndInitCudaDeviceData(sb, m_sb, m_N); + allocAndInitCudaData(b5, m_b5, m_N); \ + allocAndInitCudaData(stb5, m_stb5, m_N); \ + allocAndInitCudaData(sa, m_sa, m_N); \ + allocAndInitCudaData(sb, m_sb, m_N); #define GEN_LIN_RECUR_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_b5, b5, m_N); \ - deallocCudaDeviceData(b5); \ - deallocCudaDeviceData(stb5); \ - deallocCudaDeviceData(sa); \ - deallocCudaDeviceData(sb); + getCudaData(m_b5, b5, m_N); \ + deallocCudaData(b5); \ + deallocCudaData(stb5); \ + deallocCudaData(sa); \ + deallocCudaData(sb); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 65fef4e8b..94c25e1ba 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -22,17 +22,17 @@ namespace lcals { #define GEN_LIN_RECUR_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(b5, m_b5, m_N); \ - allocAndInitHipDeviceData(stb5, m_stb5, m_N); \ - allocAndInitHipDeviceData(sa, m_sa, m_N); \ - allocAndInitHipDeviceData(sb, m_sb, m_N); + allocAndInitHipData(b5, m_b5, m_N); \ + allocAndInitHipData(stb5, m_stb5, m_N); \ + allocAndInitHipData(sa, m_sa, m_N); \ + allocAndInitHipData(sb, m_sb, m_N); #define GEN_LIN_RECUR_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_b5, b5, m_N); \ - deallocHipDeviceData(b5); \ - deallocHipDeviceData(stb5); \ - deallocHipDeviceData(sa); \ - deallocHipDeviceData(sb); + getHipData(m_b5, b5, m_N); \ + deallocHipData(b5); \ + deallocHipData(stb5); \ + deallocHipData(sa); \ + deallocHipData(sb); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 901ca786b..811aa1ee4 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -22,15 +22,15 @@ namespace lcals { #define HYDRO_1D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_array_length); \ - allocAndInitCudaDeviceData(y, m_y, m_array_length); \ - allocAndInitCudaDeviceData(z, m_z, m_array_length); + allocAndInitCudaData(x, m_x, m_array_length); \ + allocAndInitCudaData(y, m_y, m_array_length); \ + allocAndInitCudaData(z, m_z, m_array_length); #define HYDRO_1D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, m_array_length); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); \ + getCudaData(m_x, x, m_array_length); \ + deallocCudaData(x); \ + deallocCudaData(y); \ + deallocCudaData(z); \ template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index d39ec0f7e..091dd978f 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -22,15 +22,15 @@ namespace lcals { #define HYDRO_1D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, m_array_length); \ - allocAndInitHipDeviceData(y, m_y, m_array_length); \ - allocAndInitHipDeviceData(z, m_z, m_array_length); + allocAndInitHipData(x, m_x, m_array_length); \ + allocAndInitHipData(y, m_y, m_array_length); \ + allocAndInitHipData(z, m_z, m_array_length); #define HYDRO_1D_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, m_array_length); 
\ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(z); \ + getHipData(m_x, x, m_array_length); \ + deallocHipData(x); \ + deallocHipData(y); \ + deallocHipData(z); \ template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 21c320a60..147cad90b 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -42,33 +42,33 @@ namespace lcals #define HYDRO_2D_DATA_SETUP_CUDA \ \ - allocAndInitCudaDeviceData(zadat, m_za, m_array_length); \ - allocAndInitCudaDeviceData(zbdat, m_zb, m_array_length); \ - allocAndInitCudaDeviceData(zmdat, m_zm, m_array_length); \ - allocAndInitCudaDeviceData(zpdat, m_zp, m_array_length); \ - allocAndInitCudaDeviceData(zqdat, m_zq, m_array_length); \ - allocAndInitCudaDeviceData(zrdat, m_zr, m_array_length); \ - allocAndInitCudaDeviceData(zudat, m_zu, m_array_length); \ - allocAndInitCudaDeviceData(zvdat, m_zv, m_array_length); \ - allocAndInitCudaDeviceData(zzdat, m_zz, m_array_length); \ - allocAndInitCudaDeviceData(zroutdat, m_zrout, m_array_length); \ - allocAndInitCudaDeviceData(zzoutdat, m_zzout, m_array_length); + allocAndInitCudaData(zadat, m_za, m_array_length); \ + allocAndInitCudaData(zbdat, m_zb, m_array_length); \ + allocAndInitCudaData(zmdat, m_zm, m_array_length); \ + allocAndInitCudaData(zpdat, m_zp, m_array_length); \ + allocAndInitCudaData(zqdat, m_zq, m_array_length); \ + allocAndInitCudaData(zrdat, m_zr, m_array_length); \ + allocAndInitCudaData(zudat, m_zu, m_array_length); \ + allocAndInitCudaData(zvdat, m_zv, m_array_length); \ + allocAndInitCudaData(zzdat, m_zz, m_array_length); \ + allocAndInitCudaData(zroutdat, m_zrout, m_array_length); \ + allocAndInitCudaData(zzoutdat, m_zzout, m_array_length); #define HYDRO_2D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_zrout, zroutdat, m_array_length); \ - getCudaDeviceData(m_zzout, zzoutdat, m_array_length); \ - deallocCudaDeviceData(zadat); \ - deallocCudaDeviceData(zbdat); \ - deallocCudaDeviceData(zmdat); \ - deallocCudaDeviceData(zpdat); \ - deallocCudaDeviceData(zqdat); \ - deallocCudaDeviceData(zrdat); \ - deallocCudaDeviceData(zudat); \ - deallocCudaDeviceData(zvdat); \ - deallocCudaDeviceData(zzdat); \ - deallocCudaDeviceData(zroutdat); \ - deallocCudaDeviceData(zzoutdat); + getCudaData(m_zrout, zroutdat, m_array_length); \ + getCudaData(m_zzout, zzoutdat, m_array_length); \ + deallocCudaData(zadat); \ + deallocCudaData(zbdat); \ + deallocCudaData(zmdat); \ + deallocCudaData(zpdat); \ + deallocCudaData(zqdat); \ + deallocCudaData(zrdat); \ + deallocCudaData(zudat); \ + deallocCudaData(zvdat); \ + deallocCudaData(zzdat); \ + deallocCudaData(zroutdat); \ + deallocCudaData(zzoutdat); template < size_t j_block_size, size_t k_block_size > __launch_bounds__(j_block_size*k_block_size) diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 3180c5c10..c8e306bef 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -41,33 +41,33 @@ namespace lcals #define HYDRO_2D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(zadat, m_za, m_array_length); \ - allocAndInitHipDeviceData(zbdat, m_zb, m_array_length); \ - allocAndInitHipDeviceData(zmdat, m_zm, m_array_length); \ - allocAndInitHipDeviceData(zpdat, m_zp, m_array_length); \ - allocAndInitHipDeviceData(zqdat, m_zq, m_array_length); \ - allocAndInitHipDeviceData(zrdat, m_zr, m_array_length); \ - allocAndInitHipDeviceData(zudat, m_zu, m_array_length); \ - allocAndInitHipDeviceData(zvdat, m_zv, 
m_array_length); \ - allocAndInitHipDeviceData(zzdat, m_zz, m_array_length); \ - allocAndInitHipDeviceData(zroutdat, m_zrout, m_array_length); \ - allocAndInitHipDeviceData(zzoutdat, m_zzout, m_array_length); + allocAndInitHipData(zadat, m_za, m_array_length); \ + allocAndInitHipData(zbdat, m_zb, m_array_length); \ + allocAndInitHipData(zmdat, m_zm, m_array_length); \ + allocAndInitHipData(zpdat, m_zp, m_array_length); \ + allocAndInitHipData(zqdat, m_zq, m_array_length); \ + allocAndInitHipData(zrdat, m_zr, m_array_length); \ + allocAndInitHipData(zudat, m_zu, m_array_length); \ + allocAndInitHipData(zvdat, m_zv, m_array_length); \ + allocAndInitHipData(zzdat, m_zz, m_array_length); \ + allocAndInitHipData(zroutdat, m_zrout, m_array_length); \ + allocAndInitHipData(zzoutdat, m_zzout, m_array_length); #define HYDRO_2D_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_zrout, zroutdat, m_array_length); \ - getHipDeviceData(m_zzout, zzoutdat, m_array_length); \ - deallocHipDeviceData(zadat); \ - deallocHipDeviceData(zbdat); \ - deallocHipDeviceData(zmdat); \ - deallocHipDeviceData(zpdat); \ - deallocHipDeviceData(zqdat); \ - deallocHipDeviceData(zrdat); \ - deallocHipDeviceData(zudat); \ - deallocHipDeviceData(zvdat); \ - deallocHipDeviceData(zzdat); \ - deallocHipDeviceData(zroutdat); \ - deallocHipDeviceData(zzoutdat); + getHipData(m_zrout, zroutdat, m_array_length); \ + getHipData(m_zzout, zzoutdat, m_array_length); \ + deallocHipData(zadat); \ + deallocHipData(zbdat); \ + deallocHipData(zmdat); \ + deallocHipData(zpdat); \ + deallocHipData(zqdat); \ + deallocHipData(zrdat); \ + deallocHipData(zudat); \ + deallocHipData(zvdat); \ + deallocHipData(zzdat); \ + deallocHipData(zroutdat); \ + deallocHipData(zzoutdat); template < size_t j_block_size, size_t k_block_size > __launch_bounds__(j_block_size*k_block_size) diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index aaed2219e..ef1681011 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -22,11 +22,11 @@ namespace lcals { #define INT_PREDICT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(px, m_px, m_array_length); + allocAndInitCudaData(px, m_px, m_array_length); #define INT_PREDICT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_px, px, m_array_length); \ - deallocCudaDeviceData(px); + getCudaData(m_px, px, m_array_length); \ + deallocCudaData(px); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 22914bff3..e55b16266 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -22,11 +22,11 @@ namespace lcals { #define INT_PREDICT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(px, m_px, m_array_length); + allocAndInitHipData(px, m_px, m_array_length); #define INT_PREDICT_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_px, px, m_array_length); \ - deallocHipDeviceData(px); + getHipData(m_px, px, m_array_length); \ + deallocHipData(px); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index c831aab2e..f2c59fffe 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -23,19 +23,19 @@ namespace lcals { #define PLANCKIAN_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); \ - allocAndInitCudaDeviceData(u, m_u, iend); \ - allocAndInitCudaDeviceData(v, m_v, iend); \ - allocAndInitCudaDeviceData(w, m_w, iend); + allocAndInitCudaData(x, 
m_x, iend); \ + allocAndInitCudaData(y, m_y, iend); \ + allocAndInitCudaData(u, m_u, iend); \ + allocAndInitCudaData(v, m_v, iend); \ + allocAndInitCudaData(w, m_w, iend); #define PLANCKIAN_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_w, w, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(u); \ - deallocCudaDeviceData(v); \ - deallocCudaDeviceData(w); + getCudaData(m_w, w, iend); \ + deallocCudaData(x); \ + deallocCudaData(y); \ + deallocCudaData(u); \ + deallocCudaData(v); \ + deallocCudaData(w); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index 1b8c6050b..67d209d31 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -23,19 +23,19 @@ namespace lcals { #define PLANCKIAN_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(y, m_y, iend); \ - allocAndInitHipDeviceData(u, m_u, iend); \ - allocAndInitHipDeviceData(v, m_v, iend); \ - allocAndInitHipDeviceData(w, m_w, iend); + allocAndInitHipData(x, m_x, iend); \ + allocAndInitHipData(y, m_y, iend); \ + allocAndInitHipData(u, m_u, iend); \ + allocAndInitHipData(v, m_v, iend); \ + allocAndInitHipData(w, m_w, iend); #define PLANCKIAN_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_w, w, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(u); \ - deallocHipDeviceData(v); \ - deallocHipDeviceData(w); + getHipData(m_w, w, iend); \ + deallocHipData(x); \ + deallocHipData(y); \ + deallocHipData(u); \ + deallocHipData(v); \ + deallocHipData(w); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 654d027a9..efe1c8b70 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -22,17 +22,17 @@ namespace lcals { #define TRIDIAG_ELIM_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(xout, m_xout, m_N); \ - allocAndInitCudaDeviceData(xin, m_xin, m_N); \ - allocAndInitCudaDeviceData(y, m_y, m_N); \ - allocAndInitCudaDeviceData(z, m_z, m_N); + allocAndInitCudaData(xout, m_xout, m_N); \ + allocAndInitCudaData(xin, m_xin, m_N); \ + allocAndInitCudaData(y, m_y, m_N); \ + allocAndInitCudaData(z, m_z, m_N); #define TRIDIAG_ELIM_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_xout, xout, m_N); \ - deallocCudaDeviceData(xout); \ - deallocCudaDeviceData(xin); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); + getCudaData(m_xout, xout, m_N); \ + deallocCudaData(xout); \ + deallocCudaData(xin); \ + deallocCudaData(y); \ + deallocCudaData(z); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index dab19cc07..cb4877846 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -22,17 +22,17 @@ namespace lcals { #define TRIDIAG_ELIM_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(xout, m_xout, m_N); \ - allocAndInitHipDeviceData(xin, m_xin, m_N); \ - allocAndInitHipDeviceData(y, m_y, m_N); \ - allocAndInitHipDeviceData(z, m_z, m_N); + allocAndInitHipData(xout, m_xout, m_N); \ + allocAndInitHipData(xin, m_xin, m_N); \ + allocAndInitHipData(y, m_y, m_N); \ + allocAndInitHipData(z, m_z, m_N); #define TRIDIAG_ELIM_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_xout, xout, m_N); \ - deallocHipDeviceData(xout); \ - deallocHipDeviceData(xin); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(z); + getHipData(m_xout, xout, m_N); \ + 
deallocHipData(xout); \ + deallocHipData(xin); \ + deallocHipData(y); \ + deallocHipData(z); template < size_t block_size > __launch_bounds__(block_size) From 5066eb09ba419dfc838a488f93afec30a956629b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Jun 2022 13:42:47 -0700 Subject: [PATCH 009/137] use runtime allocators in apps --- src/apps/CONVECTION3DPA-Cuda.cpp | 26 +++++----- src/apps/CONVECTION3DPA-Hip.cpp | 26 +++++----- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 26 +++++----- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 26 +++++----- src/apps/DIFFUSION3DPA-Cuda.cpp | 22 ++++----- src/apps/DIFFUSION3DPA-Hip.cpp | 22 ++++----- src/apps/ENERGY-Cuda.cpp | 64 ++++++++++++------------- src/apps/ENERGY-Hip.cpp | 64 ++++++++++++------------- src/apps/FIR-Cuda.cpp | 24 +++++----- src/apps/FIR-Hip.cpp | 24 +++++----- src/apps/HALOEXCHANGE-Cuda.cpp | 18 +++---- src/apps/HALOEXCHANGE-Hip.cpp | 18 +++---- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 18 +++---- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 18 +++---- src/apps/LTIMES-Cuda.cpp | 14 +++--- src/apps/LTIMES-Hip.cpp | 14 +++--- src/apps/LTIMES_NOVIEW-Cuda.cpp | 14 +++--- src/apps/LTIMES_NOVIEW-Hip.cpp | 14 +++--- src/apps/MASS3DPA-Cuda.cpp | 22 ++++----- src/apps/MASS3DPA-Hip.cpp | 22 ++++----- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 14 +++--- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 14 +++--- src/apps/PRESSURE-Cuda.cpp | 22 ++++----- src/apps/PRESSURE-Hip.cpp | 22 ++++----- src/apps/VOL3D-Cuda.cpp | 18 +++---- src/apps/VOL3D-Hip.cpp | 18 +++---- 26 files changed, 302 insertions(+), 302 deletions(-) diff --git a/src/apps/CONVECTION3DPA-Cuda.cpp b/src/apps/CONVECTION3DPA-Cuda.cpp index bd9d7c32b..947eda54c 100644 --- a/src/apps/CONVECTION3DPA-Cuda.cpp +++ b/src/apps/CONVECTION3DPA-Cuda.cpp @@ -20,21 +20,21 @@ namespace rajaperf { namespace apps { #define CONVECTION3DPA_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(Basis, m_B, CPA_Q1D *CPA_D1D); \ - allocAndInitCudaDeviceData(tBasis, m_Bt, CPA_Q1D *CPA_D1D); \ - allocAndInitCudaDeviceData(dBasis, m_G, CPA_Q1D *CPA_D1D); \ - allocAndInitCudaDeviceData(D, m_D, CPA_Q1D *CPA_Q1D *CPA_Q1D *CPA_VDIM *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ - allocAndInitCudaDeviceData(Y, m_Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); + allocAndInitCudaData(Basis, m_B, CPA_Q1D *CPA_D1D); \ + allocAndInitCudaData(tBasis, m_Bt, CPA_Q1D *CPA_D1D); \ + allocAndInitCudaData(dBasis, m_G, CPA_Q1D *CPA_D1D); \ + allocAndInitCudaData(D, m_D, CPA_Q1D *CPA_Q1D *CPA_Q1D *CPA_VDIM *m_NE); \ + allocAndInitCudaData(X, m_X, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ + allocAndInitCudaData(Y, m_Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); #define CONVECTION3DPA_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_Y, Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ - deallocCudaDeviceData(Basis); \ - deallocCudaDeviceData(tBasis); \ - deallocCudaDeviceData(dBasis); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(X); \ - deallocCudaDeviceData(Y); + getCudaData(m_Y, Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ + deallocCudaData(Basis); \ + deallocCudaData(tBasis); \ + deallocCudaData(dBasis); \ + deallocCudaData(D); \ + deallocCudaData(X); \ + deallocCudaData(Y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/CONVECTION3DPA-Hip.cpp b/src/apps/CONVECTION3DPA-Hip.cpp index 81c683bcf..5f9531b60 100644 --- a/src/apps/CONVECTION3DPA-Hip.cpp +++ b/src/apps/CONVECTION3DPA-Hip.cpp @@ -20,21 +20,21 @@ namespace rajaperf { namespace apps { #define CONVECTION3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(Basis, m_B, 
CPA_Q1D *CPA_D1D); \ - allocAndInitHipDeviceData(tBasis, m_Bt, CPA_Q1D *CPA_D1D); \ - allocAndInitHipDeviceData(dBasis, m_G, CPA_Q1D *CPA_D1D); \ - allocAndInitHipDeviceData(D, m_D, CPA_Q1D *CPA_Q1D *CPA_Q1D *CPA_VDIM *m_NE); \ - allocAndInitHipDeviceData(X, m_X, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ - allocAndInitHipDeviceData(Y, m_Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); + allocAndInitHipData(Basis, m_B, CPA_Q1D *CPA_D1D); \ + allocAndInitHipData(tBasis, m_Bt, CPA_Q1D *CPA_D1D); \ + allocAndInitHipData(dBasis, m_G, CPA_Q1D *CPA_D1D); \ + allocAndInitHipData(D, m_D, CPA_Q1D *CPA_Q1D *CPA_Q1D *CPA_VDIM *m_NE); \ + allocAndInitHipData(X, m_X, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ + allocAndInitHipData(Y, m_Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); #define CONVECTION3DPA_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_Y, Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ - deallocHipDeviceData(Basis); \ - deallocHipDeviceData(tBasis); \ - deallocHipDeviceData(dBasis); \ - deallocHipDeviceData(D); \ - deallocHipDeviceData(X); \ - deallocHipDeviceData(Y); + getHipData(m_Y, Y, CPA_D1D *CPA_D1D *CPA_D1D *m_NE); \ + deallocHipData(Basis); \ + deallocHipData(tBasis); \ + deallocHipData(dBasis); \ + deallocHipData(D); \ + deallocHipData(X); \ + deallocHipData(Y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index c19d0770e..bf9420130 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -26,21 +26,21 @@ namespace apps { #define DEL_DOT_VEC_2D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_array_length); \ - allocAndInitCudaDeviceData(y, m_y, m_array_length); \ - allocAndInitCudaDeviceData(xdot, m_xdot, m_array_length); \ - allocAndInitCudaDeviceData(ydot, m_ydot, m_array_length); \ - allocAndInitCudaDeviceData(div, m_div, m_array_length); \ - allocAndInitCudaDeviceData(real_zones, m_domain->real_zones, iend); + allocAndInitCudaData(x, m_x, m_array_length); \ + allocAndInitCudaData(y, m_y, m_array_length); \ + allocAndInitCudaData(xdot, m_xdot, m_array_length); \ + allocAndInitCudaData(ydot, m_ydot, m_array_length); \ + allocAndInitCudaData(div, m_div, m_array_length); \ + allocAndInitCudaData(real_zones, m_domain->real_zones, iend); #define DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_div, div, m_array_length); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(xdot); \ - deallocCudaDeviceData(ydot); \ - deallocCudaDeviceData(div); \ - deallocCudaDeviceData(real_zones); + getCudaData(m_div, div, m_array_length); \ + deallocCudaData(x); \ + deallocCudaData(y); \ + deallocCudaData(xdot); \ + deallocCudaData(ydot); \ + deallocCudaData(div); \ + deallocCudaData(real_zones); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 782e4099c..1a63bbc93 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -26,21 +26,21 @@ namespace apps { #define DEL_DOT_VEC_2D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, m_array_length); \ - allocAndInitHipDeviceData(y, m_y, m_array_length); \ - allocAndInitHipDeviceData(xdot, m_xdot, m_array_length); \ - allocAndInitHipDeviceData(ydot, m_ydot, m_array_length); \ - allocAndInitHipDeviceData(div, m_div, m_array_length); \ - allocAndInitHipDeviceData(real_zones, m_domain->real_zones, iend); + allocAndInitHipData(x, m_x, m_array_length); \ + allocAndInitHipData(y, m_y, m_array_length); \ + 
allocAndInitHipData(xdot, m_xdot, m_array_length); \ + allocAndInitHipData(ydot, m_ydot, m_array_length); \ + allocAndInitHipData(div, m_div, m_array_length); \ + allocAndInitHipData(real_zones, m_domain->real_zones, iend); #define DEL_DOT_VEC_2D_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_div, div, m_array_length); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(xdot); \ - deallocHipDeviceData(ydot); \ - deallocHipDeviceData(div); \ - deallocHipDeviceData(real_zones); + getHipData(m_div, div, m_array_length); \ + deallocHipData(x); \ + deallocHipData(y); \ + deallocHipData(xdot); \ + deallocHipData(ydot); \ + deallocHipData(div); \ + deallocHipData(real_zones); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index e52dfed0a..66e7d2ac9 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -23,19 +23,19 @@ namespace rajaperf { namespace apps { #define DIFFUSION3DPA_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ - allocAndInitCudaDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ - allocAndInitCudaDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ - allocAndInitCudaDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); + allocAndInitCudaData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitCudaData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + allocAndInitCudaData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); #define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ - deallocCudaDeviceData(Basis); \ - deallocCudaDeviceData(dBasis); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(X); \ - deallocCudaDeviceData(Y); + getCudaData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocCudaData(Basis); \ + deallocCudaData(dBasis); \ + deallocCudaData(D); \ + deallocCudaData(X); \ + deallocCudaData(Y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 58a40a77e..c8fbda2f0 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -23,19 +23,19 @@ namespace rajaperf { namespace apps { #define DIFFUSION3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ - allocAndInitHipDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ - allocAndInitHipDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ - allocAndInitHipDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ - allocAndInitHipDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); + allocAndInitHipData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitHipData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitHipData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitHipData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + allocAndInitHipData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); #define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ - deallocHipDeviceData(Basis); \ - deallocHipDeviceData(dBasis); \ - deallocHipDeviceData(D); \ - deallocHipDeviceData(X); \ - deallocHipDeviceData(Y); + getHipData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocHipData(Basis); \ + deallocHipData(dBasis); \ + deallocHipData(D); \ + 
deallocHipData(X); \ + deallocHipData(Y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index a99a928e3..59b406605 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -22,40 +22,40 @@ namespace apps { #define ENERGY_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(e_new, m_e_new, iend); \ - allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ - allocAndInitCudaDeviceData(delvc, m_delvc, iend); \ - allocAndInitCudaDeviceData(p_new, m_p_new, iend); \ - allocAndInitCudaDeviceData(p_old, m_p_old, iend); \ - allocAndInitCudaDeviceData(q_new, m_q_new, iend); \ - allocAndInitCudaDeviceData(q_old, m_q_old, iend); \ - allocAndInitCudaDeviceData(work, m_work, iend); \ - allocAndInitCudaDeviceData(compHalfStep, m_compHalfStep, iend); \ - allocAndInitCudaDeviceData(pHalfStep, m_pHalfStep, iend); \ - allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ - allocAndInitCudaDeviceData(pbvc, m_pbvc, iend); \ - allocAndInitCudaDeviceData(ql_old, m_ql_old, iend); \ - allocAndInitCudaDeviceData(qq_old, m_qq_old, iend); \ - allocAndInitCudaDeviceData(vnewc, m_vnewc, iend); + allocAndInitCudaData(e_new, m_e_new, iend); \ + allocAndInitCudaData(e_old, m_e_old, iend); \ + allocAndInitCudaData(delvc, m_delvc, iend); \ + allocAndInitCudaData(p_new, m_p_new, iend); \ + allocAndInitCudaData(p_old, m_p_old, iend); \ + allocAndInitCudaData(q_new, m_q_new, iend); \ + allocAndInitCudaData(q_old, m_q_old, iend); \ + allocAndInitCudaData(work, m_work, iend); \ + allocAndInitCudaData(compHalfStep, m_compHalfStep, iend); \ + allocAndInitCudaData(pHalfStep, m_pHalfStep, iend); \ + allocAndInitCudaData(bvc, m_bvc, iend); \ + allocAndInitCudaData(pbvc, m_pbvc, iend); \ + allocAndInitCudaData(ql_old, m_ql_old, iend); \ + allocAndInitCudaData(qq_old, m_qq_old, iend); \ + allocAndInitCudaData(vnewc, m_vnewc, iend); #define ENERGY_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_e_new, e_new, iend); \ - getCudaDeviceData(m_q_new, q_new, iend); \ - deallocCudaDeviceData(e_new); \ - deallocCudaDeviceData(e_old); \ - deallocCudaDeviceData(delvc); \ - deallocCudaDeviceData(p_new); \ - deallocCudaDeviceData(p_old); \ - deallocCudaDeviceData(q_new); \ - deallocCudaDeviceData(q_old); \ - deallocCudaDeviceData(work); \ - deallocCudaDeviceData(compHalfStep); \ - deallocCudaDeviceData(pHalfStep); \ - deallocCudaDeviceData(bvc); \ - deallocCudaDeviceData(pbvc); \ - deallocCudaDeviceData(ql_old); \ - deallocCudaDeviceData(qq_old); \ - deallocCudaDeviceData(vnewc); + getCudaData(m_e_new, e_new, iend); \ + getCudaData(m_q_new, q_new, iend); \ + deallocCudaData(e_new); \ + deallocCudaData(e_old); \ + deallocCudaData(delvc); \ + deallocCudaData(p_new); \ + deallocCudaData(p_old); \ + deallocCudaData(q_new); \ + deallocCudaData(q_old); \ + deallocCudaData(work); \ + deallocCudaData(compHalfStep); \ + deallocCudaData(pHalfStep); \ + deallocCudaData(bvc); \ + deallocCudaData(pbvc); \ + deallocCudaData(ql_old); \ + deallocCudaData(qq_old); \ + deallocCudaData(vnewc); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index e7e882cff..72fa7a737 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -22,40 +22,40 @@ namespace apps { #define ENERGY_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(e_new, m_e_new, iend); \ - allocAndInitHipDeviceData(e_old, m_e_old, iend); \ - allocAndInitHipDeviceData(delvc, m_delvc, iend); \ - allocAndInitHipDeviceData(p_new, m_p_new, iend); \ - 
allocAndInitHipDeviceData(p_old, m_p_old, iend); \ - allocAndInitHipDeviceData(q_new, m_q_new, iend); \ - allocAndInitHipDeviceData(q_old, m_q_old, iend); \ - allocAndInitHipDeviceData(work, m_work, iend); \ - allocAndInitHipDeviceData(compHalfStep, m_compHalfStep, iend); \ - allocAndInitHipDeviceData(pHalfStep, m_pHalfStep, iend); \ - allocAndInitHipDeviceData(bvc, m_bvc, iend); \ - allocAndInitHipDeviceData(pbvc, m_pbvc, iend); \ - allocAndInitHipDeviceData(ql_old, m_ql_old, iend); \ - allocAndInitHipDeviceData(qq_old, m_qq_old, iend); \ - allocAndInitHipDeviceData(vnewc, m_vnewc, iend); + allocAndInitHipData(e_new, m_e_new, iend); \ + allocAndInitHipData(e_old, m_e_old, iend); \ + allocAndInitHipData(delvc, m_delvc, iend); \ + allocAndInitHipData(p_new, m_p_new, iend); \ + allocAndInitHipData(p_old, m_p_old, iend); \ + allocAndInitHipData(q_new, m_q_new, iend); \ + allocAndInitHipData(q_old, m_q_old, iend); \ + allocAndInitHipData(work, m_work, iend); \ + allocAndInitHipData(compHalfStep, m_compHalfStep, iend); \ + allocAndInitHipData(pHalfStep, m_pHalfStep, iend); \ + allocAndInitHipData(bvc, m_bvc, iend); \ + allocAndInitHipData(pbvc, m_pbvc, iend); \ + allocAndInitHipData(ql_old, m_ql_old, iend); \ + allocAndInitHipData(qq_old, m_qq_old, iend); \ + allocAndInitHipData(vnewc, m_vnewc, iend); #define ENERGY_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_e_new, e_new, iend); \ - getHipDeviceData(m_q_new, q_new, iend); \ - deallocHipDeviceData(e_new); \ - deallocHipDeviceData(e_old); \ - deallocHipDeviceData(delvc); \ - deallocHipDeviceData(p_new); \ - deallocHipDeviceData(p_old); \ - deallocHipDeviceData(q_new); \ - deallocHipDeviceData(q_old); \ - deallocHipDeviceData(work); \ - deallocHipDeviceData(compHalfStep); \ - deallocHipDeviceData(pHalfStep); \ - deallocHipDeviceData(bvc); \ - deallocHipDeviceData(pbvc); \ - deallocHipDeviceData(ql_old); \ - deallocHipDeviceData(qq_old); \ - deallocHipDeviceData(vnewc); + getHipData(m_e_new, e_new, iend); \ + getHipData(m_q_new, q_new, iend); \ + deallocHipData(e_new); \ + deallocHipData(e_old); \ + deallocHipData(delvc); \ + deallocHipData(p_new); \ + deallocHipData(p_old); \ + deallocHipData(q_new); \ + deallocHipData(q_old); \ + deallocHipData(work); \ + deallocHipData(compHalfStep); \ + deallocHipData(pHalfStep); \ + deallocHipData(bvc); \ + deallocHipData(pbvc); \ + deallocHipData(ql_old); \ + deallocHipData(qq_old); \ + deallocHipData(vnewc); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 4dea7c82e..946844d97 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -30,15 +30,15 @@ namespace apps __constant__ Real_type coeff[FIR_COEFFLEN]; #define FIR_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(in, m_in, getActualProblemSize()); \ - allocAndInitCudaDeviceData(out, m_out, getActualProblemSize()); \ + allocAndInitCudaData(in, m_in, getActualProblemSize()); \ + allocAndInitCudaData(out, m_out, getActualProblemSize()); \ cudaMemcpyToSymbol(coeff, coeff_array, FIR_COEFFLEN * sizeof(Real_type)); #define FIR_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out, out, getActualProblemSize()); \ - deallocCudaDeviceData(in); \ - deallocCudaDeviceData(out); + getCudaData(m_out, out, getActualProblemSize()); \ + deallocCudaData(in); \ + deallocCudaData(out); template < size_t block_size > __launch_bounds__(block_size) @@ -57,17 +57,17 @@ __global__ void fir(Real_ptr out, Real_ptr in, #define FIR_DATA_SETUP_CUDA \ Real_ptr coeff; \ \ - allocAndInitCudaDeviceData(in, m_in, 
getActualProblemSize()); \ - allocAndInitCudaDeviceData(out, m_out, getActualProblemSize()); \ + allocAndInitCudaData(in, m_in, getActualProblemSize()); \ + allocAndInitCudaData(out, m_out, getActualProblemSize()); \ Real_ptr tcoeff = &coeff_array[0]; \ - allocAndInitCudaDeviceData(coeff, tcoeff, FIR_COEFFLEN); + allocAndInitCudaData(coeff, tcoeff, FIR_COEFFLEN); #define FIR_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out, out, getActualProblemSize()); \ - deallocCudaDeviceData(in); \ - deallocCudaDeviceData(out); \ - deallocCudaDeviceData(coeff); + getCudaData(m_out, out, getActualProblemSize()); \ + deallocCudaData(in); \ + deallocCudaData(out); \ + deallocCudaData(coeff); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 42e3503e0..014c082f1 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -30,15 +30,15 @@ namespace apps __constant__ Real_type coeff[FIR_COEFFLEN]; #define FIR_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(in, m_in, getActualProblemSize()); \ - allocAndInitHipDeviceData(out, m_out, getActualProblemSize()); \ + allocAndInitHipData(in, m_in, getActualProblemSize()); \ + allocAndInitHipData(out, m_out, getActualProblemSize()); \ hipMemcpyToSymbol(HIP_SYMBOL(coeff), coeff_array, FIR_COEFFLEN * sizeof(Real_type), 0, hipMemcpyHostToDevice); #define FIR_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_out, out, getActualProblemSize()); \ - deallocHipDeviceData(in); \ - deallocHipDeviceData(out); + getHipData(m_out, out, getActualProblemSize()); \ + deallocHipData(in); \ + deallocHipData(out); template < size_t block_size > __launch_bounds__(block_size) @@ -57,17 +57,17 @@ __global__ void fir(Real_ptr out, Real_ptr in, #define FIR_DATA_SETUP_HIP \ Real_ptr coeff; \ \ - allocAndInitHipDeviceData(in, m_in, getActualProblemSize()); \ - allocAndInitHipDeviceData(out, m_out, getActualProblemSize()); \ + allocAndInitHipData(in, m_in, getActualProblemSize()); \ + allocAndInitHipData(out, m_out, getActualProblemSize()); \ Real_ptr tcoeff = &coeff_array[0]; \ - allocAndInitHipDeviceData(coeff, tcoeff, FIR_COEFFLEN); + allocAndInitHipData(coeff, tcoeff, FIR_COEFFLEN); #define FIR_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_out, out, getActualProblemSize()); \ - deallocHipDeviceData(in); \ - deallocHipDeviceData(out); \ - deallocHipDeviceData(coeff); + getHipData(m_out, out, getActualProblemSize()); \ + deallocHipData(in); \ + deallocHipData(out); \ + deallocHipData(coeff); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index cab4f911d..21211776b 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -23,23 +23,23 @@ namespace apps #define HALOEXCHANGE_DATA_SETUP_CUDA \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - allocAndInitCudaDeviceData(vars[v], m_vars[v], m_var_size); \ + allocAndInitCudaData(vars[v], m_vars[v], m_var_size); \ } \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - allocAndInitCudaDeviceData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ - allocAndInitCudaDeviceData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ - allocAndInitCudaDeviceData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ + allocAndInitCudaData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ + allocAndInitCudaData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ + 
allocAndInitCudaData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ } #define HALOEXCHANGE_DATA_TEARDOWN_CUDA \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - deallocCudaDeviceData(unpack_index_lists[l]); \ - deallocCudaDeviceData(pack_index_lists[l]); \ - deallocCudaDeviceData(buffers[l]); \ + deallocCudaData(unpack_index_lists[l]); \ + deallocCudaData(pack_index_lists[l]); \ + deallocCudaData(buffers[l]); \ } \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - getCudaDeviceData(m_vars[v], vars[v], m_var_size); \ - deallocCudaDeviceData(vars[v]); \ + getCudaData(m_vars[v], vars[v], m_var_size); \ + deallocCudaData(vars[v]); \ } template < size_t block_size > diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index 4070edc72..b2464467f 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -23,23 +23,23 @@ namespace apps #define HALOEXCHANGE_DATA_SETUP_HIP \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - allocAndInitHipDeviceData(vars[v], m_vars[v], m_var_size); \ + allocAndInitHipData(vars[v], m_vars[v], m_var_size); \ } \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - allocAndInitHipDeviceData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ - allocAndInitHipDeviceData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ - allocAndInitHipDeviceData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ + allocAndInitHipData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ + allocAndInitHipData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ + allocAndInitHipData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ } #define HALOEXCHANGE_DATA_TEARDOWN_HIP \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - deallocHipDeviceData(unpack_index_lists[l]); \ - deallocHipDeviceData(pack_index_lists[l]); \ - deallocHipDeviceData(buffers[l]); \ + deallocHipData(unpack_index_lists[l]); \ + deallocHipData(pack_index_lists[l]); \ + deallocHipData(buffers[l]); \ } \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - getHipDeviceData(m_vars[v], vars[v], m_var_size); \ - deallocHipDeviceData(vars[v]); \ + getHipData(m_vars[v], vars[v], m_var_size); \ + deallocHipData(vars[v]); \ } template < size_t block_size > diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 52d1fca5c..d22531364 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -23,24 +23,24 @@ namespace apps #define HALOEXCHANGE_FUSED_DATA_SETUP_CUDA \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - allocAndInitCudaDeviceData(vars[v], m_vars[v], m_var_size); \ + allocAndInitCudaData(vars[v], m_vars[v], m_var_size); \ } \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - allocAndInitCudaDeviceData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ - allocAndInitCudaDeviceData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ - allocAndInitCudaDeviceData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ + allocAndInitCudaData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ + allocAndInitCudaData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ + allocAndInitCudaData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ } #define 
HALOEXCHANGE_FUSED_DATA_TEARDOWN_CUDA \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - deallocCudaDeviceData(unpack_index_lists[l]); \ - deallocCudaDeviceData(pack_index_lists[l]); \ - deallocCudaDeviceData(buffers[l]); \ + deallocCudaData(unpack_index_lists[l]); \ + deallocCudaData(pack_index_lists[l]); \ + deallocCudaData(buffers[l]); \ } \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - getCudaDeviceData(m_vars[v], vars[v], m_var_size); \ - deallocCudaDeviceData(vars[v]); \ + getCudaData(m_vars[v], vars[v], m_var_size); \ + deallocCudaData(vars[v]); \ } #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 7d64d86f7..a7a922b02 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -23,23 +23,23 @@ namespace apps #define HALOEXCHANGE_FUSED_DATA_SETUP_HIP \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - allocAndInitHipDeviceData(vars[v], m_vars[v], m_var_size); \ + allocAndInitHipData(vars[v], m_vars[v], m_var_size); \ } \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - allocAndInitHipDeviceData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ - allocAndInitHipDeviceData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ - allocAndInitHipDeviceData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ + allocAndInitHipData(buffers[l], m_buffers[l], m_num_vars*m_pack_index_list_lengths[l]); \ + allocAndInitHipData(pack_index_lists[l], m_pack_index_lists[l], m_pack_index_list_lengths[l]); \ + allocAndInitHipData(unpack_index_lists[l], m_unpack_index_lists[l], m_unpack_index_list_lengths[l]); \ } #define HALOEXCHANGE_FUSED_DATA_TEARDOWN_HIP \ for (Index_type l = 0; l < num_neighbors; ++l) { \ - deallocHipDeviceData(unpack_index_lists[l]); \ - deallocHipDeviceData(pack_index_lists[l]); \ - deallocHipDeviceData(buffers[l]); \ + deallocHipData(unpack_index_lists[l]); \ + deallocHipData(pack_index_lists[l]); \ + deallocHipData(buffers[l]); \ } \ for (Index_type v = 0; v < m_num_vars; ++v) { \ - getHipDeviceData(m_vars[v], vars[v], m_var_size); \ - deallocHipDeviceData(vars[v]); \ + getHipData(m_vars[v], vars[v], m_var_size); \ + deallocHipData(vars[v]); \ } #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 4e38f769b..9537d46d8 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -42,15 +42,15 @@ namespace apps #define LTIMES_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(phidat, m_phidat, m_philen); \ - allocAndInitCudaDeviceData(elldat, m_elldat, m_elllen); \ - allocAndInitCudaDeviceData(psidat, m_psidat, m_psilen); + allocAndInitCudaData(phidat, m_phidat, m_philen); \ + allocAndInitCudaData(elldat, m_elldat, m_elllen); \ + allocAndInitCudaData(psidat, m_psidat, m_psilen); #define LTIMES_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_phidat, phidat, m_philen); \ - deallocCudaDeviceData(phidat); \ - deallocCudaDeviceData(elldat); \ - deallocCudaDeviceData(psidat); + getCudaData(m_phidat, phidat, m_philen); \ + deallocCudaData(phidat); \ + deallocCudaData(elldat); \ + deallocCudaData(psidat); template < size_t m_block_size, size_t g_block_size, size_t z_block_size > __launch_bounds__(m_block_size*g_block_size*z_block_size) diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index a78394d25..b01064650 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -41,15 
+41,15 @@ namespace apps #define LTIMES_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(phidat, m_phidat, m_philen); \ - allocAndInitHipDeviceData(elldat, m_elldat, m_elllen); \ - allocAndInitHipDeviceData(psidat, m_psidat, m_psilen); + allocAndInitHipData(phidat, m_phidat, m_philen); \ + allocAndInitHipData(elldat, m_elldat, m_elllen); \ + allocAndInitHipData(psidat, m_psidat, m_psilen); #define LTIMES_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_phidat, phidat, m_philen); \ - deallocHipDeviceData(phidat); \ - deallocHipDeviceData(elldat); \ - deallocHipDeviceData(psidat); + getHipData(m_phidat, phidat, m_philen); \ + deallocHipData(phidat); \ + deallocHipData(elldat); \ + deallocHipData(psidat); template < size_t m_block_size, size_t g_block_size, size_t z_block_size > __launch_bounds__(m_block_size*g_block_size*z_block_size) diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index b363f0049..2701f0357 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -42,15 +42,15 @@ namespace apps #define LTIMES_NOVIEW_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(phidat, m_phidat, m_philen); \ - allocAndInitCudaDeviceData(elldat, m_elldat, m_elllen); \ - allocAndInitCudaDeviceData(psidat, m_psidat, m_psilen); + allocAndInitCudaData(phidat, m_phidat, m_philen); \ + allocAndInitCudaData(elldat, m_elldat, m_elllen); \ + allocAndInitCudaData(psidat, m_psidat, m_psilen); #define LTIMES_NOVIEW_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_phidat, phidat, m_philen); \ - deallocCudaDeviceData(phidat); \ - deallocCudaDeviceData(elldat); \ - deallocCudaDeviceData(psidat); + getCudaData(m_phidat, phidat, m_philen); \ + deallocCudaData(phidat); \ + deallocCudaData(elldat); \ + deallocCudaData(psidat); template < size_t m_block_size, size_t g_block_size, size_t z_block_size > __launch_bounds__(m_block_size*g_block_size*z_block_size) diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 47a8c8956..0138cf09d 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -41,15 +41,15 @@ namespace apps #define LTIMES_NOVIEW_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(phidat, m_phidat, m_philen); \ - allocAndInitHipDeviceData(elldat, m_elldat, m_elllen); \ - allocAndInitHipDeviceData(psidat, m_psidat, m_psilen); + allocAndInitHipData(phidat, m_phidat, m_philen); \ + allocAndInitHipData(elldat, m_elldat, m_elllen); \ + allocAndInitHipData(psidat, m_psidat, m_psilen); #define LTIMES_NOVIEW_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_phidat, phidat, m_philen); \ - deallocHipDeviceData(phidat); \ - deallocHipDeviceData(elldat); \ - deallocHipDeviceData(psidat); + getHipData(m_phidat, phidat, m_philen); \ + deallocHipData(phidat); \ + deallocHipData(elldat); \ + deallocHipData(psidat); template < size_t m_block_size, size_t g_block_size, size_t z_block_size > __launch_bounds__(m_block_size*g_block_size*z_block_size) diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 1562ef11d..43943472d 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -23,19 +23,19 @@ namespace rajaperf { namespace apps { #define MASS3DPA_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(B, m_B, MPA_Q1D *MPA_D1D); \ - allocAndInitCudaDeviceData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ - allocAndInitCudaDeviceData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ - allocAndInitCudaDeviceData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); + 
allocAndInitCudaData(B, m_B, MPA_Q1D *MPA_D1D); \ + allocAndInitCudaData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ + allocAndInitCudaData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ + allocAndInitCudaData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + allocAndInitCudaData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); #define MASS3DPA_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(Bt); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(X); \ - deallocCudaDeviceData(Y); + getCudaData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + deallocCudaData(B); \ + deallocCudaData(Bt); \ + deallocCudaData(D); \ + deallocCudaData(X); \ + deallocCudaData(Y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 804a858fa..51188e320 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -23,19 +23,19 @@ namespace rajaperf { namespace apps { #define MASS3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(B, m_B, MPA_Q1D *MPA_D1D); \ - allocAndInitHipDeviceData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ - allocAndInitHipDeviceData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ - allocAndInitHipDeviceData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ - allocAndInitHipDeviceData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); + allocAndInitHipData(B, m_B, MPA_Q1D *MPA_D1D); \ + allocAndInitHipData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ + allocAndInitHipData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ + allocAndInitHipData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + allocAndInitHipData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); #define MASS3DPA_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ - deallocHipDeviceData(B); \ - deallocHipDeviceData(Bt); \ - deallocHipDeviceData(D); \ - deallocHipDeviceData(X); \ - deallocHipDeviceData(Y); + getHipData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + deallocHipData(B); \ + deallocHipData(Bt); \ + deallocHipData(D); \ + deallocHipData(X); \ + deallocHipData(Y); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index ef1013067..f3cd6b18d 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -24,15 +24,15 @@ namespace apps { #define NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_nodal_array_length); \ - allocAndInitCudaDeviceData(vol, m_vol, m_zonal_array_length); \ - allocAndInitCudaDeviceData(real_zones, m_domain->real_zones, iend); + allocAndInitCudaData(x, m_x, m_nodal_array_length); \ + allocAndInitCudaData(vol, m_vol, m_zonal_array_length); \ + allocAndInitCudaData(real_zones, m_domain->real_zones, iend); #define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, m_nodal_array_length); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(vol); \ - deallocCudaDeviceData(real_zones); + getCudaData(m_x, x, m_nodal_array_length); \ + deallocCudaData(x); \ + deallocCudaData(vol); \ + deallocCudaData(real_zones); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index 38ea0a2ca..627a335c0 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -24,15 +24,15 @@ namespace apps { #define NODAL_ACCUMULATION_3D_DATA_SETUP_HIP \ - 
allocAndInitHipDeviceData(x, m_x, m_nodal_array_length); \ - allocAndInitHipDeviceData(vol, m_vol, m_zonal_array_length); \ - allocAndInitHipDeviceData(real_zones, m_domain->real_zones, iend); + allocAndInitHipData(x, m_x, m_nodal_array_length); \ + allocAndInitHipData(vol, m_vol, m_zonal_array_length); \ + allocAndInitHipData(real_zones, m_domain->real_zones, iend); #define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x, x, m_nodal_array_length); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(vol); \ - deallocHipDeviceData(real_zones); + getHipData(m_x, x, m_nodal_array_length); \ + deallocHipData(x); \ + deallocHipData(vol); \ + deallocHipData(real_zones); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 14ad2ae34..5d6264863 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -22,19 +22,19 @@ namespace apps { #define PRESSURE_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(compression, m_compression, iend); \ - allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ - allocAndInitCudaDeviceData(p_new, m_p_new, iend); \ - allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ - allocAndInitCudaDeviceData(vnewc, m_vnewc, iend); + allocAndInitCudaData(compression, m_compression, iend); \ + allocAndInitCudaData(bvc, m_bvc, iend); \ + allocAndInitCudaData(p_new, m_p_new, iend); \ + allocAndInitCudaData(e_old, m_e_old, iend); \ + allocAndInitCudaData(vnewc, m_vnewc, iend); #define PRESSURE_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_p_new, p_new, iend); \ - deallocCudaDeviceData(compression); \ - deallocCudaDeviceData(bvc); \ - deallocCudaDeviceData(p_new); \ - deallocCudaDeviceData(e_old); \ - deallocCudaDeviceData(vnewc); + getCudaData(m_p_new, p_new, iend); \ + deallocCudaData(compression); \ + deallocCudaData(bvc); \ + deallocCudaData(p_new); \ + deallocCudaData(e_old); \ + deallocCudaData(vnewc); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 03c9e04fb..b8a895597 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -22,19 +22,19 @@ namespace apps { #define PRESSURE_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(compression, m_compression, iend); \ - allocAndInitHipDeviceData(bvc, m_bvc, iend); \ - allocAndInitHipDeviceData(p_new, m_p_new, iend); \ - allocAndInitHipDeviceData(e_old, m_e_old, iend); \ - allocAndInitHipDeviceData(vnewc, m_vnewc, iend); + allocAndInitHipData(compression, m_compression, iend); \ + allocAndInitHipData(bvc, m_bvc, iend); \ + allocAndInitHipData(p_new, m_p_new, iend); \ + allocAndInitHipData(e_old, m_e_old, iend); \ + allocAndInitHipData(vnewc, m_vnewc, iend); #define PRESSURE_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_p_new, p_new, iend); \ - deallocHipDeviceData(compression); \ - deallocHipDeviceData(bvc); \ - deallocHipDeviceData(p_new); \ - deallocHipDeviceData(e_old); \ - deallocHipDeviceData(vnewc); + getHipData(m_p_new, p_new, iend); \ + deallocHipData(compression); \ + deallocHipData(bvc); \ + deallocHipData(p_new); \ + deallocHipData(e_old); \ + deallocHipData(vnewc); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 3f65c1b8a..6204c0043 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -24,17 +24,17 @@ namespace apps { #define VOL3D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, m_array_length); \ - 
allocAndInitCudaDeviceData(y, m_y, m_array_length); \ - allocAndInitCudaDeviceData(z, m_z, m_array_length); \ - allocAndInitCudaDeviceData(vol, m_vol, m_array_length); + allocAndInitCudaData(x, m_x, m_array_length); \ + allocAndInitCudaData(y, m_y, m_array_length); \ + allocAndInitCudaData(z, m_z, m_array_length); \ + allocAndInitCudaData(vol, m_vol, m_array_length); #define VOL3D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_vol, vol, m_array_length); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); \ - deallocCudaDeviceData(vol); + getCudaData(m_vol, vol, m_array_length); \ + deallocCudaData(x); \ + deallocCudaData(y); \ + deallocCudaData(z); \ + deallocCudaData(vol); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 70f121e09..9eb208285 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -24,17 +24,17 @@ namespace apps { #define VOL3D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, m_array_length); \ - allocAndInitHipDeviceData(y, m_y, m_array_length); \ - allocAndInitHipDeviceData(z, m_z, m_array_length); \ - allocAndInitHipDeviceData(vol, m_vol, m_array_length); + allocAndInitHipData(x, m_x, m_array_length); \ + allocAndInitHipData(y, m_y, m_array_length); \ + allocAndInitHipData(z, m_z, m_array_length); \ + allocAndInitHipData(vol, m_vol, m_array_length); #define VOL3D_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_vol, vol, m_array_length); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); \ - deallocHipDeviceData(z); \ - deallocHipDeviceData(vol); + getHipData(m_vol, vol, m_array_length); \ + deallocHipData(x); \ + deallocHipData(y); \ + deallocHipData(z); \ + deallocHipData(vol); template < size_t block_size > __launch_bounds__(block_size) From 56ba55f6afa8dfad6bd9cbbe91dca2bf4243768a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 29 Jun 2022 08:06:24 -0700 Subject: [PATCH 010/137] Store gpu DataSpaces in RunParams --- src/common/CudaDataUtils.hpp | 14 ++++----- src/common/HipDataUtils.hpp | 14 ++++----- src/common/KernelBase.hpp | 60 ++++++++++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 4 --- src/common/RAJAPerfSuite.hpp | 4 --- src/common/RunParams.hpp | 7 +++++ 6 files changed, 81 insertions(+), 22 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 6873630c8..bdeb2de4a 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -222,7 +222,7 @@ void deallocCudaPinnedData(T& pptr) * and of proper size for copy operation to succeed. */ template -void initCudaData(T& cptr, const T hptr, int len) +void initCudaData(CudaData, T& cptr, const T hptr, int len) { cudaErrchk( cudaMemcpy( cptr, hptr, len * sizeof(typename std::remove_pointer::type), @@ -235,7 +235,7 @@ void initCudaData(T& cptr, const T hptr, int len) * \brief Allocate CUDA data array (cptr). */ template -void allocCudaData(T& cptr, int len) +void allocCudaData(CudaData cudaDataSpace, T& cptr, int len) { switch (cudaDataSpace) { case CudaData::Host: @@ -266,17 +266,17 @@ void allocCudaData(T& cptr, int len) * data to CUDA array. */ template -void allocAndInitCudaData(T& cptr, const T hptr, int len) +void allocAndInitCudaData(CudaData cudaDataSpace, T& cptr, const T hptr, int len) { - allocCudaData(cptr, len); - initCudaData(cptr, hptr, len); + allocCudaData(cudaDataSpace, cptr, len); + initCudaData(cudaDataSpace, cptr, hptr, len); } /*! * \brief Free Cuda data array. 
*/ template -void deallocCudaData(T& cptr) +void deallocCudaData(CudaData cudaDataSpace, T& cptr) { switch (cudaDataSpace) { case CudaData::Host: @@ -309,7 +309,7 @@ void deallocCudaData(T& cptr) * and of propoer size for copy operation to succeed. */ template -void getCudaData(T& hptr, const T cptr, int len) +void getCudaData(CudaData, T& hptr, const T cptr, int len) { cudaErrchk( cudaMemcpy( hptr, cptr, len * sizeof(typename std::remove_pointer::type), diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index da29f5375..1ee6bb230 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -209,7 +209,7 @@ void deallocHipPinnedData(T& pptr) * and of proper size for copy operation to succeed. */ template -void initHipData(T& cptr, const T hptr, int len) +void initHipData(HipData, T& cptr, const T hptr, int len) { hipErrchk( hipMemcpy( cptr, hptr, len * sizeof(typename std::remove_pointer::type), @@ -222,7 +222,7 @@ void initHipData(T& cptr, const T hptr, int len) * \brief Allocate HIP data array (cptr). */ template -void allocHipData(T& cptr, int len) +void allocHipData(HipData hipDataSpace, T& cptr, int len) { switch (hipDataSpace) { case HipData::Host: @@ -253,17 +253,17 @@ void allocHipData(T& cptr, int len) * data to HIP array. */ template -void allocAndInitHipData(T& cptr, const T hptr, int len) +void allocAndInitHipData(HipData hipDataSpace, T& cptr, const T hptr, int len) { - allocHipData(cptr, len); - initHipData(cptr, hptr, len); + allocHipData(hipDataSpace, cptr, len); + initHipData(hipDataSpace, cptr, hptr, len); } /*! * \brief Free Hip data array. */ template -void deallocHipData(T& cptr) +void deallocHipData(HipData hipDataSpace, T& cptr) { switch (hipDataSpace) { case HipData::Host: @@ -296,7 +296,7 @@ void deallocHipData(T& cptr) * and of propoer size for copy operation to succeed. 
*/ template -void getHipData(T& hptr, const T cptr, int len) +void getHipData(HipData, T& hptr, const T cptr, int len) { hipErrchk( hipMemcpy( hptr, cptr, len * sizeof(typename std::remove_pointer::type), diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index ed3429643..526aaca2b 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -14,6 +14,8 @@ #include "common/DataUtils.hpp" #include "common/RunParams.hpp" #include "common/GPUUtils.hpp" +#include "common/CudaDataUtils.hpp" +#include "common/HipDataUtils.hpp" #include "RAJA/util/Timer.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) @@ -188,6 +190,64 @@ class KernelBase #endif } +#if defined(RAJA_ENABLE_CUDA) + CudaData getCudaDataSpace() const { return run_params.getCudaDataSpace(); } + template + void initCudaData(T& cptr, const T hptr, int len) + { + rajaperf::initCudaData(getCudaDataSpace(), cptr, hptr, len); + } + template + void allocCudaData(T& cptr, int len) + { + rajaperf::allocCudaData(getCudaDataSpace(), cptr, len); + } + template + void allocAndInitCudaData(T& cptr, const T hptr, int len) + { + rajaperf::allocAndInitCudaData(getCudaDataSpace(), cptr, hptr, len); + } + template + void deallocCudaData(T& cptr) + { + rajaperf::deallocCudaData(getCudaDataSpace(), cptr); + } + template + void getCudaData(T& hptr, const T cptr, int len) + { + rajaperf::getCudaData(getCudaDataSpace(), hptr, cptr, len); + } +#endif + +#if defined(RAJA_ENABLE_HIP) + HipData getHipDataSpace() const { return run_params.getHipDataSpace(); } + template + void initHipData(T& cptr, const T hptr, int len) + { + rajaperf::initHipData(getHipDataSpace(), cptr, hptr, len); + } + template + void allocHipData(T& cptr, int len) + { + rajaperf::allocHipData(getHipDataSpace(), cptr, len); + } + template + void allocAndInitHipData(T& cptr, const T hptr, int len) + { + rajaperf::allocAndInitHipData(getHipDataSpace(), cptr, hptr, len); + } + template + void deallocHipData(T& cptr) + { + rajaperf::deallocHipData(getHipDataSpace(), cptr); + } + template + void getHipData(T& hptr, const T cptr, int len) + { + rajaperf::getHipData(getHipDataSpace(), hptr, cptr, len); + } +#endif + void startTimer() { synchronize(); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 0a48c8c39..31c021f1d 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -342,8 +342,6 @@ static const std::string CudaDataNames [] = }; // END VariantNames -CudaData cudaDataSpace = CudaData::Device; - /*! ******************************************************************************* @@ -369,8 +367,6 @@ static const std::string HipDataNames [] = }; // END VariantNames -HipData hipDataSpace = HipData::Device; - /* ******************************************************************************* diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index ca0734cb9..0e37cad2a 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -257,8 +257,6 @@ enum struct CudaData { }; -extern CudaData cudaDataSpace; - /*! ******************************************************************************* @@ -284,8 +282,6 @@ enum struct HipData { }; -extern HipData hipDataSpace; - /*! 
******************************************************************************* diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 10a95d8ed..df1da66a4 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -136,6 +136,10 @@ class RunParams { return false; } + CudaData getCudaDataSpace() const { return cudaDataSpace; } + + HipData getHipDataSpace() const { return hipDataSpace; } + double getPFTolerance() const { return pf_tol; } int getCheckRunReps() const { return checkrun_reps; } @@ -244,6 +248,9 @@ class RunParams { std::string reference_variant; /*!< Name of reference variant for speedup calculations */ + CudaData cudaDataSpace = CudaData::Device; + HipData hipDataSpace = HipData::Device; + // // Arrays to hold input strings for valid/invalid input. Helpful for // debugging command line args. From bbeb1200ed5c0c63fd5a8ab1211435887bfe108a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 29 Jun 2022 08:11:30 -0700 Subject: [PATCH 011/137] rename gpu Data to DataSpace --- src/common/CudaDataUtils.hpp | 26 +++++++++++++------------- src/common/HipDataUtils.hpp | 26 +++++++++++++------------- src/common/KernelBase.hpp | 4 ++-- src/common/RAJAPerfSuite.cpp | 16 ++++++++-------- src/common/RAJAPerfSuite.hpp | 12 ++++++------ src/common/RunParams.cpp | 32 ++++++++++++++++---------------- src/common/RunParams.hpp | 12 ++++++------ 7 files changed, 64 insertions(+), 64 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index bdeb2de4a..86eac6d19 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -222,7 +222,7 @@ void deallocCudaPinnedData(T& pptr) * and of proper size for copy operation to succeed. */ template -void initCudaData(CudaData, T& cptr, const T hptr, int len) +void initCudaData(CudaDataSpace, T& cptr, const T hptr, int len) { cudaErrchk( cudaMemcpy( cptr, hptr, len * sizeof(typename std::remove_pointer::type), @@ -235,22 +235,22 @@ void initCudaData(CudaData, T& cptr, const T hptr, int len) * \brief Allocate CUDA data array (cptr). */ template -void allocCudaData(CudaData cudaDataSpace, T& cptr, int len) +void allocCudaData(CudaDataSpace cudaDataSpace, T& cptr, int len) { switch (cudaDataSpace) { - case CudaData::Host: + case CudaDataSpace::Host: { allocData(cptr, len); } break; - case CudaData::Pinned: + case CudaDataSpace::Pinned: { allocCudaPinnedData(cptr, len); } break; - case CudaData::Managed: + case CudaDataSpace::Managed: { allocCudaManagedData(cptr, len); } break; - case CudaData::Device: + case CudaDataSpace::Device: { allocCudaDeviceData(cptr, len); } break; @@ -266,7 +266,7 @@ void allocCudaData(CudaData cudaDataSpace, T& cptr, int len) * data to CUDA array. */ template -void allocAndInitCudaData(CudaData cudaDataSpace, T& cptr, const T hptr, int len) +void allocAndInitCudaData(CudaDataSpace cudaDataSpace, T& cptr, const T hptr, int len) { allocCudaData(cudaDataSpace, cptr, len); initCudaData(cudaDataSpace, cptr, hptr, len); @@ -276,22 +276,22 @@ void allocAndInitCudaData(CudaData cudaDataSpace, T& cptr, const T hptr, int len * \brief Free Cuda data array. 
*/ template -void deallocCudaData(CudaData cudaDataSpace, T& cptr) +void deallocCudaData(CudaDataSpace cudaDataSpace, T& cptr) { switch (cudaDataSpace) { - case CudaData::Host: + case CudaDataSpace::Host: { deallocData(cptr); } break; - case CudaData::Pinned: + case CudaDataSpace::Pinned: { deallocCudaPinnedData(cptr); } break; - case CudaData::Managed: + case CudaDataSpace::Managed: { deallocCudaManagedData(cptr); } break; - case CudaData::Device: + case CudaDataSpace::Device: { deallocCudaDeviceData(cptr); } break; @@ -309,7 +309,7 @@ void deallocCudaData(CudaData cudaDataSpace, T& cptr) * and of propoer size for copy operation to succeed. */ template -void getCudaData(CudaData, T& hptr, const T cptr, int len) +void getCudaData(CudaDataSpace, T& hptr, const T cptr, int len) { cudaErrchk( cudaMemcpy( hptr, cptr, len * sizeof(typename std::remove_pointer::type), diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 1ee6bb230..26b29dce7 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -209,7 +209,7 @@ void deallocHipPinnedData(T& pptr) * and of proper size for copy operation to succeed. */ template -void initHipData(HipData, T& cptr, const T hptr, int len) +void initHipData(HipDataSpace, T& cptr, const T hptr, int len) { hipErrchk( hipMemcpy( cptr, hptr, len * sizeof(typename std::remove_pointer::type), @@ -222,22 +222,22 @@ void initHipData(HipData, T& cptr, const T hptr, int len) * \brief Allocate HIP data array (cptr). */ template -void allocHipData(HipData hipDataSpace, T& cptr, int len) +void allocHipData(HipDataSpace hipDataSpace, T& cptr, int len) { switch (hipDataSpace) { - case HipData::Host: + case HipDataSpace::Host: { allocData(cptr, len); } break; - case HipData::Pinned: + case HipDataSpace::Pinned: { allocHipPinnedData(cptr, len); } break; - case HipData::Managed: + case HipDataSpace::Managed: { allocHipManagedData(cptr, len); } break; - case HipData::Device: + case HipDataSpace::Device: { allocHipDeviceData(cptr, len); } break; @@ -253,7 +253,7 @@ void allocHipData(HipData hipDataSpace, T& cptr, int len) * data to HIP array. */ template -void allocAndInitHipData(HipData hipDataSpace, T& cptr, const T hptr, int len) +void allocAndInitHipData(HipDataSpace hipDataSpace, T& cptr, const T hptr, int len) { allocHipData(hipDataSpace, cptr, len); initHipData(hipDataSpace, cptr, hptr, len); @@ -263,22 +263,22 @@ void allocAndInitHipData(HipData hipDataSpace, T& cptr, const T hptr, int len) * \brief Free Hip data array. */ template -void deallocHipData(HipData hipDataSpace, T& cptr) +void deallocHipData(HipDataSpace hipDataSpace, T& cptr) { switch (hipDataSpace) { - case HipData::Host: + case HipDataSpace::Host: { deallocData(cptr); } break; - case HipData::Pinned: + case HipDataSpace::Pinned: { deallocHipPinnedData(cptr); } break; - case HipData::Managed: + case HipDataSpace::Managed: { deallocHipManagedData(cptr); } break; - case HipData::Device: + case HipDataSpace::Device: { deallocHipDeviceData(cptr); } break; @@ -296,7 +296,7 @@ void deallocHipData(HipData hipDataSpace, T& cptr) * and of propoer size for copy operation to succeed. 
*/ template -void getHipData(HipData, T& hptr, const T cptr, int len) +void getHipData(HipDataSpace, T& hptr, const T cptr, int len) { hipErrchk( hipMemcpy( hptr, cptr, len * sizeof(typename std::remove_pointer::type), diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 526aaca2b..0fe3aded9 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -191,7 +191,7 @@ class KernelBase } #if defined(RAJA_ENABLE_CUDA) - CudaData getCudaDataSpace() const { return run_params.getCudaDataSpace(); } + CudaDataSpace getCudaDataSpace() const { return run_params.getCudaDataSpace(); } template void initCudaData(T& cptr, const T hptr, int len) { @@ -220,7 +220,7 @@ class KernelBase #endif #if defined(RAJA_ENABLE_HIP) - HipData getHipDataSpace() const { return run_params.getHipDataSpace(); } + HipDataSpace getHipDataSpace() const { return run_params.getHipDataSpace(); } template void initHipData(T& cptr, const T hptr, int len) { diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 31c021f1d..921c52910 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -330,7 +330,7 @@ static const std::string FeatureNames [] = * ******************************************************************************* */ -static const std::string CudaDataNames [] = +static const std::string CudaDataSpaceNames [] = { std::string("Host"), @@ -355,7 +355,7 @@ static const std::string CudaDataNames [] = * ******************************************************************************* */ -static const std::string HipDataNames [] = +static const std::string HipDataSpaceNames [] = { std::string("Host"), @@ -561,26 +561,26 @@ const std::string& getFeatureName(FeatureID fid) /* ******************************************************************************* * - * Return memory space name associated with CudaData enum value. + * Return memory space name associated with CudaDataSpace enum value. * ******************************************************************************* */ -const std::string& getCudaDataName(CudaData cd) +const std::string& getCudaDataSpaceName(CudaDataSpace cd) { - return CudaDataNames[static_cast(cd)]; + return CudaDataSpaceNames[static_cast(cd)]; } /* ******************************************************************************* * - * Return memory space name associated with HipData enum value. + * Return memory space name associated with HipDataSpace enum value. * ******************************************************************************* */ -const std::string& getHipDataName(HipData hd) +const std::string& getHipDataSpaceName(HipDataSpace hd) { - return HipDataNames[static_cast(hd)]; + return HipDataSpaceNames[static_cast(hd)]; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 0e37cad2a..469b09c0d 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -246,7 +246,7 @@ enum FeatureID { * ******************************************************************************* */ -enum struct CudaData { +enum struct CudaDataSpace { Host = 0, Pinned, @@ -271,7 +271,7 @@ enum struct CudaData { * ******************************************************************************* */ -enum struct HipData { +enum struct HipDataSpace { Host = 0, Pinned, @@ -355,20 +355,20 @@ const std::string& getFeatureName(FeatureID vid); /*! ******************************************************************************* * - * \brief Return memory space name associated with CudaData enum value. 
+ * \brief Return memory space name associated with CudaDataSpace enum value. * ******************************************************************************* */ -const std::string& getCudaDataName(CudaData cd); +const std::string& getCudaDataSpaceName(CudaDataSpace cd); /*! ******************************************************************************* * - * \brief Return memory space name associated with HipData enum value. + * \brief Return memory space name associated with HipDataSpace enum value. * ******************************************************************************* */ -const std::string& getHipDataName(HipData hd); +const std::string& getHipDataSpaceName(HipDataSpace hd); /*! ******************************************************************************* diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 2af4c6460..d49a6e316 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -108,8 +108,8 @@ void RunParams::print(std::ostream& str) const str << "\n outdir = " << outdir; str << "\n outfile_prefix = " << outfile_prefix; - str << "\n cuda memory space = " << getCudaDataName(cudaDataSpace); - str << "\n hip memory space = " << getHipDataName(hipDataSpace); + str << "\n cuda memory space = " << getCudaDataSpaceName(cudaDataSpace); + str << "\n hip memory space = " << getHipDataSpaceName(hipDataSpace); str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { @@ -211,13 +211,13 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt == std::string("--print-cuda_memory_spaces") || opt == std::string("-pcms") ) { - printCudaDataNames(getCout()); + printCudaDataSpaceNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-hip_memory_spaces") || opt == std::string("-pcms") ) { - printHipDataNames(getCout()); + printHipDataSpaceNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-features") || @@ -449,9 +449,9 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt.at(0) == '-' ) { i--; } else { - for (int cms = 0; cms < static_cast(CudaData::NumSpaces); ++cms) { - if (getCudaDataName(static_cast(cms)) == opt) { - cudaDataSpace = static_cast(cms); + for (int cms = 0; cms < static_cast(CudaDataSpace::NumSpaces); ++cms) { + if (getCudaDataSpaceName(static_cast(cms)) == opt) { + cudaDataSpace = static_cast(cms); got_someting = true; break; } @@ -475,9 +475,9 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt.at(0) == '-' ) { i--; } else { - for (int hms = 0; hms < static_cast(HipData::NumSpaces); ++hms) { - if (getHipDataName(static_cast(hms)) == opt) { - hipDataSpace = static_cast(hms); + for (int hms = 0; hms < static_cast(HipDataSpace::NumSpaces); ++hms) { + if (getHipDataSpaceName(static_cast(hms)) == opt) { + hipDataSpace = static_cast(hms); got_someting = true; break; } @@ -791,23 +791,23 @@ void RunParams::printVariantNames(std::ostream& str) const } -void RunParams::printCudaDataNames(std::ostream& str) const +void RunParams::printCudaDataSpaceNames(std::ostream& str) const { str << "\nAvailable cuda memory spaces:"; str << "\n-------------------\n"; - for (int cms = 0; cms < static_cast(CudaData::NumSpaces); ++cms) { - str << getCudaDataName(static_cast(cms)) << std::endl; + for (int cms = 0; cms < static_cast(CudaDataSpace::NumSpaces); ++cms) { + str << getCudaDataSpaceName(static_cast(cms)) << std::endl; } str.flush(); } -void RunParams::printHipDataNames(std::ostream& str) const +void 
RunParams::printHipDataSpaceNames(std::ostream& str) const { str << "\nAvailable hip memory spaces:"; str << "\n-------------------\n"; - for (int hms = 0; hms < static_cast(HipData::NumSpaces); ++hms) { - str << getHipDataName(static_cast(hms)) << std::endl; + for (int hms = 0; hms < static_cast(HipDataSpace::NumSpaces); ++hms) { + str << getHipDataSpaceName(static_cast(hms)) << std::endl; } str.flush(); } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index df1da66a4..9858c8853 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -136,9 +136,9 @@ class RunParams { return false; } - CudaData getCudaDataSpace() const { return cudaDataSpace; } + CudaDataSpace getCudaDataSpace() const { return cudaDataSpace; } - HipData getHipDataSpace() const { return hipDataSpace; } + HipDataSpace getHipDataSpace() const { return hipDataSpace; } double getPFTolerance() const { return pf_tol; } @@ -216,8 +216,8 @@ class RunParams { void printFullKernelNames(std::ostream& str) const; void printKernelNames(std::ostream& str) const; void printVariantNames(std::ostream& str) const; - void printCudaDataNames(std::ostream& str) const; - void printHipDataNames(std::ostream& str) const; + void printCudaDataSpaceNames(std::ostream& str) const; + void printHipDataSpaceNames(std::ostream& str) const; void printGroupNames(std::ostream& str) const; void printFeatureNames(std::ostream& str) const; void printFeatureKernels(std::ostream& str) const; @@ -248,8 +248,8 @@ class RunParams { std::string reference_variant; /*!< Name of reference variant for speedup calculations */ - CudaData cudaDataSpace = CudaData::Device; - HipData hipDataSpace = HipData::Device; + CudaDataSpace cudaDataSpace = CudaDataSpace::Device; + HipDataSpace hipDataSpace = HipDataSpace::Device; // // Arrays to hold input strings for valid/invalid input. Helpful for From 82e3cfe81de866599999e203c3c7ad3211ad6c6c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 29 Jun 2022 08:47:30 -0700 Subject: [PATCH 012/137] Update src/common/RAJAPerfSuite.cpp --- src/common/RAJAPerfSuite.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 921c52910..730c9824d 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -421,7 +421,6 @@ const std::string& getVariantName(VariantID vid) return VariantNames[vid]; } - /*! ******************************************************************************* * From 6105ef8f82c603e78938f09e1191bf0aaf999246 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 11 Jul 2022 10:29:40 -0700 Subject: [PATCH 013/137] Add hip Coarse and Fine memory spaces --- src/common/HipDataUtils.hpp | 94 ++++++++++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 7 +++ src/common/RAJAPerfSuite.hpp | 7 +++ 3 files changed, 108 insertions(+) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 26b29dce7..2aedab3be 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -98,6 +98,16 @@ __device__ inline Index_type lambda_hip_get_index() { return blockIdx.z; } +/*! + * \brief Get current hip device. + */ +inline int getHipDevice() +{ + int device = hipInvalidDeviceId; + hipErrchk( hipGetDevice( &device ) ); + return device; +} + /*! * \brief Copy given hptr (host) data to HIP device (dptr). * @@ -124,6 +134,17 @@ void allocHipDeviceData(T& dptr, int len) len * sizeof(typename std::remove_pointer::type) ) ); } +/*! 
+ * \brief Allocate HIP fine-grained device data array (dfptr). + */ +template +void allocHipDeviceFineData(T& dfptr, int len) +{ + hipErrchk( hipExtMallocWithFlags( (void**)&dfptr, + len * sizeof(typename std::remove_pointer::type), + hipDeviceMallocFinegrained ) ); +} + /*! * \brief Allocate HIP managed data array (mptr). */ @@ -146,6 +167,39 @@ void allocHipPinnedData(T& pptr, int len) hipHostMallocMapped ) ); } +/*! + * \brief Allocate HIP fine-grained pinned data array (pfptr). + */ +template +void allocHipPinnedFineData(T& pfptr, int len) +{ + hipErrchk( hipHostMalloc( (void**)&pfptr, + len * sizeof(typename std::remove_pointer::type), + hipHostMallocMapped | hipHostMallocCoherent ) ); +} + +/*! + * \brief Allocate HIP coarse-grained pinned data array (pcptr). + */ +template +void allocHipPinnedCoarseData(T& pcptr, int len) +{ + hipErrchk( hipHostMalloc( (void**)&pcptr, + len * sizeof(typename std::remove_pointer::type), + hipHostMallocMapped | hipHostMallocNonCoherent ) ); +} + +/*! + * \brief Apply mem advice to HIP data array (ptr). + */ +template +void adviseHipData(T& ptr, int len, hipMemoryAdvise advice, int device) +{ + hipErrchk( hipMemAdvise( (void*)ptr, + len * sizeof(typename std::remove_pointer::type), + advice, device ) ); +} + /*! * \brief Allocate HIP device data array (dptr) and copy given hptr (host) * data to device array. @@ -225,22 +279,55 @@ template void allocHipData(HipDataSpace hipDataSpace, T& cptr, int len) { switch (hipDataSpace) { + case HipDataSpace::Host: { allocData(cptr, len); } break; + case HipDataSpace::HostAdviseFine: + { + allocData(cptr, len); + adviseHipData(cptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); + } break; + case HipDataSpace::HostAdviseCoarse: + { + allocData(cptr, len); + adviseHipData(cptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); + } break; case HipDataSpace::Pinned: { allocHipPinnedData(cptr, len); } break; + case HipDataSpace::PinnedFine: + { + allocHipPinnedFineData(cptr, len); + } break; + case HipDataSpace::PinnedCoarse: + { + allocHipPinnedCoarseData(cptr, len); + } break; case HipDataSpace::Managed: { allocHipManagedData(cptr, len); } break; + case HipDataSpace::ManagedAdviseFine: + { + allocHipManagedData(cptr, len); + adviseHipData(cptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); + } break; + case HipDataSpace::ManagedAdviseCoarse: + { + allocHipManagedData(cptr, len); + adviseHipData(cptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); + } break; case HipDataSpace::Device: { allocHipDeviceData(cptr, len); } break; + case HipDataSpace::DeviceFine: + { + allocHipDeviceFineData(cptr, len); + } break; default: { throw std::invalid_argument("allocHipData : Unknown memory type"); @@ -267,18 +354,25 @@ void deallocHipData(HipDataSpace hipDataSpace, T& cptr) { switch (hipDataSpace) { case HipDataSpace::Host: + case HipDataSpace::HostAdviseFine: + case HipDataSpace::HostAdviseCoarse: { deallocData(cptr); } break; case HipDataSpace::Pinned: + case HipDataSpace::PinnedFine: + case HipDataSpace::PinnedCoarse: { deallocHipPinnedData(cptr); } break; case HipDataSpace::Managed: + case HipDataSpace::ManagedAdviseFine: + case HipDataSpace::ManagedAdviseCoarse: { deallocHipManagedData(cptr); } break; case HipDataSpace::Device: + case HipDataSpace::DeviceFine: { deallocHipDeviceData(cptr); } break; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 730c9824d..6b6cb751c 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -359,9 +359,16 @@ static const 
std::string HipDataSpaceNames [] = { std::string("Host"), + std::string("HostAdviseFine"), + std::string("HostAdviseCoarse"), std::string("Pinned"), + std::string("PinnedFine"), + std::string("PinnedCoarse"), std::string("Managed"), + std::string("ManagedAdviseFine"), + std::string("ManagedAdviseCoarse"), std::string("Device"), + std::string("DeviceFine"), std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 469b09c0d..facf406e5 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -274,9 +274,16 @@ enum struct CudaDataSpace { enum struct HipDataSpace { Host = 0, + HostAdviseFine, + HostAdviseCoarse, Pinned, + PinnedFine, + PinnedCoarse, Managed, + ManagedAdviseFine, + ManagedAdviseCoarse, Device, + DeviceFine, NumSpaces // Keep this one last and NEVER comment out (!!) From fc9d302dbc7e2b45c193ec1482368ed367a36c8b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 18 Jul 2022 13:48:57 -0700 Subject: [PATCH 014/137] Guard memory advice from rocm 5.2 --- src/common/HipDataUtils.hpp | 16 ++++++++++++++++ src/rajaperf_config.hpp.in | 9 +++++++++ 2 files changed, 25 insertions(+) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 2aedab3be..f28a93b35 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -286,13 +286,21 @@ void allocHipData(HipDataSpace hipDataSpace, T& cptr, int len) } break; case HipDataSpace::HostAdviseFine: { +#ifdef RAJAPERF_USE_MEMADVISE_COARSE allocData(cptr, len); adviseHipData(cptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); +#else + throw std::invalid_argument("allocHipData : HostAdviseFine not available with this hip version"); +#endif } break; case HipDataSpace::HostAdviseCoarse: { +#ifdef RAJAPERF_USE_MEMADVISE_COARSE allocData(cptr, len); adviseHipData(cptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); +#else + throw std::invalid_argument("allocHipData : HostAdviseCoarse not available with this hip version"); +#endif } break; case HipDataSpace::Pinned: { @@ -312,13 +320,21 @@ void allocHipData(HipDataSpace hipDataSpace, T& cptr, int len) } break; case HipDataSpace::ManagedAdviseFine: { +#ifdef RAJAPERF_USE_MEMADVISE_COARSE allocHipManagedData(cptr, len); adviseHipData(cptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); +#else + throw std::invalid_argument("allocHipData : ManagedAdviseFine not available with this hip version"); +#endif } break; case HipDataSpace::ManagedAdviseCoarse: { +#ifdef RAJAPERF_USE_MEMADVISE_COARSE allocHipManagedData(cptr, len); adviseHipData(cptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); +#else + throw std::invalid_argument("allocHipData : ManagedAdviseCoarse not available with this hip version"); +#endif } break; case HipDataSpace::Device: { diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 43ae990a9..2146536d8 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -28,6 +28,15 @@ #cmakedefine RAJA_PERFSUITE_ENABLE_MPI #cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN +#if defined(RAJA_ENABLE_HIP) +#include +#if (HIP_VERSION_MAJOR > 5) || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) +// enable memadvise coarse with rocm version >= 4.3 +#define RAJAPERF_USE_MEMADVISE_COARSE +#endif +#endif + namespace rajaperf { struct configuration { From 6fe301be91b5a43d703c208bce971dddcde93979 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 8 Feb 2023 12:11:06 -0500 Subject: [PATCH 
015/137] add aligned host memory allocator --- src/common/DataUtils.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 280269ad8..651f084b7 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -14,6 +14,7 @@ #include "RAJA/internal/MemUtils_CPU.hpp" #include +#include namespace rajaperf { @@ -109,6 +110,22 @@ void allocData(Complex_ptr& ptr, int len) ptr = new Complex_type[len]; } +template +void allocAlignedData(T& aptr, int alignment, int len) +{ + if(posix_memalign((void**)&aptr, alignment, + len * sizeof(typename std::remove_pointer::type)) != 0){ + aptr = nullptr; + throw std::bad_alloc(); + } +} + + +inline int getDefaultAlignment() +{ + return getpagesize(); +} + /* * Free data arrays of given type. From 6a5cda4805bb2d59a432ba8144fca4546e6e8879 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 8 Feb 2023 13:14:14 -0800 Subject: [PATCH 016/137] Add top-level sphinx docs structure. --- .gitignore | 15 +- CMakeLists.txt | 4 + docs/CMakeLists.txt | 18 ++ docs/conf.py | 351 ++++++++++++++++++++++++++ docs/index.rst | 70 +++++ docs/sphinx/.gitignore | 3 + docs/sphinx/dev_guide/index.rst | 23 ++ docs/sphinx/rajaperf_license.rst | 33 +++ docs/sphinx/user_guide/CMakeLists.txt | 28 ++ docs/sphinx/user_guide/index.rst | 26 ++ 10 files changed, 569 insertions(+), 2 deletions(-) create mode 100644 docs/CMakeLists.txt create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/sphinx/.gitignore create mode 100644 docs/sphinx/dev_guide/index.rst create mode 100644 docs/sphinx/rajaperf_license.rst create mode 100644 docs/sphinx/user_guide/CMakeLists.txt create mode 100644 docs/sphinx/user_guide/index.rst diff --git a/.gitignore b/.gitignore index f85f0f8ec..10b3b40f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,16 @@ +.cache +.idea +*.sync-conflict-* *.pyc *.o +*.a *.exe -build*/ -install*/ +*.gch +/*.sublime-* +/build/ +/build_*/ +/build-*/ +/install/ +/install_*/ +/install-*/ +/Debug/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c7233236..f34947517 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,3 +200,7 @@ add_subdirectory(src) if (RAJA_PERFSUITE_ENABLE_TESTS) add_subdirectory(test) endif() + +if (RAJA_PERFSUITE_ENABLE_DOCUMENTATION) + add_subdirectory(docs) +endif () diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt new file mode 100644 index 000000000..ac86f5bcc --- /dev/null +++ b/docs/CMakeLists.txt @@ -0,0 +1,18 @@ +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if (RAJA_PERFSUITE_ENABLE_DOCUMENTATION) + if (SPHINX_FOUND) + blt_add_sphinx_target( raja_docs ) + endif() + + if ( NOT SPHINX_FOUND ) + message(WARNING "RAJA_PERFSUITE_ENABLE_DOCUMENTATION=On, but Sphinx not found. \ + Documentation won't be built.") + endif () +endif() diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..246d2edc1 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,351 @@ +# -*- coding: utf-8 -*- +# +# RAJA documentation build configuration file, created by +# sphinx-quickstart on Fri Apr 29 15:12:14 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. 
+# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import subprocess + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('_exts')) + +# Call doxygen in ReadtheDocs +#read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' +#if read_the_docs_build: +# # Generate an RST file for Doxygen index, this is replaced by the real +# # index.html by hooking into the Sphinx build-finished event at the bottom of +# # this file +# cwd=os.getcwd() +# fpath=os.path.join(cwd,"doxygen/html") +# if (os.path.isdir(fpath) == 0): +# os.makedirs(fpath) +# with open(os.path.join(fpath,"index.rst"), 'w') as f: +# print("Writing file {}", f) +# f.write(".. _doxygen:\n") +# f.write("\n") +# f.write("*******\n") +# f.write("Doxygen\n") +# f.write("*******\n") +# +# Get current directory +conf_directory = os.path.dirname(os.path.realpath(__file__)) + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = [os.path.join(conf_directory, 'sphinx/_templates')] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'RAJAPerf' +copyright = u'2017-2023, Lawrence Livermore National Security, LLNS' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'2022.10' +# The full version, including alpha/beta/rc tags. +release = u'2022.10.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML help pages. See the documentation for +# list of built-in themes. +try: + import sphinx_rtd_theme +except: + html_theme = 'classic' + html_theme_options = { + 'codebgcolor': 'lightgrey', + 'stickysidebar': 'true' + } + html_theme_path = [] +else: + html_theme = 'sphinx_rtd_theme' + html_theme_options = {} + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {'codebgcolor': 'lightgrey'} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = [os.path.join(conf_directory, 'sphinx/_static')] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. 
+#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'RAJAdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'RAJA.tex', u'RAJA Documentation', + u'LLNL', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'rajaperf', u'RAJA Perf Suite Documentation', + [u'RAJA Team'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'RAJAPerf', u'RAJA Perf Documentation', + 'RAJA Team', 'RAJAPerf', 'Performance Portability for HPC Simulations', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. 
+#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + +# Generate Doxygen, and overwrite the index.rst in doxygen/html +# Only do this on readthedocs +#def gendoxy(app, exception): +# if read_the_docs_build: +# buildpath=os.path.join(conf_directory,"_build/html/doxygen/html") +# if (os.path.isdir(buildpath) == 0): +# os.makedirs(buildpath) +# +# if (os.path.exists(os.path.join(buildpath, 'index.html"'))): +# print("Removing existing index.html") +# os.remove(os.path.join(buildpath, "index.html")) +# +# # Call doxygen +# from subprocess import call +# call(['doxygen', "./doxygen/Doxyfile"]) +# +# +#def setup(app): +# app.connect('build-finished', gendoxy) diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..969d4fe1a --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,70 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + + +######################## +RAJA Performance Suite +######################## + +============================= +Background and Motivation +============================= + +================================= +Git Repository and Issue Tracking +================================= + +The main interaction hub for the RAJA Performance Suite is +`GitHub `_ There you will find the Git +source code repository, issue tracker, release history, and other information +about the project. + +================================ +Communicating with the RAJA Team +================================ + +If you have questions, find a bug, have ideas about expanding the +functionality or applicability, or wish to contribute to RAJA Performance Suite +development, please do not hesitate to contact us. We are always +interested in improving the Suite and exploring new ways to use it. + +The best way to communicate with us is via our email list: ``raja-dev@llnl.gov`` + +A brief description of how to start a contribution to RAJA can be found in +:ref:`contributing-label`. + +=========================================== +RAJA Performance Suite User Documentation +=========================================== + + * :doc:`RAJA Performance Suite User Guide ` + +================================================ +RAJA Performance Suite Developer Documentation +================================================ + + * :doc:`RAJA Developer Guide ` + +====================================================== +RAJA Copyright and License Information +====================================================== + +Please see :ref:`raja-copyright`. + +.. toctree:: + :hidden: + :caption: User Documentation + + sphinx/user_guide/index + +.. toctree:: + :hidden: + :caption: Developer Documentation + + sphinx/dev_guide/index + sphinx/rajaperf_license diff --git a/docs/sphinx/.gitignore b/docs/sphinx/.gitignore new file mode 100644 index 000000000..bf4af2473 --- /dev/null +++ b/docs/sphinx/.gitignore @@ -0,0 +1,3 @@ +package_list.rst +command_index.rst +_build diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst new file mode 100644 index 000000000..2a6bebebd --- /dev/null +++ b/docs/sphinx/dev_guide/index.rst @@ -0,0 +1,23 @@ +.. ## +.. 
## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + + +######################################## +RAJA Performance Suite Developer Guide +######################################## + +RAJA Performance Suite development follows nearly identical processes as +RAJA development, which are documented in the +`RAJA Developer Guide `_ +This guide contains information about differences in development and +maintenance specific to the RAJA Performance Suite. + +.. toctree:: + :maxdepth: 1 + +.. ## ci_tasks diff --git a/docs/sphinx/rajaperf_license.rst b/docs/sphinx/rajaperf_license.rst new file mode 100644 index 000000000..a7985861f --- /dev/null +++ b/docs/sphinx/rajaperf_license.rst @@ -0,0 +1,33 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _rajaperf-copyright: + +========================================================== +RAJA Performance Suite Copyright and License Information +========================================================== + +Copyright (c) 2017-23, Lawrence Livermore National Security, LLC. + +Produced at the Lawrence Livermore National Laboratory. + +All rights reserved. See additional details below. + +Unlimited Open Source - BSD Distribution + +LLNL-CODE-738930 + +OCEC-17-159 + + +RAJA Performance Suite License +-------------------------------- + +.. include:: ../../LICENSE + +.. include:: ../../NOTICE diff --git a/docs/sphinx/user_guide/CMakeLists.txt b/docs/sphinx/user_guide/CMakeLists.txt new file mode 100644 index 000000000..912f38a7a --- /dev/null +++ b/docs/sphinx/user_guide/CMakeLists.txt @@ -0,0 +1,28 @@ +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA erformance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build") +set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html") + +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py" + "${SPHINX_BUILD_DIR}/conf.py" + @ONLY) + +add_custom_target(rajaperf-userguide-sphinx + ${SPHINX_EXECUTABLE} + -q -b html + -c "${SPHINX_BUILD_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}" + "${SPHINX_HTML_DIR}" + COMMENT "Building HTML user documentation with Sphinx") + +install(DIRECTORY "${SPHINX_HTML_DIR}" + DESTINATION "docs/user_guide/sphinx/" OPTIONAL) + +add_dependencies(rajaperf-docs + rajaperf-userguide-sphinx) diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst new file mode 100644 index 000000000..9854f87f8 --- /dev/null +++ b/docs/sphinx/user_guide/index.rst @@ -0,0 +1,26 @@ +.. ## +.. ## Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. 
## + + +################################### +RAJA Performance Suite User Guide +################################### + +If you have some familiarity with RAJA and want to get up and running quickly, +check out :ref:`getting_started-label`. This guide contains information +about accessing the RAJA Performance Suite code, building it, and basic usage. + +If you want information about RAJA, please see the +`RAJA User Guide `_ +That Guidecontains a detailed discussion of RAJA features and a RAJA tutorial, +both of which contain code examples. + +.. toctree:: + :maxdepth: 3 + +.. ## getting_started From 77eecfc7bcdb627009fdab8e858123a3db5e45de Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 8 Feb 2023 13:20:40 -0800 Subject: [PATCH 017/137] Add readthedocs yaml file --- .readthedocs.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .readthedocs.yml diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 000000000..c796cf536 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,34 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +## +## Code for debugging directory structure in readthedocs +## to fix broken links, etc. +## +# Print tree +#build: +# os: ubuntu-20.04 +# apt_packages: +# - tree +# tools: +# python: "3.7" +# jobs: +# post_build: +# - tree -J + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally build docs in add'l formats such as PDF and ePub +#formats: all + +# Optionally set the version of Python and requirements to build the docs +python: + version: 3.7 + install: + - requirements: docs/requirements.txt From 54b17b42bee355f82c7405bbb7352df5fc8b6025 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 8 Feb 2023 13:24:54 -0800 Subject: [PATCH 018/137] add raja logo to read the docs --- docs/conf.py | 2 +- .../RAJA_LOGO_CMYK_White_Background_large.png | Bin 0 -> 112034 bytes 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 share/raja/logo/RAJA_LOGO_CMYK_White_Background_large.png diff --git a/docs/conf.py b/docs/conf.py index 246d2edc1..3e1e9e534 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -174,7 +174,7 @@ # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = +html_logo = '../share/raja/logo/RAJA_LOGO_CMYK_White_Background_large.png' # The name of an image file (within the static path) to use as favicon of the # docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32
diff --git a/share/raja/logo/RAJA_LOGO_CMYK_White_Background_large.png b/share/raja/logo/RAJA_LOGO_CMYK_White_Background_large.png
new file mode 100644
index 0000000000000000000000000000000000000000..dd449495c56fd90ed781897f9c53a0de85a5c463
GIT binary patch
literal 112034

[base85-encoded binary payload for RAJA_LOGO_CMYK_White_Background_large.png (112034 bytes) omitted]
zJ@9Ey$8k2afjKy$JQ7!YFSqL8w`S-|;Z!kxNtN%qS@UjY&S`o&&dGm$|M;G7OPg!) zO_^XgM77fsY>>P8({f@0By$BocwktQnPxu%Ao^k{0_h#TSRv!0sfdPK*DkMnbSRd=nA zEP_A0@YG+o{1*Xg<-=aPc_*xAef`%;-X!nLe>#Ia%8Xp-L`w?k@cb`zUIO3N8R`2! z3&fhr3uDnL(XJJVA&u~0B^1w8fi5xP+>L{HSD@;)E-?u4zSg1pC)=fLfEA63 zT))0Ss_DuBsmn=n^TKVdWgsLm-G@AlK_ zm4m27f&0kEbkmKhG%eoUFQX+}`PCybGUlN&U>{iDd0bdb#qJB!qt&%+p!e46^!gymYv202W>xCK20$&Bng1Eb?h zzT{odLi-My%J&F~H)u5AjsE^c1nnMkPdF4)L`nEF(e!A8QIfbNv}mGd7<|~eNvPE~ zfQ~j?`}r@Ig=!zmR(M33w-gfj&#K*x{m+*!v`XFEzBO?8e|*XMQj12006S(b>JoI) zf4Y9uRAx^S$vSNP)YK&8^t#yy3XSkw_yhztITOqsX+oeQQ_x1UI%fI{$F6kIv}yVm zkw~Ndf(QGLki42_$@?s0Xv9j4Yf)X8Irw`K_! zeF?)BY)jpnzEWp~nH@ocb-iqIJ!lB;m*lk2kouVhwce_Y3dfR(PlBu!5&p-vZ({(o zUux8^^`XqaVWe?m3~Ek4(+G4*KxC`SJ}!S?mT3==tVa$+(@#`)h{*Iyw>HhZUSv-}{X=1KNwFLqbJ_Ce_M+TpK(_6ft479O8)B{uBVbGg7k$G?58{;JAfUL z=3?(Up#b%ypNr7RcvE6)U=&3)+KZL2ZC-9AGbK%Ve`=R9tfHGdd~!`_ymTM#lA6x> zAlYm0E4{fagjWbC6}l9*vRrS=)nlTkpAC%n-ei<*Q2i_xbgPf3CSco#EVCT&6UUAj zZVF$f2tprVwmcs>S?OD@oD$5u@mu2ZY>#AT08TQ0!%2EU;p-Sz};hv3k zbcU!jE-Ktmnq^t-VEEY0iVtZ1hXALD6`ju=L&Rv?xyyZP0hkEPfW7}86!`2C)?Z*g0ar0u;h&M@ z+;4xGAJglOnq$wHZuF6gAm!uz29J``hN`kZXso>T0KRcgO?mWfa(kQP14*L< zJ@VVkk2r zaw34}*tIJEFlV=vJ-6JsH{&%2{V?UwZFQmB0&B6rjM943XqBMNKm?0{zTU;Xoz*p5 z-{yYpxv^1PAd(89tTgjaH%uDaQsj{@2G1Y~Cd}xc?!JjI%RLXs&j2xS_TY)y@$JWMN8ZA=Wx`j*V|P?d zK}|vuA3-Ppg2>n%F(!9dvC`{~Y!2 zd3~7VlKXI3nOvundg~f~M-E*57ErxYP^Rq3iBiK?s9JFiDFPZ~o+6PFwZYLc+P@DX z+m4I3>G|hrlth0CErJVHkpmEwVMwfyWVLk?I{z4|uT9N1rG0u@zW@nWqv4SpLAN9; zJdOQgz;4EDCi7VSK%@NA%=50o_3KzWBdhf2q5TxWFDm#=**S)mU=>_C|0~Thz)IXKy4o5|4Y1s2r5l}zQe_V3Wmw3APDcBf{hp`0m{aPKJ@OCzr_pYXx7XC%wkj zKoMW6{gEche=kO4505p0ts$f9KS2#4*disB+h(_%qs$8=d(*$7%W*_$9Fq0>r|Ki3 zkVPiG_B}87KFQJcpx2fDX5FJ4X@xm~Z+J&qPuMmY876_}7K#7q1`!9P zftM`;oUN=tnlJ-L@(LHadJgmNjK)KeOK-omDG&lh7bnDaSh>e<{s^df%KKG@ta~M8 zG#N-583Qz|1gMm&jb$VXB9i<|aKAIFAnU5K{_^v}DYu$qUz`TocoDI`<8Z1Cvu+ zvVx(5Qs!ma9sP3}Rpv%6eJxp^BZNGT`v?Y7+=%zV9kfzPX1Ii(uRlu{g&imag1Z~E zU5rNxLj!?xvs`1%I^Y9;#ps~&ua*_JVo-N1GPs_vrA+fZgqmM1s%^PdqNiS^DlO8TS$?{^b|1xbon!bl9#G;Pcu(W9099(U6>1J6v~w6#yD&_Fyum9#AvDoo5|qgO z5|`^GJFR{oW8-5BEv$HONo!$e5k+rhfr;tq-W)msiY(p*VTag9A@I&}8|(I{+h+a2 zKDwMB52fXN-3dE}!B*|YMO`roJCZyp(-0J7mcJ-WG$qEv6H&zc^Cc$UZz=tgoj$`Pnkp)l3WX6^`?3UGlR}41KF`G+;DRPjpv|pJ^5qY(&(UY{A$& zG8)lnetSBno(_32Yk_X139p8WvXnvFd7i^N{;t`m4JPYL$%T{C09)D&ZJnM*FUNNq z%OfRO3UPEp$b3*CWwC#8%#bc^gPL&U>O%_&S}3msm4octmq4nqbJ8U#HNy-jp4q`y z@Kr~Clt>$|Ki%_X8p){qvzu5>+pJ})IdnLJYW8k6dXAK3{aA>~b|+2IRO?8fC3U$i z|M7lqa+(zrXVES;>~Q4XS`>>O8UA|ngdxi-&0MPe+u=dx_g!_6wnUH)#3r(@Jzq$v z0AgWxK~kv;(Uc!1!JpuhFP<9rCRl2|>G?EZE_zAV1yY=e+;daD4(*6P_`L>zUI473#0 z=~XyUNS`RFZfCXXm=!9N?tP*lDH|hGjDQGa44aP!S6D7ea^f>OR zF8OsncP$h=JP?4%K#a-r9Heo6}e8ccaB z`|wPIEwIbCa4#VD9+$XPZF8Vh9LdE;!IexiYNt=`J>fy+wzZC|`!|C-hzk$(bUyWU zKoL3bxQ|a%(@6v-Z91Y5)ZpGr2OcAu`r_3Sos9v5yM<#xQ$#Td?`hLIB2@aa{+-%oxhZlwPS)Aiz{9p=H>;J|=Iw*;`eG^s#%)|8E zKgKa9u55jd;y_qJu)3n5^IW;D0yYgIuK^hrZJY+d!_rPu*^a#T$hoWBa3qC!%UiiG z#~^eM$*D}pFrah*?eErYq$eRQ`kmb`S+^y$beX3^?4Tsyg$S{{so(jQx~z<&6BYJh zTJ>N3M!t?TBQG5K%tI;(@6qL7S3b|9goG#R`Q;mLtUDA~kyagL;o%$9?JJ?x)!aX8 zWuxz9m7HvmQQ5&+9}6+6Gv+v>RpN$#n4-&QZXidg`{Krw*#6D?%iKJ#ve#I@R;I{u z6fLD!?kqw&$FFV3y8gTGfJl;3>XKzWxrg}{LeF*&GrhAfh-4h9GbJOiu*E!VEs6d@ zRe1r?L5XGP>uc0TL1riC!`C+(ujWh*H0r8u^VXWmi*$5Z;eDdY?{>BKx`g`lv1F|U z+sCp)4uH9kU6R=d$t!dwnBRq-yT+)WP`??B`0fEkqpQjjqGM@Z)Eq^qLPhr_}3Zi=nq?MfiBahStH6agMKpm_bJ7``Ek zG6dg29z33`L@C@HoHL-PJxy*=<46{pXJDNjVzn!gJ>2sJPD#R@4oGA~%AYM3n?;dx z`zniff4jjYv7DK!s>X*tIN55_bc+T3EmM?E*}&4DnQ@9LOOopzFIM-wgyJ0?u$nZd zE1CTI=2JQfU0S@)<{E#%1y5K$*3O#7)w8~m034X4Yq@F%*oZEO`M8&2$0}QNM)qsE zJbZsQq@*UuR*-yp-OBd;#^X$(=RpQ}Ul7?N 
zx^z+%+JurHn-%T5UH&l{vu_+P!8Ua$XyM?4UBqQD@Z6sDz}Utp=lADsV`Jj#gvGgi z7;M2CW4H{mm#aN|TZ(-`Hkz)rvL%dIe?^(uYN{{c)iO*8&HU_J;K>W0v`s*`>DBz-%QMNU18_?U*!6UA3ZbQy`R@xoV06~3ax z^9DC0KQ|T&4?2EKg&W%!noMuDMpgQv$i(|qZ^;Yh{pZZb2+OJq#A(ZSNGyz-x-0;> zF_aQD3KNC_@<>)&9?Z5u3njB<6(53{=@vGa-lXO6_+yx zX}i5+3L)M#UcTWq$rSo$obQ#)d&m3Z!nu1XQ|_@hutznkso^;uTilGkKR1Fvw|u>b zp*Rs=+#fS{1*i|!f%WM7#r9p~r&S-Qf(ovk5KXf#B#)g!Q@bZK<@dndlN#dUt|lth zom|bZDi2Kkp1`%%`Hub*>B&sQT}`*`Hx@hi*}<~?rLUmSr}u8g_rsqs>le0R5wgLi zK&bqD&H4i+EW-1S6TS{k{=P)O2<((cmHv^qYz&H*5Fytaxqr0*eev9(G_QOYoD^8F zB~wI$#wC8d1PwZ)f7hODCca&W$D02#%yF;LRX|VVb^hM|0W>((yG9A11QqU|&bN;v zizz``YB_GLKGgEbxD=d->72+N={rB^k-uOCgFk;@Ot4I}QeV(fs|#W*Dk`*HC@rnH zE%@82=QmQ~-ne^FBF$Ab_A6R7K{OdcNf6Va<&P|m6^Y!Vxc1ZTpo$vKEZFF$ZXqbX^0VC1+I$%>C5{n!o5x*smZD!D;>Wg#kUaDZJ>;n@5WZI ztkq;!?`L_H&@8eF#-Dujh7L9JI_M!@uUlIJW<;1dpJOU&a^*>wCVYi+0Z}UrtRbfP zPqpLih8b3ya`(1C-#=y^m8pAW_TzTM>YqS}Pb`2?>TRMt*l|Cf`yTrW>bE12?)`a1 z_wC(T-dp7TxH6K=V5^>xK3-fT3|$~V&I3L$y;Kao{j$2u-Cou$8eldvdNYe_Ou1?x z^an!Y@8&%O%Yv*m{>-?RYi?a3$&relmmlpmptXPtY z{Q<;y6C>>_ejZlr?t20U?l7w3BqebZz4A#sM>6%THL(~A?#+hb_(!5!2PvHXsFc-e z9>}GE?f0-}BANwitz3=&-%y6PF3$COvym1t=yUV4*6jorF#a` zwRcn5ot8_1;u81Cs)Cdu_U%g~h{LD(Tg!D5F1}*ZHz|!6^1ChpaL>aB$hSIy7bN}Nu~2*kA7s<)V&+{DL^*U>`|f*V8`(?`k(^YO&`GIbB#r}Pm?i31325Ii zZQE4%$JjO84WFwb3^R>Sz4z{ADvg8Tpx>R;lg<^EwwxeAJ!*zwpa#0GPU#Mb=jx3| zkmc0(tG z&n;b7Pi>{`iSZzDedP9`I`p=`uE9dyZRPOG*Md~hVNy-|{Z=R86~&&gUxHZw1Pj!t zyhK1TDCc}45Q65z-;m2~Lw5&Lx52~e)B^APZB2iC>;6OkU-Cx%d-haLmu;{OV*?kh zF2#YRcwDJ2wzRw+GyZTwPZEXcSg}fM-#US~+kWwU>SFY`+G)M+_Rngq4F>XuuGSHH zLe{JkEjt66H;RZHo{E@mcCHD0aFdF4=gxw%EoHq(`X^OFpdk83nxU$S7A1 z!3-LF7a5T{=#QX0o-#Z!`Tk0K{%Z1KlBFGjUqiCQHY8~k@1dvZ$fhmeT91cGQlj-G zv_hc$6!8NXgYk33VCu$J4WTan+2!4?m$W$bm?1pH=Ck^YvD@sZovFGc&z$>;1Y>TR zp4W++P57Zb1)@r%XDch{o9H2TI4OeZThQ!g*yOW#u80}#=r?mVZ2u#$i1>gUDr4WF zF1}(Imq~X2(!&9vDv?%(m1#&Pbu>F&q%yI{`+c{Xz0O3zSOTV%0RR5zeLp*}xO)L2 z>}7crBiRL<)Z3xRs>-!>{K$J5d%@+}$gf5@rTLy)L@sc8!c^-{w)xeYpK#7rC#k`a z{(6l&^B~{ta=;kZI`b_24e9dQVr>NfHFhse zcjkb!cP~HsJD8VjZKo1Eg~R7;N)ii=GUlBv7%BGI7+?E1zYX^SSr!#o5&3Xzhzc#3 zwEcFG(FWLLgh%@ng`aaY!oD!CiM=`hisTpT1@g3pIUhT-%@^@}Zjvi^#%Ur;b|)>Y4&occ7g%R*gc zoGojZNf3o9ssi`47ri93cjdP=5K6!F3@j>-&gNiCBMTD-L!uQ}17oh9wO!$BV= z{|Xr|fORRT4=KzKBRvmeV6ggrj9Ik!kLUU)0g>Fn|!_*hqOlEReAb(4;>EY{_@$T zg%3BF_qvI52_xjQY5TO7kGqB4Q|G>}4JUwX@lY?Sij`3^?;cN8xHEJQfA+4hy^a9b zXBUMaevgWZAQE$?)(>N>a(cH+~Np?P`3YjI-GWJ>2S* zwqLRLJuctOw0J&Jh9Kv)b_l^2B#S>11J3pcKiLFg$pt9$hBx)tY6^RYNVsJ$iVo$& z0%KTt1v{mo50yW*IAoAZ>~9%Ekm+9A7<7rxiFEZ6(bNgA-xTNb%RzqW0<{A0@k>MV zlU5>w43=9?am*MjVGU@mwbIcRuNy=g<(x{Yx?8*&%hO=3nI$k6!|%$`@yq0TBgFA} z{`W}o9O5e`>}`;5WddUe`7QwX;U;;{n_wI3q^7&{JoI_w%DXU&ecolr zLQ3iKK$iKfrKg9--lQRPNIUE>E;oG9DC~#%h)F1hnQenyi2G5NmBhez5-x4f?*sdp ze2^Jof@cd0!*w3tfz<@_6TKjQaTenR9p9IYd-`khO>`S>Po}r;VVwNU&|d37zjytT zvf@{tD`B>{PWcYeD?Khv<9bH$1?h4t>?U7$5 zJab&Zl6S&2RW~V}r4x=nehm92c=FP=IWUB(I;~hx1bgBcHGDw!@iXKSD!C49&$j|0 zw#`-NFw9u*P9qa1Svn!aHO;pn)#E8rTZ71qTA{4pGX%_zrc>zFS^r9C?wn)^I_|>$ zj!CmF8$BsaEsqig_pcFvG^0T$N9f6pBficuHdML%(vSaoSvOS?^DT8gZn#7pX3cT+ zvPuOW6`%g%BO#4O?>z!^7&M=|K>RVk!Pfr6B=9XTg{ZFd$OWK2EJcuQl{uX}w+Tvr zp2KFd>=-JIN6j~e+F?Vpt!JHNh4`niZVYOG!Xtf_X(R&*v{d?` zkr5>&2BHayQu|?Wm9^cv4sguKd>SWU`%Se$V;W-|33ER z5AJng3Y3^ckqUy_(X3iykJ0G$LX3Fb$k^@6NMW@IHw~hWl1BB2MP6d;EbbVHSq4wraC#$pPAcKQgs@TCFN}uE&U19H2|Uh$HfzupwFESSrfiq@^cbK zz7dD~!T{+sLRFLDACw2Wu-Qw@k(MNBGf)GNs)ux~Cm~vKkqtSG!&p`91t^{0%%vsp z9R*L1E71*iVZ!ns?}G3W?!8y2`G~F}CTK}Qt8=4VvtkjESV6@(Jn9f+4fF)8VOyrE z&uRN*>or0eoh*iC7>h&670441^U#WRf`=YMez! 
zX7!jMi(F@mY4$sVV+%M1Mf+V@{fF~=lavSHwI}Z?PT^v_7&PJDo&`no-uEm)}&km37gmBU8@d(1xkF;M18!A-}-a!>7() zH;BaZB!!$Eub*6^FghKTd|~g{SiO@?77E@!gykLYAU|*9D8G#L#piZZRgM?Qv*Fce zM>cRkHtS4-;K8J+ClPi zt#lO5>@ig#pZl%b<^~++fll?H#xl(m`5K^8ZZ++jAl*^ z)sPfeF7Ow-E`OMkf$^(RJzUie)?Bwqk};ZKdF8ZRPckCOC1ye6bLXsv>JEWU-S5b@ zHgxfqBj|pB5-bt{Jkbd(!RCFkpC#?VW+^wV$xdJ_o%vax~MFb=2v~NCA zl{kY%IR3YBb4vXo^!b&NPZBR`#|bq_Y?7nPE#1iNGnm&ccP}@5M~MvXEpNNFw1Xo+ zIV6rUk-Qls>kNGFU@^__E4cE1dLbK?G{fN;kolKRe|^58@$fHq(Jt1S(PY^e z%j;qPsac%kFLv8PE7iacw3(d3 zTJRK^gcseUibhYN!u79xrR@ndhS|7bj~1KrSg+UgVbhiOls;N5#42bv`9O?@l6d2Z za+gyr#ftgU=*0wU$(7tQa^iS6u^KCYdJ zkK8RchfFT}cBAHaS(_>Cg2fE{OLJQ!;o`uKe|2P(wGqu*S*Z2H7TZ!}Rp>6}9SReI zg^K2T35_&*Tw9V4(}X#|{{#C*LM@bQ%lGW8->evIH9k{eBw665q^KC>mO_y2)9<}r z7(_x^s=K=vJB9@ValDh3C`>R9rnK|G8M;xWguQ-J2GrBa`DK3$UqXHUg{9VgRc0%D zU0n1g)>!!l9X}<4J(MXlL+VLfFW~Y3hC6Eos*ygeIr(VcEOK;8r%)8HpVH%bl8<#O z@$C?y56=^T0K4jR3vPL5h=VBqaFF_o(CsCnpQ8MXy}EK6*t2N2Vano8^Fb6s{xQgm z*BRxivt!P9eV-}cNeJG&zlcTUkscJg*5qod7+bCK3r$?|jC9*bB=|O1jl?Wa^pn|P zT_EKxL|V*V{}F^ly}RY5)F$^S;AP5s{ljO`Y+|O-`(B#5`0N}zat`TSz9FQiq5ATA zu=W5Ad7JJdreY-q=!2;Lwh=_j#h9>q(0%38qe!^m2O;X50k~hzmLx00>V*gTj4TN+ z7FdRk5F(SqqDP8z2Z@SGY5bjdNrk!b=TG^HmWBdppw;w|Ocmm=((L2fk`~hWVU6`=6UQ!i6()YS;&ImVnpa0V3zp}}^ zRS}7EhGy>hX2xBTZ{g!051}m%#!De(Ru7R0Dqb93|JmGXYCV=`IG8d3g<8H^-~fK}$Gu3g5}lCaWpdgN1&cHY6CYE_ zb@9>hhO*6i^&JW-UTa8I}d!A(m61kclNUin!4RGhZBYm&s zbmX7RKVacU1v?Y0Pua*SGx;aGNcm(XZ<)?}#{x6LUv~q}5li%g9GmcaY$N~Nk`bR* zviwYhQX}t~b{F-e&5vqSMl+n9W1zsH?|R)BWb2@NU|AW0wt@-izfH=!lpbmT3IqR` z>M@?ANPC{>2K_(^*89v#&BK-5k847;d~aVLB=uuN^AiZw=gIDP1Sox>+TK!a)ta=< z_z|Be{jY)XCEF55T{?D_$bKp<&&bCg?IrcA2OGxF3xs;+%}X^UaRSP=ebC+dT0w6# z*M&!@tLvr)(KbEkP?p-`i*3F$>-BXkQDJ`AEzUHs zTeA`czghmZ&RXjtb}nA!PfqfyqguR=i`Tw;RgfXKveC{s15Ua9%9q7>y_Q$JF;OA6 zKX~8husF0pxTzsAfI@r#6A@T(!uNT4UmiDAyQJ-U>PH=YP`5iV6xYN}i?OpN2gl;Z zF#UJ^S?b4|&Iju^tTYU`wHqyFvGzukxW?C3GpuyjEa%4zIjN*7DOH$jw+)t=mXi5HtK-Biyx`;VGBN|62H(44L@OeJ~<@e{h zcc@NR{3<7|8+=P~%>oC2Y-E8PUhnnP!5r*z@MF!H;LO>MC0Hh$)`_3%_G+4+F&*uV zNX1S=CRbB9FZt#U4kEW{%sH*F>fFLD}ba#r9Qi8N}cS|=2(%l`>A>G~G0#eeTbax|N zcOAd`Kl&p3?7h~EF~%IzJ?99XJA9!>9-+vnKUZqxi;8BARAT5!vVEB-}?mAKnfJU%G0v@4c-p z*jA_{PYZn!Ts+-@4$clt>*@WjlB2KUw~R~w&8K(abKkU+RvP**F$I-wk1W9|0Ea)- zuN9|rNdr}YDHnXzF&;VOHbU)=F5KB%OiYa^#@(>a6o<=|lYL zGg#wS6-^JSAil}cYRrB^7rYQZkr+OEjDMwR4q_W~X23{4=AsC1jS{W`j?-c#H$WEU z0)WLwoZJwyj8z}p4~sGP`g1hG8;V8x4r!UW=cA%b=SUsf7fbpZ;DCKBLsV9r^Y|ZS#^vj&tqR&ZQ8o>hibE+s z-U=MXwL4wz&44l1!gUnJl+OxVcDenfz84ECAhC*PR0QOtQkrVDxY*K!*{7U-iR>gq5MG!hDq!f^h)U-?>Krw{DM zZMG*T-8D5`!Y$X55(e80$X){Ls!8%P<;@F0I=Om&ePKaR9Lm?wz3*uF(_|g^G8d}t zSgy2KD3z%lCov*pl7Jv6&KUQQ2735E=mU8M_q|u|730CS^fU|mygt;^kqiY z)xV_9!YA@9?`iD(X8%mhK88ZY=a_=+$=p2$8KXZR@L{v0#|zbDaZp&kw7&}JVq+oA zim-6Q^7A|R4f9K2mLyWl85)k7jU*xGcqx+5neG9B%^}jgG*4d7^p8e2`?6(~`3dLt zC`(0P$5z2FE~7k6x1LLwCa{a=5)Ft()#W z)Gq15`5w7;9hQJF{*$>u5C$$@eun{=%>Mq(vdrB3>(saSOD*VuD)6%M;!fc*#P1hD zOPcw9mRQ4`odoSSw()@SF7JDial-BehqBwq1i;A9TP_>h)3R0s7M}_Aq9t%|OI8_#xCT>gz=9+|eUNi>oEBEF>|+EV7U3-S5x zvM-%9ptfF-E`HM_*ltbzdNfJ?`*TlrxQs3FlnK+z9Axb2dMxEQJU5T9kq ze6+tUSDbDXs5Cg*Yj^qG_H1p=-LWz>B<#IxS-Dju^fHmd)sftN4vjvxr4WE7(_j&+ z&jIpF-($pwpaef6D9p=|lE?G#>krgrKX7wVch%~K&ioB|)DAGb6zgn;4Y;kluCm5g zoC_y@{18F5?NlAkLK~^cDNqIdi1F<7csJ${Qb5AUz_p7S&_2DIBZ-H=p21&Yv3-Izlve-Fb~729v~o;-QN~EBkgQMD_YO zuGxuPROL&^{W~rY8wNMw`~|NIK}N*;i7fDVdeZoFFvtXlyYSsh9$TlPg_{2`^j7tx0UbMwt0V4ZqN;o+b!%}Jl~ujkMhE-PtlqGJ&RgCU9aR0n%MzjftbwK!#YsGRAj0$c--c* zpJ;-l8H28C86vEp;Z0BSE_A-+4;=*u4Jy&ve=(h+8<&!db*)FP%<9IWd#|q^E`9ho zjlbs-lgxnd+N;EPa48h=_XYiz72;kFJh+~_<+j*K#w5g>#-=A(_A_F_%Bfoujn><$ 
z(L%gJNl;d`7Y4t_=Jwco*IufaV?h0(-?2tq#4M?ZjJGr+3U{%EJnP}AKK@mxnahy| zQsmEsIK_Fw$tQts01WmxD*=Z7y@$v)~jM?oln zWI>4v+wv`9XYD1eq2RxpN0y;*ly_gy0>{0o^HV8G+!v>PUu0AReF4IpZPk(>E*%`0 ziIP<4d}?dJkh}=Cl>+4yHb(Wi?q}xU!4?WPYPJqJm6>!Js4Y$y`KlM;a6W7uhyr_6 zM;SRq{vfVJu$^#v9Yz~D&1Cdm%15vC@=dHzjddm*@mimc>*Q*)_)dheb1xT_Chg1p zYWYB1f|JH=^&3)=JrF?cCo1rAICj{{znc5AJu0PB$zV9!^z=zG;-_)pB0Ycs6x2Bu z-MOQnu;T^{p`k>J12PR?F?P#5I^d^haYy_S&uKd*$_Vrr!JM0-C5$$^ysFS^MB5s* zg!F=gMT}DQyyQB5{4JzBw|o0&9E7p&f-wAqTuICv(>&sZ# z{EF+%o*x&r0kz1R9^9F9g%^3Yl|)|xejK`1s(ziB(ZW9dsJR`DN9Nh&rC~ae{Ba)6 zQ~x{4y@2T~P>l;2fsI-Atv`1vV|PWevH+GykI&oJWN$feR=FLBMVq&Eq{fbiXf@q- z;YvUKUEfBgVC}NHs0@M>ot)+c)XZt2a=AQ&Kta=KRjH4II-5E5*##Y*Y5)YAHRSue zhU^sP)$w`>EIVuQ0Fss+9vX~L&8ItZq$r6hozx7R6@_CrM0DTBOOzUm-hp_<(Oinj z0Z^L~g9ITYk7)fJGjk}ZVB*?^3&i!1BZ)L{|tloVB zgObIoG!06}fCuJ;soS}JZYkk-fR!>QQ4O09VU#j{ka^+4L8&m-@da#Rg9Ku!=mWR- z2Aiap_wi_Z^lX)Tqaze`91CblB2LjMJa%Rgmjhj(!!QBu$Es5o9l!&|y6rPK#HN-R z=A?Y^Ks3pVom0JEZ~DOF7|GP^8492YoNFyIs4YU=-LIm=$__cTn0hu)da~aKhkLyr zYWVJt3$dkfqd;gmm50;ej44wg6iA#(vLfDhiwO=^vFPA3ZVK%UlDn`WmMmsh(;k0T z*YOmR%@}O`pi+V$*qtf0K#HZh{xlYxBSb##`+GcAP=PArtqb}|9caZRr}{y|_<~}# z%`)(3l3uEwYDKi>AS8k6am-(`7P)DP$fj^Rihv4G7!=YPVQ5}tuUl8R$*&*J-2OPm zAFoQTBQR7W5Zs-wu%yMp@0@SJ4QkdNz3nCuF}#ZJ#-23%Vk^f2Wh_)B9^8v9HAhTs&1s>(vucl8cP3C~hkA?1;or zFXBO_3O=7apJ5y}8nMY@Q+858`$FE5v^L*;r3wO>n4Yy8NjWi|zi zQAhXBSV{*VNd#IDqsPLO3b_L4yrG5jW3(r?@Q1Y;Petd!uQKz%DxN#(11d?Xd&bLL zAF0F&fuA0fm{YdRsX{#Z{l7u22Tf7cdW`(%LYFxo)CpWLU5qZ1zUuo z)0#i<#~e~%jzrX9-93E?Fm=DtM9i;3Sdn`z&rW9TeX&X6y6!|0ClXnzMJR)Grcb>F zO*MSbQQg_$dz6np@6R>y`X}|W$MeOpW8>S;2gFW&sf*9^2|oYR0^F26?lwK3q4XOR z^)W5>P#N@yD%5P$`V%?_yUg8+R^XBs0o6>N(P8|o^$3(oAYk|$>`V_D!hA{+8}-aR znY*QR$0fv5vgaJE6?X!MZ>isQ0QEYvN1=+~1XP+b=?BWr`)8|IF3W?AOtlIdYbIt9 zjW?7ldOxq6C1%+U1SlQ?B$(3Qr3>-=NIQ9E*@BwRcSOAG>a2|%J+HL<5O*?4C_3YJ zK@x&ydWNGgMO{Rt8jx_KEo3$XoNFm>4@8$_Wy_voL<3V+hv@)}8JI0ng-?PYP%I>$Tb0V~s;M=yC1_A7nq4iX(ak zM=vrAz9OZV6vs|%rxP2^{D$3!+T!vMO*A?R-|Wy zLpBUDLk9o|5s@KWlqL}CD(+%gA}4W4Yg(L_Iv_*DJ_%$K9V`_(a0$w2X24`R=PN5T;dc5l*qz=YiViuBL(Fk@L7MjS5tGjn#OQUUMQ!*@7U zOew24fC-Z>o5~glg`IQOQ#zxpIQkYN#eBUBqZ7t}+8RSyNYGuxh(#3njh20;xo}Vr z!L^m4W)@YawRKJ)Fq#e0?xUEKLhr}gMkThV&#tjw(<-sG&d)LQ$#}XOXx%DYu*_?i zsIj5auJZ_I)Hd=9-Ud?7W=dk+#z+)#x%;QaKe|Z78t3JufMF-Pw3ok)`OaF|=4}a2 zx;q>IzrlD-68lW{nH?`QmINynr2-dPVa@AAF(5n(vGQ~G3M+m$c62gTpK#EpVcKA6 z0?r(qbDrfo=hixp2EDLKg3;+Nt`m-_hqVRbC;J{zc?E4%nm(?TPe;=(2ImA=@KG)k z&_i7F)^%YmrHb8mVbH(s{)D0BpEs%vVWpwvk$SBU6nCUdYJ6fOZWCju++*IbCD4Dq z6=npFPqyUi6QvwQC}K^-xsh0^#C_Y;a94fy_Ty#q%RhA&N^>i+B{<;{q-?=aHxnA6 zza*<*wHoM#WKt|Qn=Hsvl?ejc4RWa8EfJnrfJ7s*Dh?)aPEJZq99~QQ;jSkwf)Q5t(5fI$J!e%+5gOmES^903SYk!a7dK0Z1 z7yMi;xm;FGTjuBs&KVe#LV+j(lGSI0O=R)YZ5~(CWm=;KyWIj;>Fq*!D+nb%5$#xe zyNpS+;8C zZj2>sB2{!2Vbn?}xO=%9SkJ#AV*4|RM`|6|w7!e?UBbHvBzuEgBskA7phGFO!rRYB z7GL|X2l#xu+@^rd+9ZjA|AxY&YlAS*@8X)iVcdiOxh+evp{(5hqitbn7MWVLVCt1M z*XOFX(Ygfh+rr5sE}CZDmp7~Js_1O+l?ltH`0*GN;=#K>)|ruDmm+bEv-ujXHl1&Y z?D;@%C2`2RQ}O9E(6cx&bGIZ<@JtOw!C6I;^C9o+q-bD8)Rp%o-}#Wu`FPF&7Qb-U zLB-kOeFpsNxaFatO^8$MWz0D_e}=c*N2Bf$14S=m`h>U2v0?)(oWzcvJUuRpi`g%( z#tZBPr!8a4#P|K+U65G69pmmT^WSwQS4hg5FRE?1A+9QI=;|`!u4&?HPP|^CV!QR~ z9i!k_*1abr;&;9ggs1x*&2k}VRDzZnTbR_zGQ7+~QGdHF=*^swfK3-avRxsWF+pE- zn)U@jfZSEr83i=NA^pw;NlKqQMi)@4??!v)vGXMS-;p)7({{#yo&pm%vLvHpuV$(? z_LhnF2(Un1@=h~0En0t}dY(s`3PQ>7ILl|Zc9E1cskwN|v)wL+q~~F56WByOp=h&D z^uKuXDeDFH-yW6}C}!7%#mCj?S*fkeHaY>XjhN}JwB0ht;-Xdt`meycC{Lx8!O3aY zP(-dpfSkC~r9E|svhI6x#7O(2CQ@i6bNV~&Qdyz5b8>2)KM#xXXL0;s%5qgUpk^s! 
zEBJMFlerPz@J9Z^msg|1t|y6f=TZ=LWSE#k$Vf*MrcMem%cxK$fHGwu5O=0wzv(F`?;*s|C@)V>w!F zG0vR#w`K;T@pI#=D)Ity>FeESYX>-|F>(2}q>-DX+uh*{L+Pa%aU zSj_qkNdJP=GQSbsimQAC^!BB&=dBWyauCru8>_u%Q{E&-+#T(^1T;e8?+Y zWNVY2ot>u9BI~f#GPS-$jRNj*16xS@1ad0fD1YC8yvKKOef&-~=; zR1gWCLH-R0cap>18RZKMFgvc5p@~?C-e_VjsVh@P12Wc+hklqna;IP)gdWoy?VvDi zvN%I|g&(z;XCWqkmSA_G zT#6#?KA3VCdDw)J>834LdTOXZ1J8jEXNf5nxS_9gaU<(z9xz9h zc|5U~y<{q9-v|hbZ_9M5K24oB7NH4gJ@y*_1E01x^3iELF24iC?>|&u`L2S0cM30o ziHsJ#&Am?S+x6kDXT2WD_bh-4rhn{?E`slvZ7K{iOdsHv!F_mL)Z~qyu{~{#9ZR4D zA8mx;Gl`x_iJOHd7|dzKOCc^$nj|MTJ$(Ekb9jwS?{@bXD2UxY@qiPvl68-zxb8GD z9+^OjkLu``!L_Feq~IB@T8aEF8yq!%m*i4t18N#8bg$af7&mO_a$~vya-h#Qi`I3j z)sfc;Pss_EGi=$UQKD2k?jIOWbi3eG| z_X3Puvo6$)IN0~{L3~~wVI_)dk_@D? z`q*MTqu2UibaRNd^i8!#)?MP}dKWyr9L<&OUVpx6)lRQ8?!5PAUHW}V^`-Y&%X?zJ zg-aG>^-~d3&_9RfchaHU>*m~JO|CnQ6>*#HvP@(paOb{cB|kL=Y3)4IyID=PucMvS z>UvUqAmy`DxW-K&n3sbC;vv^%rzcw-79sgQwaCPgam)PM(*Y9{q9dt2*56I0!H%Vs z-#|N*j28?_drdv$q_qvOF|&y6&)K);h9WJK{jOM482Ofsiyp2iT==JhyX&}T@z;_j zUQm9_t68{!kKe<2C?%Q>2A5`ZOm|W43?uYpg+EoOr}5r~fJ*N8gLF6g&|trbtI3R) zs~PkhDO|XSQB%k?9+yKh*p;mh1g9@xF){9FlZJH~`ikijZ%y_k~CLKws9I{pw)A$yBz<@t+PEATJ7*@d1TglZR z?iU)O2h@(!;~kP$Q5)JLkvU`7+W6>m!;Q%@p#;hj8+zb5qtu@uBwwaf4cTt=(76>J zG4dH^PN&rnc_Pag#0u}cOaA0b3A}Ys2b$LS4NGg9&BuHjU@LpG)mSm*5fGB5TM6B3 zdnb*n&>TKIMAdthT5{E3jq&P63~Go)T!2L#Pd@7gT0*_J9rPa4DL{*%1$Z3GMzgk6 z%xx#{RBq#{m;Qd?-!&dFwB*S#SJGJ1g(%Qj-tNHd_*BiRWt7J7-ddFvA0t^_d2w{ay zB2MZNr#i7dV7GByzQlZgp!Q{qkn3xd4wqB`%LJHEqJRqQ0**%kfzWVBq%QW3DiEnB zA7tcU=N~Z()cSQSd@Jp)G4Xl}jLY7zyFPQwy$7k#0UV8OoqzWda-$FqK`f@yt4oG? zuS)V#qj?9xR`4IF7NE3i$Xv=;CjN5uw?DU;IsO!8dShQq4; z109y`ttOiwMTf^oMUNRX|1mNZxkhW0SFj)i)8fFxDBg5^Mt%>J8W6(mQ}v@~Is|{3 z6BhFSJ7&Bo@HlH!`x2#eocV=9iolkp1tK-z)`xy~@W-K`bxY3d5EyNswh%CAGEsO& z5OE8A=C2|4xPsl@jIJEH0IC}K1Y)gjPii50kFx~!lPu(T%Qo)o?Paq?EOS~e;J8DL z0QPo6X7~aU6kjny@1#tsgjU6Uys5!T&V+x7CP$yt(JqC;%l6x7OSi?CtgYiu1XI&z zyo7+7TdZJtJ`uyqZ33j?0%t8SS&g=$^5Qe`4~^yEOc$syOYg5HrMLUsfI zMa>so0xhoi>qDB(>rOf8eQe)htHoP$%f?rEbQN$V0nvS2XzMBg@`;OVVK5Y!EW|h3 z0A^H@<^VIDx_rq>Jv;~$V0svCSyebC+p&!!EGUyJvej84yW>~BhFvPx+;Vzj_&qt@ zP*iD_<32!Nf%VyX&P_J1lRW(&^nm$KSgJyhPtpsc1Ld)X$P`(nNOUqK@Z4QKKZ9eV z(S9q>w__<`CLZ348E%sJLhPc@_Q?`_kUBS>dy4o@!;ad>(=zrJ7|UY9BRDJ#V+e0hC+8m2hVjwS?DyUnX=3JpHO>C%wk zF-YctkH!QMs^@R&>UHvdj*lRkWelr*5&yV;X%QAI;gbED3~8Cu<%<|@CTfr0O^3mUX5Yb6=4jR~LkJIh?1kX*sN*Twe3$if}OO0+Bj z=p0Xy)@xX2#d!PH?bSfYxg;SIvRqUW#aO@^*y@foU2it8&D$MfMG-Tk>XAeE50AA2 zs;MV*Wh9@p!7ebr099Gp)pjtNDIj!%xOY&*7<9KV*Z$n}^AXA5wTN{$ac`dO8)2Pq z=yU6}G~xhA(YPcSWya>;=P+h-*nBIgy%LBB?npFr21LsTqK=xucGp7Rx-J3e5UEFb zchQYX3y#J<4Tox8j71Imt2#8YKokRZWB%iS4sf$)0Ry5T0?350({eI+z zuY6?Px&sZ43|#q-`WUcpdaVzeM34>$jyy zzAP#)<2&uIQR&lbfz`#E^-*)L;nED;z~L7#Ln@0hgVXkv{t{q;6suATWIzFd>znVt zg!SX#fG??1r<41@ZEvAf**~_+$+1^ZnWXvJ%_3AiqP4m+UT@HpCg<$1L*8L z05O#BUU5q=SJy+1s!CHy1of&)zVjRMNc6z>v#KZz-c$XV=5LWiid}b*cIKWj+T-x0 zib_(a77xf6!1P!*n3GJNOc6m2_yki+4RYrvIqlYmgZ&HZ{?!lXT4>SlwjA}}RU-HA z?_){ZW3FrdoY_qFlCh;Hot%1d6|p!b#&sa%%NkA938oL^X%Xb^PG@QK5J!pnrL)^Z zkKvk10hg3gH(o~`JTPyud9F35@m0*s#kKf?Ll756m$r6;w(%=TK%BAlHIJYfm?i~x zm=s45W))dZ-%$n0XHHFDyYnG*Ig4nXQg4Fj5vZf(E^7*df_JBrE6Pv>TCGMMb~3U+J|WZ!NG zPoCp1Nm{Rs=5rNRg}j`bJiNhg%bDgk-tWpL1)i5bF7^L;y>^_!0xeHr(|0BHsDmT{ z9_FVYS5h~vl}Msl-aqTm#AZi+-5dnum&3+J%Mk|g7}X4<^HTlStrP{`6i?%k#KHWk zRz=$=r3GB%bX=#8?C;gU%AL9VwkZpKjvjTM3mAF6YPA?*Z*g{S49ikkOOLZm9Tr~> zg1L7|#Wkmzn$x*y@r!4@f8h;jPL@sl4@3nWfi84<0#~O;5|yO@okjs1W?Ee>-?LJc zXDdWT(Z{DUqRFK4lPr|V`@Z8AmSpq*PWNf3y(va=%3}y$jDlm63P)@$(Sj z6qzP$mIu)7zWc8GdlO~c37=E;maSamV%`bD5_Vl z*03={li8f;0>!9+oZ@_hp#fK+PC}&VloF!?7N`#~|Mm(CiT<&FEfz&cV5TKxb=#(z 
z+8{6xf%hw|HxaA;>vy8D62))rJ9EVR7ZT{lVtEkp|1T=&Q)uF3W{9N8ZGCbh8ZG5u5R(H96m+I+Tuyi$)-0M^@MZIe1AWXaIM0Wgf;I`(}DG{^`P2!2l=4q?R zeZoeye^m{8jG1lcYIWk<@oS*OpEmFA6d#_34JTU5Mb=Cy@5WPA4UBzl-0{Q&Li9E6 z?=zaeCfjc`BxXy6-7;57uBGQM`4g>m^xQBp%3zkO%W+L zhoPHxn--lN`&FQlh`Sa`-*dx2FaL)*&kwwTISnm0#7_97Yq;5@G~n<9L| z$LUy|M>rtN&GFkvUSPcPDEYBDIy|s^S-(xh?2-I%Hjfn1NBaz>zCyVU@_XJyH99}^ zUDMQD64$LewE`m9@6$)KWmTi)L?gjCAU_&xJAc|IC+!GZg?ZLgk>Q@^2mK=kiB3XX zuO^0~qol}p6BxhYMK{}nUW?M|JiOMRq$EwRpsOLZa{G2z`H8{Hcf2+w*g6_w8o{q=r_-h<3Sn&%M$ zs})PmJmq>sP5}=bYymq!UzpmbOEdAqH}}i(oDHzOQ)lrDbTKRHa+k;;E>xhkOT>Ky>NA))eNtSoA`Hh4bKJz(t0Ha z*CXAE!CR(bN{v4JBjfegeToAQ(XKkd{X;f|xzY{YxV3v3DAoDu^y?NbLZ)3R+r64! zVfjduEouvs&-?{^L$?s98(A!(Q8Suin~bfs2FS7&*#m=T==qd`mkH(`+1QMd3uMNC zB}2QqoeVRHJWYz$wwaDeq%}51$0;Qa7-ADr%ZIwD!Nd!fd-?&9x&9!$W)!1W*V-t%J?N z)R5Y8LpY%dGhEKvQ4?6Gf*!F-(ZQ?Gp>CuMZ+&R5hm29LGL&nzm-!o8hxeEh5Jc0L z5{+@Xgox>Y@dcZ_=D-}lC)jNM#0Y;s8wOe7DzyKbM@*q&-G*`?5?iN6fAZY5oZZ?x z3jdwRg%(TwuM{j7djw1kPJA$A~gbXw713x8zRpq5Q!T}q4h&#{{J8Y0Ih1O&D%K#Z4Zdv#s z&UVRcv8mkX^xWBN*i!NRj)WSyzw4LjCR58MtW^b+H?igFf}f@*x-qLM{+XK80)fQX zc9mj{ORg?ZRD&A)ula?f+VAHBTkl+)YJny7{kKN>KHS3=D}O+#sFQSp$-0TJX6tBhpl?eA)b z{ssppJgp0G@edQ`-H_pLkfZLV_EjoAs?%E}2;U8zJkL^L=@Hn$>M)}xxGDuJNSSR# z9aJikMgj9Gt>gMQJD@Wu^T_Xi!ipS#sE+!vzPyI~FJ4u$n?xiu40J6u(lr|llcWg4 zD`J)(93?bAES$LE*KLi;Ou3)5=)RPwoBMsp-zoNK#(vBXJW!4D|96%x{f+ffz}N@P zfVwsUM4G3kXp`n^p+E&lFGBMw+co3saRx`akPsuXDeBI5*w{aHI|zNs*U2EW=z=Z~ zL)GvIA@(;JMKGPQUGoAi-2HRaL*!TXihI+c9I%e2)2%k)4(iW@pDRzT3xIl%T1e?dV$>^- z^}j@srV&D(F^3Nsyh(fcGlcQaOSFlbKHkis!6^?Dutar>42)a**HfBBL?I!8vY)x{ zaTrv8@%{WN*z!uZC6_?a;}ykk!IZ%%2imuAX7^aIOcoks#eWAE%*HW9_0iy@PntTZ z4=T2J(>yKrykj+Zdc3Clo&0cT!AYbq@R-+~HNE91bCD;S2 z1YAa7kmXlT>8;m5`ERcxfR8Zh+!6p8P94=JZHddy>k_Mqdd%p3)jGwFM>gJ^Z}~x*W%WfVQ_8zA z#K49&yYIM@Uu1}`*EmT&wri-dw65HybF~q`$cMmMia(t#k=T`XpAXE3r8r?{u;x+@}G>gjkbw|Qk3x#*`rAd%Z? zE2{%#L@GeD$ux!|Jd5lbfqK;*am_ONbL9bYcr$GVAN&Oo zbg#pJu!o*p=>C_?oCFYwVPk5hu|>v^pZ$7Y_5eOYD2A1clfI&7=4@f-i3^lVh7%?qP=^3O^^7C^KDL?EKz3ZUiJ*0W;` zoH3x5s!rbfwuX$a;2rcdYAgK>@pFabF{}^zca3ZEEnzC0MO|6)?Dkpf)4S$I6q0p? 
zh2Km9THQNEIieJ-i$7y~CG0&K8~5%XMJ;CB(}pr!+#A5D4IX^N>hYT(H3Rn zf0tV)&;NMM4ac-3ekPUNAmk;ggvn85y)dvS*=E0{kpMd-AqHTulSh7omElM0t)4r;e(bd+p{il*v>l*nx#jG+El-u z{8~K6iRQuITS~V-q2ILpYvcj$5lRX58u{3VMO1z8gJ$~>xKBMSEHBe zIoskhp?Tl;Hw`0+Y^S^?m9*}-t?87#Fdfw-zDWq&2$Dy{+vsl2oeAE*6P25C|HWR; z<01s;dHbD!+{-2z+6{(|1?OMoRhZ*%NH`gOF0zV;-6z?12Fe4_hF*V&K-cTn-i7$z znkgK46{p2-WcHwg`eND#wvKw%X{yndZ6bqnjQ6zqvmT&*ZKqRuj}yejkM}-dbJrxs zD6eCiynjg`A%6aQq{DzbbdR zd?M8_;9vljPg-EzUB?UDH7b>JlQwl9s8*NU!PZ8OGr(Uao@ckyQAIf1t2bDQIC`n7 z$Z*U+s`J>2e~M>y0U;|35hKbil#0A_UJz>-40UsD7TeOzKYj9gluPHc{SmBhMJI>H z_Z1K9EfD&bQY16~9pT@bN<=I+y}goN;Ox-*%}oZ>FmCNVyEXnjni>sJdil%!DgVWE zkCDCS2L_@qv}j*EIrXz#6WVMrGW;R1`4y$I@~9!k-sknMH8~=kgdf$+tLg%NaG-tr3orE`-1M@(}Anr9sz=6k@v&!$tCqwN&ER$wB zA9JaHL*Nt-qR*#`sMp)yeX>(^J!|T(1eIC27U}xb^u+mR_?Rhxr``JzaVVtuiTw{L zRdFm#oN%BdWL{n0>=N<^<0fY(E96-H+WFmPKKgBMUc--Z?&6;N+Jge{Hvtv2^z;m; z3lO9=6{2i!J!TFLPNC4CXwX{u2mh%4iVbBMhj4mMaC9b3V@J<@xf9 z>(Vg7B^g6~N8dH_DPWvo5EoHPh|~G?v@ivntQ;i^x{IP&gD+)e6i_=u#tXM+vGlx z%7DR+|NkTVZOtGKm>__cA%KGq6TUJ3df@Uv98ESRn6&BLwguGLS&P#2c3N@ON5`e} z{rlHj6>=y{7pktCvr;Yp8dVo<<9_5af2ZRg~n#SDf*r!0ICKnX^=|4J6H0Z*r?dz@U z3Wk>-N}>&4R-eW$!T!YA_)Pjj<;+(rA3XqvO9*Ns;}DR@vAAcd#itqsWT7L@QYHL2 zYfmS7Io~ZTA(~&?X$>X=qZ2pkCm&{|MkGcwQ4eFb2~wBHc~aJT^ISHK7)nf`n6i}N)Rtnf&-jcgEhqV^B@c7M1fT5XAb-Z0nz)387!^@4s21rd!4& zR4TWE)f&-Z5Lw|d-7FRF>$z(O!X)u9q$AQQOf<^Z*m|BBMzoXAuxsr9!ihz(VZ^!C zZcU(B!iYz5SP3^X#W1+9hk#5`2uOf=`(8S3){xhLI&=L_v}p_2x8kZwVy@BtHtG$| ze3yw(&XgAGe!^53Qx$aWcAPW}Tx*RuJ6r<{OWa4MeS59!Jy;r@T=R0lZpjz28najO zn9#-pV_8NB`SQ%&j)r*jXo(?X5;jli@m$v6LMi?*@Aa=Ig~pZnBhPz@#FEE9e)W;2VBF!%K!#KzMuhc{)fF7sk$VcVp(lEWsftjzF-?|z0@>8 zolWJpAj>8Z*&dyD`}26UgRKk#u}DysB05t8xdLChvOJ%IgQXfeSpO-kVmF8lnzaX?w`b|? zmwa9u4Yo~G;Ns$xrsMT-x^!`=wtbDUDB6=j7?KG{@YAnG>EBq(%bEOR6&hft88s&f zkQ9Z!kUFB@$-OJ1TgF4wq_LpsW7#S6~T7ghVM`(Kq2HVUOQ+eFhuZ^4f>#1eRX=P%atZ_t~ z1?3<3fQl3JFEsT^C~U)6FYtm5ZiOe=&D;E2WR>h*3bG08_~uGDi)IkPB*v zeO$$<|s%K)`LYn|l^XJn3*KY$Bqtrd__NM++6WYZQd`Z9Ue?zQn9KB%Re^vkEjY zX<%(UQX5(oM|o$AV5rt~98S`Ajyez2{q`F{ zx}X(_!dI9+Jy(SMX=54|qp#CMhEh7a0J1>(|G)!$8&|+%;LZsn6Jw}K24!B)#J%(1 zB7Hu$>5;R0xkQ5w#{$dhylrQJxZ_xl^8LV<1nYQg<4z%ZzuIl8Tlv7CYT25c&Mc}5 z(+AzfthmVWsxgv?>+b~UMnLhPF9vhofNE8d?Mj^UEzK(!Tjx`e3dS|YI1b3zkW?+1 z_5Zn{`R_b%e~4gT4)(sy2tA+kb|^&Idf{=_B2C0MCs>W1iVFkS(s-dBZV-fdmdEyS z_9t$~N2N$flD-w*TB-ivbRl2>UI^$J1Q;5!{Qkp>Jaga{2m`dkKoiD7`}%WkKk}oo zt=ITn1td%YubMD$=>ig_A!>|y)4TgBy$X@1jINXMsHn<9gkGs-jVK|v15nA2HLc+6BVga2XAV!!t8d`K|`sK5@4gL=2MxnlOj z^Hgm=>u?4PL=tmDbwwA7Y_caHz2Wr=0z)MxkPc(2RF0MT1BbPHh0}o`2n1oRAP_*t z=@$h-Ab^Y)^pKHd2UW$nsRK`mqm6bv?U?JirYE(bnobQmtbSeNd2^U-*ClG%&9ulj zMvL7fT2W0)yj~LXEO2BhQ?#cf2ox2h{l6WpcpM~41U`KJmA7wPeEuB+A}<|IYWV=kxh|kN^L8j*h42IG*EuU*kMq=j(i(=XG7t zpZta%+Q+z^nSq{&fM4cAN+9fF6W2CuiYrS)J?!ED4#khuT~YU^3l}t3L$~jbi?SVk zQLrfKqoiMvz5nQrhjQrt^84p>BPCCr$~QV9`00lyj<$|XmUW*`-RKamt#NFA^O^W@ zZiSI!kySG}dE@EluXRH5?y(X3hWUu&EZQ0lUs(D_XIVa&&%ne`Y+-4qRem9G2=5B7 zfs;vEC1G_j;d3owY@e@E6t_x(t-WtvxRLzp_EiqMS?rg$u9;1e`V*?iwz#i9`R&rN zdw8Wi(i%re)JDx%E#6C1sd`Cs_s4CKxKG>)c=$;hOLkvv&DFolezL@$`r3bUqW5U@ z)fA0ljs0bsaW<5cX8}*zkf8GWCll$uP_mfQe|TO88iJOFC@Eropm`=?mBBodx!qy=l0{SCaUx(hTxj)j_LjFa!7)@M&1c`T+azvfnz4fL{IhprlFbqQ0 zh=|1c#@U@SM!L(BFR1f)DTi1AGYal87aS9d!1~>`*|bnPh_t~-lx6rguInp zYW6JGR(+Y0(>8WWk;WHn-V#;DmR9E_uU3bfPoR1mKGeue=4mE886|DXEIse1?4cd^ z?a~ugY^9}n_9s>!GEK$|m?)GU$1sB5DvDo$n*rQ4npIM}b~P>IK6dYGT~yg~)?itl z9;nf9c(608GmaN7Dj=HAD_&4wM(p;AMj`Lu)48n=?jflSTw;A=Ot_@Mi7y^}_2kI; z#S>(i3pYs^N(pwo#S_PR`=k0o-3^nGvb=oVNUdhl&Q(V-J2n%Jve_|-eyIv`0&6GN z)g{H^r&&@c^K)lqxbw)cAa=3-JYeHh)R^_iO~_K&nC@q4atu7)ue7hP*_&1;PoiQD 
zX#1h7bo-~|s{8yNyCB)lYuv)4NB>MuQYchDYqM4ew%5d)SR*AklVqJHE;SP;$kl7S zeO;Ze!Oq|Zm*Uj9S7KFpuN}kE@@9Ji`Xh;dCtRqzlY12x8|`S9GCPt*`uc(z6}9ya zZ=PIot$n7e28&JP@vO@P{c)SvSw7uue>)KlQwd(FE3wzH+G03uA4NGfC(d1U|3>&y zOst5Ccx|Jzu-}BcmXz7st~E`N#;!3WyG<(!=Kp>6S!h6Qlh=%f=%1MiEcL+6rYW; zTYjD<^E%Xv*DFA=jydns>CiYUxBbzNhyLSc7SOBhy=HIEC9&Q2%l{zP1;gu#hR+Av+=RgL-km;$`z%$4+M?H z+XC3Qj-&581Y%A*`1Rxo=BWzlDgBWrm`TgxLnTc)ol|m2x>JnHXK_hEXm>$NfyT+M zbuPKj`*q~-0rlqX1Qe<4MenTek8eJ9mls{hV9a66d6Mo27 zj#s>NDXx2T?^yGPC(mBY^mAh_n*W3E{3ZIy#1}S3=3eFTh8fYL%D&}SL;;C;?hi_T zJ$Xi$maf5%a^%9jAu%01*2&|cW#_2O)O_gXN7emna%ajE!%gi@?KX2u+a+NgkUYW1(A)paTxaT;nAsx zxybJMfn(=QDg8dLGJ)?qBTU*mdOmgV(W!&z1DbrFkn_>OR;tyforqgO!yeUqLZ8Cp<|* z?0rQFe6U3*M@6kpE`&sF@2Y+(Tx!&pPL{8eDO^*KskD7OZ}Be*$kY#~>)Ade>kV>L z<+~O0V6&u7fH1cND3tL78VE%TZXtp{V*bO##%VT_Kl5)7R!?-b%jm3YK@1 zO1b$5FKE16bzlT+1p~j^Gr8hTXp|Xb-=I{WcMx#~k6gSPXIWtYtri1A9y=zh!IP&0 z^OtYBdd>ALw5E~oT)gw?^V;0RN8^qkse%z|r`%ece_QoWz0mJZm7qn3;znve=gS(@ z`<&$zUn?eDT&gi0{cc&M3JZ0u)XsShhcK&6JI#pu_eQ03{ZIDgd`l7~(Hc)vvddGc zycWs2jg_o@b(tkgS$vmqRq;HFBwE5>uQ}xP;c<4bcQo56*|mmGx4^ij3%Sw0$(hl|$u3{NTxiLR@|TD7E!AKo7|m*0+Pm24V6{E>Wt;H%(jZ%p#^L$Y}1 z?$e5EJ)HXPGu3>|eD;+~ZZo9R z6L9A~m>Q#~vnti?ZoF|>TAxn0ueaitxoYA(bK1=FwwRiuscxBCrgKkL2|c4_wZC3N0KrUT+Qj>c z-s{S&{C@K(Z2al2$Z>p}z*hoM{QW;6e!ZHQCF0H~`DuJJ#q2q?=fm>r2_pr=8{Brk zH_KSC(Tw*|SzrB5fVxg@(@YATDc_N=Tqmbsw&ha99*e#Ca++ib6`Y$WT6VuF`@@`m zyeqr6UzLX?hw#P@>no1-0ZF_bWi}e`it^n{$@cebW@1Edp4g|@!*UXUZRHdDoLJb5 zV&2wxrr2^S318xWZ1X;Pc0uc++8n8x`TqL+GTokn;_-FnQcJc*hX4%{~Qk()xCz*YG znSC;^T#9`%SHAcul%9dskC|jWgbM=uq%~$2{32qMQ>P)6$OVfvt5NaWXmet4{)=yQ-@rDxa!QS&iVLf>tUwgNiOFazU{aLOW z(iwqHd#f$ub6#;~5@=Ma3BHQos~6#tRwaG+xuy*;A95E~7*^PE|9{h0?Q zR5D)!D7J;NoUs8{-zoTT+~Ab38Y`GGPt#DWapOLBRB%w6WZ5?kyo)r6xaB?X@ja7E zN~j?CH6B?A=3pbXlfM_VK4B|rG}~LK7}BVjlfe&tR}(GJeny!L_R&&vj3c|Z;;>Al z=SDs4{@73&y1`{iA{LS%%Pk4;T3syo`M~?UU0hOw3om}o?eA2j~xy5|-G$Ak(=loL- z9n&6%&Zi6vS&nJNZ|{?H;NmBZK;O>hcYBhAKXHQ<*Wx_WM|#2(E#1;ks%UZ%`$+y) z=dooLcxMa$qFPoc^%E1OoL-uae8J)?87j%s>NzzYEB;%eDCVTT`R#xz=~wuhs3M;g z#Y2j={RRRXKOe~^huiJj%4}5L6^nQb3%z-bedHr?h-v%`NPN4cahm>kqBBeE+x_i2 z?akZm3|P!95~wV1;HboESxy?6K%)I3glmq((V6Q$6Dj(?pR0M)23p-F z#5VQG3Ur*SGR?QoJz$^Qe0fH9E$8Kb38Hn-y_vW!>WVa_`+3$uzjt~<-)`Miv>aDK z%w>3CtW&@-Z9RC>Bcc0(xA~i-hlhT$?RgRUsRdlPDOh!Kgcc3jzjFW%7 z0htT_S9aYU_6j0L#5g}P(W;)?`ua$3lh z0eXnZETb>thHKnR8|eSS|GKD>8N6%Tta45N4N;K+ZrsN=)BPEK5mahWZslsqnk({f&;Q=sVs`8)mCDn(DgIfp78%%J?l6X1oIffjdqW>F$ zD+~p87ltvGBTq?RkzZZ!(l{jWd5K&O;M}&3(PSX#;~TQ433@5Oir{W>kKs?`n>Nco zbfcPx79ExYBJYnL!;hVE>wd8LQL&1Gw0?Ffzb`am|FFjY79PWh%CPhqaSqxmiV%1u zEuxbyH6?24RUFZ}Z5$y#y766mfo*a$!hC{*3M+$?QQpp}POfeBAWIqe51FgyZ*cuG ziwB9ExcCx)A|DSIpZU3(Mq_?aMrp4$;Zu2csonIvr|4ufHZIzlRT3{S&UVs#{avcs zZCv}k(G>la$lqtr8L=KJRyii2BGBYO2!&dmS(R_n+`Eck%$v{_gzrr4^4z$S8c;kZ z@F11!X{LXKRs$(>)>0Fzm-Z(4-E0iV6v68De3MfB?BULF;Lvbuns&PJLfyuH(u``m z-A>o*rj0Ms``@N{89kbn-gi()Bo{P^Dx#Q8nBXZzLk|gAXUoTjL*N|<-`v1r{W-^! 
zqmqAMy(Y3gx2)jvzFW=1a!<0aUeH(MLpml}JO(Zoh23|`Vg=QrfV%N!Q=WSsNw_zJ zen-WqG)8z9b(C3HcO5#94Y1H+Wnk}q3jx{`aP|fdJocpzfJ35|rM_xo5Q8nHnyXvN zcd#^=geJmO+5GtGG12S9!o#E0p9$lHx%Jmtwu=dqdY$|(I0^o0)aN!9!!?p@32<efjn`h-)V_L}s%Md2 zhp$w!;jbr57n}A4QW;UI>_{lEM}n(FN51e`R#kxv(vhdHo|a5EkB?d4llI+yIy2bW z8|V%@H0enGO$pXdbUL&u(Pk-oSh77FA;EtF%or_VO#7QX4iT-Qd5cqfA~JXF`RCq0 z;~9~wpA(hku+i79=BP0JwS&eV3DySObqX8z;@8WGZQ~c|UA*w$+SDk+y#PO@@}<8- zfqOSi>jz~mf9v`McZsHg1?+j7YvF4gH@NChaL8JZFBX(yvP7JI z(Z(ptzrGv*L>8T?2uKBXkVvh#NvvY70`9el&syxuf>voLnI5N#*+Q8pgF&E)(APq% zHjF9xXA19pSZ37Z0WN4AN7tR(=-5H*{aye`A5MF@c-iR)_mvu%`U6zCP2{>>ntpwm z%jh~v)#&7RLaEI{E~vrp)XDWNnR*;;jV-dgJ*%)g6r&Si^%&WmP!vZ@X(&-ZAf1<# z%`bNmKMs+GI;@P?*z}7P`Hw`4O3ub`mL0yB+IwcO9)tlXorBcBSbP}!ki=W=4uTQ* ztO!QT@tyXMpt_c$Lf7$G5TaV@_TUnqKfKHks<97U8YJ>Q*vzGsaW-+ozwMRTVzC9P zcV9f+i^q|rZC*|9r%1wYF&OzgA8#a##ViLdX5$O6-e>;hb^yc3iX}Ywmf{z|PobQ| zT$q?QNH!+QI}?nR^iyj!c5En*(1zxm$06^*U1D$ejfGe{p?CnG`^)mVPw;1^-t{%p z+%G9u-l$Z&A07&+xFn?HGsbZX9&XY>{*qOvlIE`EaDT-w+k+${Th%M=7QCW648lQ+ z<@&3Ecy^Q(Pk(KKO1{(1Y7oW2)34Fw5@jrybz(tZEGSmc=}Eta$QmFvy%7jQxPUm> zTdBn$f%e-8+g5T)7;=gr4Evqbdl=BUD-?|_b;(GXmTqKS$b_I18zTQs8#9B+Zov zc;ft1N^A5Ss%P8VUI%Gvx*vrtHIy^71=kblBNHl$e+|HsGfQtDo3QDUd^QcFs8}=Ni-Ok` z&x;U{W7f=|smEU|Y*I<=A%+;14+yq8KTFr*=Z3euB`zKRgm?AuN|nH>PW?8(Cp9Vn z4-ZTeM0+CNSBd!t6NH%IRk7oH;4NH|AC{Rg-hz!jtmrFS(!@q{Pxoq{se`8w%Q!Wmmh*!~oFag!1Drqtg6o`Koqzzr(UDCAspPr!_F#}iR$q#K_FLK?qn`DD4E9JO7*Kr% zfc8WC;mmM_|z_ler#C;7UMN~f3^j3 z*dheutpNz0ZKN4^@yaFvAm_b7q^_+Q5C81UZ`r#tN*yED6mKT*6sth5NirTQ58N7` zR0U&J^*9WyYaBQeN8gF8Hs03jEL9=hXHKgKRPga>Tn(xG^Nw*ojbCwTo+y25`IfxM zkX*GT&wuf5^O;HrO69E@oB@J=P~C?3q?vo*rE#=HGMH6BTMzAxB#zXyvNV-*{)M;J z+6#3eT76{@N*}UD!%IUe;}mHbSMsK=>gZ5-nVD+QJ@8A+pux(hfbdNM6c-X;D z1`_Z0aOEDcNft3`(HKUlS(V2QssG$a0HkSGdvO>uRi9_JIv0O=&VFod8h`6Sb-ooU z^&PxkLu)%(VRBZyD^A&l7Vmit++m{gU2WBfb@wt6ElFeS8wwi}Jbme`21z!Az2%$| z*935p;}zN0`du&VDWhL>;uxKF8;n{l@e4D@9H5ax^<>8s)lcm`Wk7^HRX1K>5%;8-z&=D*ZnfR zZm-52S4MLkF+O_gsibq<-r>II`*dQODSq8+doods;%KuYA$NZKtl{Ew7W7MU#Q`^b ztDir+R$fkzKb841xYl}^@wW^~(78ttJu{$k*dT4^uqDP|TqV0rNEu2_e}! 
zqI@*k+v4URo;C4_U2OE&7uUwQDy7vRz32LFWQlb0ETgh4WLaz5=30yJen2+v8UAb= zuXWam<$=!^_K389+~qVlrZ2R#pIZK~+ppRH=EcjyW-odFji zD-xmK_aL}Yk1y=)H&VVp+$5#%0jXQiLN#G!Jkhk;W}jC7)37Mzyoy`*1}-W}>k(NG zrexdRqD;UB3ZYp^l59Ws{XZgQf@_A=CaFFy{cp~tnBR(A3A0Kn?;wAWC}2VNw9_wt zW6rU}CnfDGnI1d$W6U#sc>07noV_@Aalz~!S8_s@PMh;aDqmp#Jn`>+`q;fEiI>I8 z%T}q(MtDjYHTfEB?FMY>GY<`qaliB|)8H|ch)hO{TUX>)C~!?~xY4J9!hs?Nf06Vy^eE)fU~` zYCCUZ0D&9WH*GE5!;-C8^*FuHtrB=b0RxO4B7JADzFdkHy~t2GF<$ui0Y=wN)aCoL zS!BOS_ggu(^m}F_cw9F!hbJ(H639=05A$c!50Q+CI(66H%|Nn=bOGd7HpO?X+zl^p zEDBA1m^5O@@QOBHMaA{COz;SEXMlEaGlc~s*n%qfJtiu5q*ZPC4xQljV4_j=TLZ29 z)RK7n57}te;N{WRmw}%M!k?JHgo7E^7)*GIL3RdkGlx7k>agpu!%Y$OWd?x@$Jt+< z0u6IaG4{WjRSo-VopMG}KD++8=~MwK@cT7n>_#wlKntncPghKxfBqgnyV(os1Z^_i zW!anfn{U_5!w4(4_J^>%yGB#lZ9zdksRWjo)PXD)EPH2Bqg>jh@7>s}10w8VLrLNZ zfI}Djd2nE``7(lN0dy%@pwQ`KFH7d(h>_{&Efj}DNq|Xu33ncl%FGD)+BnZ`Q~q)z z$JUoEYKum!+n0%e@%V#MyLtOn=1Ub_pei!4ENIeb_|qU}xnxf+Zj0o=gOajVdz%)o z$WiwYHHZz>>9`ts5*vCYZ|0k@sV_e81X-(rrD@&s{%>3uUD4msRa&_%wRJw55}8IB zo=iBS2!ecL=dcdyv0p^Jg^OutnacoF6$HxXOOaIKriDR?0lRkb8H_07usN0#X##HYuq(HflT1q!MOL&;#1ZSgxPW!zLrOENb|nES_-UtB0>9>k%LJ#yxlZ$q z{wO#yV%;iGb$cBMXoyJGK(CibUk0clvyFq9#r8+HC<*+>!Jkpu)%U)NSTAalQIwg zX4*dH{p?|h_69=0oC$_QRZd9_o^4`!2s=_2a6zJ`gj?~lw{g7dyg2{v!QRMzRBnH( z+5(v3Nj}rx%D;V1N!zc50FlLBEBFX{Ju2<_@KN69J#E6sQAIV*$wm_dnfC2P#>1T3 z9fI&QFJC4&EzYGVTz+HwDBj&Emy3Z96eUw*VGH94No0t>wAa}k-jcp!K?fICdj!_S zM2>eqV@7z2glFBdamtr@gz*ic1lnOVR`+Aweq81EO?p8=lh=)=*f;iniUu zP5KF33%+M#od|aA4|Z*6)-W8o5zBdkUBN0PXeHKsr&)fFifaQthR-LMj)!hsKv6b8 z{qkOjS=?h(o8aM6*Hb7VjqEvR*|c_=i+q!`a-rnn0teovDVpOSRw(g{yg!*}IBqQWCE>6X*qlXw6NpNK@b$yk; zAJ%h?uUM0Shbv)Qb%zpclNbRhT19j zrHD39)h1N1%e3u)500*kGcB8$!jK zfwv*P&PqrlOhxBrRzny02)@X2%I(DZkMw&ITDDQB_Ai7#@MuYx2=F_LCYACn_Qq`u z_OGA0V-2A-FWUC}SeSldaNwPtIjw*yK;6Xi>qU`0tqdPz>Xfxf_4Y#6aH zc2zpCe+?UT)e?_k!{R>3KsHDg_6K3%&C>(Y6cM?dLZ2L88eXHopva}Fm`f(VS5%~a z8X`aO?e9+*Epp}KZDCLv2(qe>uLz6oV!@vmk%yL8|AU@JcF3MFB|IY2>1Z|?5LB@1 z;o8<}3x1}agp)GC)WzbSg zmfm3arls3;IG++aw#?Y90NH|>ahzq)uUDi5fuAm;aM2GTlsKe`hn12MRWT0YkZQrA zZt9}R&%7c!pF53{PWM0?V!u*;tNBX!373`3Hy;S_%kK08OL~ET8v}U4A z-|H~;B>rSWCf;%~@wF0Ti<$%pY7Bnj#I+5l<03YSgbM*jG%V|x~`A!Ozd-_QmW zo}LfOW($;r4Dee?nb!(dhtxL<6fvj50g*W? zj#n*>bV~$@*3gyid7_P6hiBXGpzxEXs-HCt&~PHC&_>1Y%zf~yoDVK5yEuoH_5Nh1 z^e1h$h0dZVF;>ST)wN3^9zpNTs zEd)(Opl0=}_gfnhv5Y@gKDRZkqr8lv|Le$D^j% ztw-BS?uMpT>?jUjl*)&M^80UcVt?^}T8y^uw>y}Xj-xC`iV#S=i?($NxUiL~!5_>6 zP!a?Ph@cgvSjDl6N2?}SfUiiWF6g5$h|l zVqN=Ysp+@P$@AR}F_)O!LWQFYCOR=22qFTqfg#RMl@mR?FeGymC{XSv^U)c;NL13V z^D)*J)A#3uB%1tIS>7r7#JM{y{$l`Y+i_=*@VELhi+A&e!pg+SSco~Shcs}}3LrMS zNeMQa<1uu~{~m%bgOJgw+?U46SN);4&UOm8KrMBqQ$rjAF0VvfkN?q96fSN&8x{*K zIq47^CGn_%s0_nPmPrQWH{D+;3t5%~?K7aLi8P?a&^^}{p zIg7!^^ncVZ9verQAXqA%Dl6!N)sFV#7<`{Mlzp7f2Ss@8r@jyV$)o#d{JbXF~qB z!wM{WPZDF<4&M~4`X$OD`*FC7*pVTuIw2> zfx4co2-*IO^_Y>*6_R{omp6{eG{76y1GQd>9sU!}KmmXexRkxz`JF#o0z1+y3PDc?(0+T+Fh1-$0nh%>j1a#Z_S(*s}?0Bqu}1j>aM_WiJ+E z>!4B^hhIJmvYT;Z^S#sK@nBchh$SE-9sZOB2*JkXKkF1L%-wm->+iCG_95>w@-_nK*{1OR_s>uR!2Zi8^ru_mSZ? 
zH|c|MBw_;=p46_pR|-|x`B?x9!Ly)Xbk#wgG7%XgR6O}F+I2w;3<+jJ;f5hoU3!$IbiHKdIOzhG+(~McHP;0JaL*kFC~)MvubDcWObrD-#BtQx z4~}NLMdhml``9Q+Oz=Zm`z{KihO>EXqDlUNxhVhJBZPY0ghFQM(CJ{BN?n)cvpoCY;{!iyA zEM`;yfO1$gc7u|7)rHaLU>D$QT%5V-G_`cyrN3$^;;#VkutQxzk7{7EZywHgF^7Sm zO=}`Np=S!Hjtm!xI$0%6y6(yuXEI{Zp=Khl-|%SbIqIS~1o?cgLP)ukbhexS-(CRo zLbvd2YUaSaoQf#czCwp>e^A*-w^qg6W$N494puV5#tklrI)%l_qV#BZ>1^%)YOB z7AtwJX18fjVCD=o$r!*pRDyJmN}J5UkD-h7-nDxmZoAp>z=uFm zbC8FFbCV2BBH+iQp-sI{`tu1VCh@u zG(GC^DZYTe2!&(TT=x#lrRv4oswP_b0z2l*uub<);%pX`XUUp>R;l-?6FdHpmBDJR zE&|~O6>Oll#hb|)GQT(Hm>F@7F$3yCP2?SWLf^t*eN#XYhB%#d&|kA*+^5~CcH&Pg zH_S5t-yh@-AH^h!{GH0bA(Zd~Qtwe@Q}n{qMkn_t7i0sL1Df_mvh;Z#1tj(p9G+b5J zB5{rcB0)Z8T`5N;skdj&LEZ>b@2L7Xu&`jIJnPGZlYexUKm2mhdmaY^d3tY%G$Xr) zU-daNEN@HZa>VnAX7w{!W2a>ho(wdUi*D$!zjK-a7B~F%mA<^F@F%0l7kAw8Ak1GifV4dMD6- z0v5x$H14A*}V$D_{*Oo((4Xoye(w7+y-0dMGqDmmn3 zZ~+RE=6TGvWtCzL%OP}$`PC--()R9O0S42K<78v!h||yu&=n^i7ti2F3$tMXC{4O` zlGLOw*E+h@B^y3hwZXZ0xfk@lr@T@+58S2(e|An^1@Xnwu93mY>=E?a^va#aXC_c~ zQjDK9{S|d|>Lw!!SA6tn?oYlcIkIEJbe`rmVAAluPBNW$A&dFPbW`=P+|6A)RQp%L zDKMd1h=R?e&WC1$Ogmmicv-Jg0-2E2R!*jVy9tvSB}5373V?3P1b3^)8J-xpD1=&8 zdf3Yik;CCHozsknkG{UT`CU5>%!u;|LksYRr&P8}XBl>IiL`)_cn7#|_iIw!OOD2l zNZTLy%naO27vmFXp6&dh-`}w?#uE?mYS=dxjEDI(&|Mno&k>pA=QEMDVF2Bl55f#I z)#kcP83n$LD(+ayb|pwEI!UBsinoS5MS$i-LsjTM?PER$NPRB+pB{F&I}i*;ydDV+ zN1km2?aT~_&3v}oG41;y?t5Z`Z{GfF`QZ^bMvzTMymH3-0iL+j9Q2C8E^y5NY$kimR|3`rDJQA<)+UrZdNI7g+H)K0G3gme zyln2Yzs+9aSizOX<%a)Y5@flGc=FdDLrjJdvc{E9DpnFpGOm+uB7)~$$b$4^`!Q>d^Xx~3(Q3}pc3 zl47MrM+x+GRlVH4Z@8UF`*d!yt*FNi&*tw>LggD#ca?H2lN1U%86TV{40QD2guMU5 z_58OR&)N$lLIsV&9n<2MBcBA#Ls5fNd7Sl3eNR3iKW#T?3A}muqQqb(dr0=pG64wsycgbC2ED!&3~gCSElwK#1Hva* z5@fLm4+~`HVp6lDD}JtX>Kl)VHZ{u+dEyayvXZYvtQ?=zUt?qr3s7v)>9+=jwGoP0}3K`P zG}4jFq{-JwrOAAoK$_Snwd9T)@brT&qK)?gpiW^cCkyCVnCe#8W1^gkY$U`*@5%2B;MFNSCy0OKt1-o0<5Hp&ecgTf?o6*be4g{{W>jb zH?*OeZoF9R+_}%w0OVKyFXHuI2ny~OqiMt)hLHO+Mc&8!AusM+TB~UMhGEhH{AVZ= z?c6;0fG5do zWK>ryiPTl@-Nwz51u5$dIBx6Z8$F9DsbrZffvx1l?^Fc^DHQ*8ag{j7|j+ z>Z%b5|M^+K7AWN2iRd%lKJU-)0y+eWqEVz@X51w+WKTA%@zo~VhwKxF6~wQ~+F~1! ztl7df1YyZ*GNjYv_gbfb)DBiw7Gyz%q4wYQ&lkbtTwi7ih*%C?duUGlzVTkkyWi33 zxyCH-PF8tNhNrjZ6>HzKlp{&$`WBwbLlsyb>$}ofP*!cO@c9GjqZp|#W6(AYGYaO! zweK^=jll7AVfaH&JQZ&ibZ)6Z+e^b8JxmkcnT+$b346De@AZ8&dkJk_96`Xn;;DMd zAcKMM`Rz`}=Z0ZQB@{k>(hmkyT-RtlLx!0a>2hUhJ3bOtly4^D?>zn+Rz|N_1ojt{ zwaKpEinsIB*i1aiRK%A3QQ_fL_FGSE>Oym03vGVjvB%FU4sbJWa+K&?-i11%%@3S^ zyAA4^J>D^jNA!}9EPS3^pgBK^ZDov2`eg`!=YteMdMvio1pHM{$#Rb9FmRl8zTPnS zX4RU#)_F!7pcxZySf&MJ>`MQHMe>mSn;qKU4>H0DE9f+w4%S_KX}Cv1>IbVlWJIs! 
z-j*smM9R!u0LU0t!xLKlA$!Vr^?irMbu@nIBm1!DjL)`rM{rRo0+bSE(k4^`eQuHf zX9_dXZn;~~1mNone1acp?=eu8WC1MlLeQ*9cSNrwQ1udR`b$Thz4U2+0x1@aAAeIPVZzsaWlrw1sZAcaZa(?N2~-r2)(moueo zhjweBK6dbTFK{aoa4$9$1u23}_Z@uP${@9FF`k>a%we+Oq8E=U*%ci>Oo0zoawGGf zU9PmnSScBV@ALk2h?SE518Sv*ySG{Qi3AX;w8x3|OcVb-Kf$z*PqD!AxP69*!X#4R zJ{EI$O~o#2d-zE1*j7v$_K8$~Vz$iKEqP$JT(7`v2}oZepIo9kx-3bu^vLPO)&o=c zo=L};&&v1g8j1x5ikIwry`4p*-nXMgEX>QEw79gtePqj+U=ptd0PLMk(t8uTy=dbJycoZOG{Zf|`O6;J8kg6Wi3169s&>xy z2i$&ZcTH{JeI8)A2)1Jk*Hj!aTr$LP;q|yAWQIr*4%FH`$HPhQ65VOsr$=Sh^UL1d zExO|c0|MWAAm-4-G@eBT|0Kx6Xq+d_VCY0(l8!eN$+JeoAp122gx^Fq=ETcm*1x8C zBp-Kny`{&-wcDgf*YY~^R&8tVT~+&?v3`-U^mLe@oX88Vr5NppY}EXLY$JmLDeAGm z%JlGXfn;}50uUmZeWZdjtwe|u~WCy`Mesi!YlWAcZ5%)QodFH zbPa9f&dc5o0Kz`k3EGfV5R(c4=0j1>o%VP;vWOn8lx|x;Q@nWk^cX>CO_><9u%s#T9C?0ACWz~{>W)hUj~m7Gm3&-&*A(l`+KYlO?a z<^nTPEo;1FG4#Q*uL%No6oUN;E{2j@Y$Hwa1JcIyBhF z-~&uzWgt1+HXB^Q91eW~$xp97@H0zJ0?ei=?Z}w=bJ(_6coJ&ON9vWfkPc(xo1|#6 z>mR!Dhg8}v1(`r7ryfAs7ojeScVAqy_s>oHceZ;~q2jhcYrgLKRP!{EGCzEbuVT2h zKubAyX-a>1cQUYF0!!u2*3e+D(AqnN-`~^hp@bE+LZ1mX29ZcK3u1OM8U~Y?^aTd< zD+)UTq<$Gd{!Q)GFL^3**RTkx4@_R2%^+1?HpT4`z*B1AM z8MK%=V0oO=#Ce+eMrAP{*thVwBF1;CN`ucwt~qT?=k{;mK*hX*j)I}Lib!_z5hEF$ zG{^r;f%BUEVf;dYl11WokQpI%HC{Ps(JXel1+3k^Rl?vfFqFJXo;VB_l~aS0HS^Bm zmfg}+JA8~Xg6v)xoOQepS;cazL@*?{j#po;u#v`rJKDPsnsxgpCX!lLEcD3!{hHh3 z!=?9;G7|4}cLkboRk}gKc`U_U&Xr1ny64y#sXYm$ht~!}Up^+3512~{PiQjp!}R_v z;4xVC*8c%MgD>wRSm@)kT{p?T#BT1wXxiMzMMI62RlD1RKlHXe!osf)k`(JF7{0c_ zrt$XC=$C!OnwY?vXcBPof!rs#h;{z=V?3dsX%Iw%j~PRfNLa;=Qt8thJs>ktTH-xy znbts2N|Afd4MbtuWqnC8;g<5n6>fsF0KTSUrvZJT1_0=5?x1Atk^7HPSVro)f){oc zl6N8jK^4w5TmEi`RzoK+gR*4WlUBc=3%DQBK)B_4*|U6z-3|2bH4%L4t-}7dG*59* z4SBi~s&MoV@7s{)vJsao470;6*K5}0f8K*`LH-T%v60kkfBQ5ta{jiWI%9u@OP1~e zbWPQhX3!wX>fgN#)!KH}3u|Gp1u#^J;+`dWbur^JrmN||b`%nHfy(8s$RbkEbE;=c zltGcKTIuhV3y{m!;Y^>qX&Ug6jS(L4xGW}$;xX&0n*}ltcIVOSLG475NkhyBfJnpe z?KQ#=4svM5aO9z( z_X=*Bcolo4PruiW=|#EZQw6e2LtjZyW6$>Wbn{656I2R@nY)o9xF(;90#cT-zn5={hEWI^cF>L2YqC_kK~usY}^!-0j-}ygDOxu zXi!B?)GD)5!>MMTR$5nK22B|xMpZBI;%2!Z(Ql5(2OpxVUz1*ug#Q8C!OPi-vn!nU zr#m`{?T})JMtLK?A#@gu#&nkk7P3#RW?YD1m(GABfVUBp|T-Ku2VZ4%#Ab;0iq*UnX95aWBadTI0&KTdwwlogmy z075+>8Y9$mAoZmrEE;G~st;m|J2jhTDZG6*z*vd`SkMYB)r-~;2-N#n&_jfU3yqhi zemAC4%wPL*Y2+v}(t@yDd)qk?;;;-hIt*zKDzIj*{A+gn)BJTxuR+7~?*ArAN!$Gc zfzxGjWSYj6E^Ogdt+(orTzFQ30aV*Ce@sX1t6kv+}mCe2v5H$-4V%OZa5t$&{)YWkfA=TC44 zL8&~?ieCbR$QQRlxvj@GW%~(@Ksm9Ssc`tT9%A?fi%|~w9uW50@e$G1Hah%#IpriZ zkumAGJ4Z;$=!`AB{LTr~Bd?q#uZ1&N-> z-3@ux%-K+R75oA1JAwK>rvTH7`tQ_mAacM%v_A`)bzp(6;=ySbO}ou=Ri`qPc{y#` zO%&R$>Jf&Ows^)UD(Lkz$KX6u8B_=JNqYh*nZV>EgMg zrV^;#X6s+tUS~R^QrB^%o7KsGjTb%>SBGKgnTMRTr6G+Kl0(6<&#JgW5dF#wGDFiU zkNT@zW_4}jWg7d;pKmMqP*FOw=XlMwVXH`Jmc1bAEJLvAddizhGNOfUm49`CYsf}) z_qjb^559VE>IHm>l3J{D`p*+(=nY!vVG%oZ<9&{DQok@z&0Q_0MG6Pf%XNh6?)QFt z>l@aKbPM4^-EA4W%xA7U39UpiTFvMi&_3?}>c#a>QUzVO#2~G4@FKgqy|kkRw0Dhl zo-|{h&Lb1qRt$}FRYp-^Fv;S?^6cPxg8#4Ob%|Jn6$TgD@95IKMq>0QLvVfoIWnVI z>|&E;apwuRo&dV;qr`q}5PNG3nfyZB!fVRBC;uRx9(6A)S3>XuX9zXl@*c7en4{%- zaXNJV{P@uy^*APR62w0|rpzJu z{QnsrbWsFkknp%BAd(oqa&sZnF+i&ldNQtnB~dU?#|-xrHwgv;55fd%uP3{WF{R2#^I19wuV* zaIEM$A4u`lehW3b2DYvyx&IL5qP@9tO9VC@bCOc$&6CmB@0YTz?X3|7{w~1QFvQ;L zpy=Otn1Edj*{;SRh!?q-OoJc7xg!B(z|02y`E86^?J)5_j;!N(o z+YyH^PVi{Ecy?GW56NpJlj}s+X9;M;f^TVuImp<+?Z{)BAYIAMBAp^&k;x4gt^MCX z@HI`WRPZq$i*LOpd&q#sVo~(b0x}B+eWH=*A*AF@Ubai!|ME@#CVN%%!YSn>7eq9< zRX20~d|;9(8jjJN!{zqwhX5dLG6~8?GctKNhODr+#a9EhFIgF|@?pJYRyYvT0FC4J zS8Y7$m% z7lQn8UD=hkep<y|S5xYM5#gvtY(by0z ztewIV0T7-X(te>VTe9dWj(&8u0Vl5ObZ#a$ob2|cdR>FGCS==wekUo5fc*F!2proT z@4N)!#PWnT1MlFHdH^VR9X;ZAxBQ5lle|A2dvuC}-1?}oVNX7<0DbT`9u*@6XeBD9 
z46NM~w9jG>kS6x_eFNc$IiM13rTuS#R3Gs)<~`C3rqXIP4OIUQ>53S0aB;) zM}M|7VQ>82EBdrlPt+Wx@)4lV8ikGi(}A6Z3WnpYw;bfE={yURq?0Aush4eTpwlZo zqJbCn*O9gcNFc@GTDW=Yyd1*r`C-5y#eI06p`3IyaWqZ}XTiRPID3C`>?w#wvNOl{ z5#jYY@#OSKwYIPnHB}r^XcGF{_^{$a7So4mr{mY8pL9M9Acg?L{OY-=GY_2n$7Nv# zcYzzg*RSIu0LOu7=pi)%i=b!eUOIvXi>N@5NF{A2SdXK~7WF@KaE zwG%~1B?0Xdc*%NKsRjzCwjd@*%p%u~ARhRY(+?$wIJQPX%^NAc)qi^d*v2R2W7ZLk z({5KqCJgFLu7iC0%>#8c96qO#Xm1@HeE7KYUTIdOd{)ezJLgfMwq!E4@xd+=Ef;qq zasr3TH4vLd964j>c zjm4&nY9o*3*|CP136hQVRleOPAEEeSmQUMJqxn0K2n{*0t-3&bqoc=W1+pGUyMk#D z)dTs(=lGDDF<+JU|FLr5ZQE@$n2(f^6iOF28lUQr(0Cnb`Rm()rt&69 zQk(Tr0LP7!;8hfIo>o+MsHuH0T3}Mip;DCkA@dtf`)6CXb4UMP8Pw^NmM&E()CII3 z9G7%}C{E%3gfi1<81594&2>29Ey;V)aaAhOj%wJ^;I^WV?zR61Y)^a`2;Q zerz-x2WyOsKV7Z`!3<%#OWoo8&HyT~>dn=k0ysqfo&p(KJg-!bXrn+XB zxF^DPfw}f~!K-;cM_WXU8^>Vn9S=w3EvDuF)6#?7*2U3ZU}i>3a+q_sAdU0}ykn5G zbFenTIPT6wW+;LSNr$!W;qk(#$4AxQ@$5T+&!WD`wnq^RhWPp>`Jkjb@$j)Pex>_L z!qRQ-j%9fJ)vU@tF}=a6W)oTwv9Xj5a}Mg)OM3{d-fSsae`mOD^giC1V>T+`>pR0A zllOb$422v+A}Xe77H)04;*rnoc}X4?Sv8>Rw!E3MxvQe-u^*~^m@{ru*iksJmp3k1 z-Jh3``W}UA-*>O+XIOsV){&MJ*3VJl#~r414b&-*_=>vA2-ww^EuH)?1Qh)TlpoEa8H&lY*+cYJ!%7%GWb|56( zO;%M-sy_UpEP7#TuC5{d2DA7bosvdQ=c(2QEojmEO6#{JL*&DLIt!b2Jl-f0&>#7= z{$87}sHIG*wO23}YrGXLVz^0j8DFU9Lhrf7mRr%-j!n-z$>!JoA5&Ky7u6H(FCBuQ zA}t{yp|pTf3kpat5-JFYQql(9f*>i9(u+uoNH-`bjetvcigb6qxx4({d;5={-;cR- z&z$_8GiT;zhntZ0j5>X_Ss*x_8GPXJxw-d1<+pZMBY6Pj&M(gk;YGnz#V^3eY(VTOHTRG6FRh{&MkVU-Iik~FJq@9Yx4a;mJY> zh%s}qSk3n!Z0=2`i96}>2B#5J#Y`v4BRtkVQPw+}wyxxaZay`*O)z_Jw$J77{{q(ejxe8cq0kl-#v| z!{_+P|JMHg@dI!2!<^^hC4_CnYR}4ceL5Y7*fZS>uW6r4DxiF$n^dU8x$VH8$TaMSl3(x`je40St|KxNd#9A|F^C5F$3k>CbjrV7l-EEsSGS!^_%>%_`D@aJ z02wz;E4_&nijR=|rrK%Ec^&_wtt?Z~u){ZHp^JK{^n*(~^{m>lKwaEHL<8;pKd-vv zp2!^cMpopB-nzd!-){)6`izvk>vUwm*?`;86@Efl&&SbjhE^UQo|#*ZB}#Z{Exa1l zL&rtNFNh32*}{=@csW9Il&iTKN86ZKCJUAz_m&i(h@4#0bFnWz!8!q1_G9G^L zO(;ubIKGJ_cTx3mooac|@cl&0{b{q>%cIe$ug^*K>TsIO5Vj)&?f(jIhW%iZX<;fd z@7&}WV(*kZt-5bW^GnF9)#OTc#~3ZOQmXSlhi}209pkN&b2*kysnmY>aT`YS$8B3@cRTc-zHc{tzlC5knb>;^7YeT`31avl@kN< zT&rW)-S?WS1=|c#4LF_ty|vu(sBm}pDqf!bAGgL_VUhRaK6If*j!IFB{t9~|=6%<) z$Uu9l_x*v4(9lOEPt6{>#w&4B8G$~^mOGJ($*-&l_w}k=V}i`48^uHdy!7cjmVyX0 zza~6&Jt*mKxTJC0@s`Hn$q(t|K5@6-hdPy_85I$~ki^Dit-W?jtyWL?5gt1# z%;Z`r@4{K$$_KR-<^K^n3^`)Y@%{@-);+~%xn ztvYxWFjObfLeT@vSCHjI8kKghdH8Lxh7Q_#WX=EY{H`3rF@Z0cVet<)iuveR=DA9k zA?==BdIfdR4U(VtFY$|n>%4!KP(Ib;`)z{gLkCUZ37gf~__wtjY z(e5XNJ@QbH0xa&Gqfp}>H`VMkde}i^7QN^s-xGu1F!}F2A%_;q?Dd|k+IlY#w-3=3 zS-IMX=~W=$Y|CRuw}-fDWoZ5FLH6;*+)kl=-y_N5{tN9c>q2;W0(LoRNGp>RS#T~+ zh~Dc*JXb&YHf^1IZ*#%>9-D^{sW@!cF6=U&h_;FZVf?TOLa|;3LTXFJ<=lqDynKd? 
z>5Kwdv44Q_|H*r%VmwUsYP#TiX)}Het3;9K)yn;5Mr6UCyLRFxfbRfSQEr{p2K$bf z&_n1ueCQ&oP4w8hz2pHOs9JC(T9Ec!(x{~;+^I{A1Hqg=&Fz?xpaT?R^&0N>git%F zo!iX(2+4UeowNk!45G}FPYrq&uQ3&6I4w{YhY@Ylg(N!|(h9#)c)jSa$I&Fo*(4Pa zxT-U81*lc)uQT%w6C+p{rjSg3KhwH?FK(HXl}x_U@tv9wpK1PA)1vvQ3PHYvHwU(> zn|f#z4dIDC{Dw?bDWx(?G(*ExZb^y-VGvLw0Lw~ffq!*qHT&+TFahe^&@4zxFlOEa zPLjhOMDeA=AU!%oNC>oow%th&=`(uyx4BrR`HoPaNZdVTM~#z4d@j9b2dfTQBE{+( zFWYUmzm5+?_XF>dHMm#O&#fz(fxugZ4?sz``1Tol*tBOvGi_Z8NK^A!7~DL%Cr-K% z;V1VX?_+lIHUqWyo0#F}{E2GjbL{Ao=IFrBgB*u$O^?t#!P4S7c~;MGsa!4$bUx`( zx)uPUX0Yw3{nfP0M9G)!Y;omtgw^oQ_&%9Ztr!d|s&O=^!a&klP`B7G`Le`)j2jys zn3#A>w+eWv?^f6bamU?r{W2YYtG34LP=n`g8#~TJRC;xFd^ZRaiWRov$uELqvNc5A z(9hOz#^AG7iH>`$qaSAoCwggqhwWd?3UA0naGVxpu3lcg_8>N2LSMG%LcRCSnh-)E zxGLlAz80rqg=5e)l7-d%?tBMM-@4O)(`%JCFJ-1@IxIY1R`tHB;mP}-k#H)M!LMAj zMW$H0zsFSs{lrLhu#-MMgt9q6m&vv~LH~Lc z9XfKfX3o!JMtEp=WboRGQyuo|uez9+M$AP{-rUsu$`ED-{H`&8ua=Ra*^A%d-sdPd z7=b?OkBsq{xrdAYgnkSedFQjUf#gmD=048>z1-Yu^PPMfOL)0hU88QtReO=c5IyD< zvC^yXExvmB9P1Xa*2WukS4%k>1cBoXoH8lmy@Cs^@xdB(b7s)O~s^{{46v)0ZDXfX(T% zWAR9%vEZkO7OGsKppgd>pHDR+-Rha{t~*wf4CZjnJn;}%}q6Y;IFRLJ;?B%G#B zncJ(f_9sAjPh)QVIC;&;MPBaGR?(_RHRil{ispJ*vT}sJERt|VkdI!u8CXTc+TtHH z{*#s_fEiS8<}}pOc+tS*Px0{sk0dA|a%ZDrD6epvp=u)@;L25@uU2nAnM^xL+dqa` z1a``NBZv?H(JAqvpOa+Tb}g#fC7g!=Dy5^eYy#XTOJ0UW0pM_+ctQT|wU%++v03{SqhPFhh%6 z=!-pBCJ6e`(H#O3A6anzO|-Q{OH#@sKg7>X{68y5{If5^<+!}G`aC9iOZ?h`$jX~- zWrgBKexsWgXVPr(R7Y*4{^&Z7>MI*oZ7GlQ^6*v%nztH6KG~h4&=@e6#%7FV{WD zXV?Vh`Lo*HyYfrjj4DA}V}}(N(!Tw#5Q>v9P5l+1wtkH{&*H_}HW0L=pE1`}e0-ZoWHgO_0 z=BfufHZECFnnDDsBAF`L@l@`(ZP`J)XPo?EWk7W0Cp!t1Y^yTJ* zQpQY*sU=G36AkC z?;1f`EW(>G3b11dELNs8IR;o!%wqEdkGa|GKFO}T6iKHop4e+fEoOXV8alK}x>Y|` zcatu-FMem0GuL@Jaa1XHPc04$fMWB5Y)7@xApZVy7xiqzy9vNgR&vsqr`<^9IrCqh zB+#l*+Ow&9&5@PlRgcIu?a?A>gXXVYalA*^Bf}yB6|0vE>V%*8_uYm8T(xKer+Swh zXN;LhsR|S{#UT93e|ohLW(6ETZrY%C8y~yPzWIQH5OLR2oPs>frWMG3rpjDN(jYX1 zSTwwjYrM2O8;(Fef3Ao+=Z6Z#P2uA}-Ob_u6g!b|v@$zpy^XJX-P)ZW*8%vEEKbmP zWZ!VG-))5VeUi_}snpfvSq|CHhPwWcY6y{BJA7E}n&G-5$-Gmyuglsece6cZYOm@x zV6vrzwqybbM8C?dc}xJ%ERkRs4@MXOM3B=)l||@6j31@6;Za&{6`$P4PeQfZSZa|a zpcpJN0lorB#>$L$I8Lk~`Fugs{^*L0z+u!bkK6JQUV3jt=i`)|e!xKnfM72Rfjp=9 zmY!i9jg$mA4NJaC^->r=F{*i#{)AciRb4-_6eHoMQVzrTa0k)1KUYT-L5 ze4=QYpA_)5*!TH0I`+v3b`xCrnZ2s^kym&N;3Nvt;EpF!3>?Ry_hl5=#bYn zX?P?PSV1|<0Ghx?Jrd>byq%#n()=A?dt>7)!ty@SXd3T6 zU!>uZU)iq?1vxGhJm@g91}Xcmr$Ue5Tr3VaUI7!~LD*Iw<^iwH-mEklGUdM)Hx<); zIX3AE$fz@jnA|fWl3pBLS?;=N7muCkRh{UW+ZDGBd3ujC;?@aYBy6FN01OBp$RG^` zw^Y9C1t~;tl{Gx!zEfiNlJIRs+~V+n0MAD?<_x~RL~}pnX3QT z)*t-maiuvUz#V#|CP3(jdv*&;_X8V285Me3bP^%FeVyS4$-D7zDSO9Q!mAFPOEFLHvnVJLG z8I;Y_R4)aJOE6bn+xD-{yDjH9@LiMt+x+)G{XGvBY&{?)hSG%j!ROLolOF`OF}PR= zb35ZRGzMvZkd%$KPZkV57>LZ95*fxBR_90n3gGq-?Fw zbl1+?{27%P3_$5Pyfh-}c*6tg4mxLlGfEbh(Kj7wN(P(N7(K8QdosmKFymxF*mA)H zj$5OZf%+S93{s1d24BX#A)#<0aFj3%Y~T>yz=ZZby6v0LEDZN66G9VyBMocDzI586 zLAy$G6C4?p^;=9Td)YdVOCjFWpx^rwtBY>SlJ4p6ROMSNMn0K>#NTU5;?spw5Z~n z43UG-IPuW&HsR+N;LOOk-Bh3gs!s>im2j(Wt9TRlH-LKLh zLb^Za*@x@@O-4-Dzbj|u1oMgOtXi3OYIKv3U{$&NRSZa?ri9G?hJ0Y7$sRUr8pm;g zNxutYCy=GDiAo}gL185K!XP;9)C(NR|H^QtyqsNJn{eE7f;&ZEh|>k_-y1j%CdJbQ$Ydwv#W;HC4N#*g=Mzp=&mMLq98q z>2TaSB47!wRf76#7TJ$f8bf+A0Jf=~aA@~_0=doM!B%7OLmrifEWU!wk1r``+`3ii zaOu6E3SV#De{VqCs3sT(w_|xb>I*yNl+_i@3kQMjy=Jw6B|iP>0iuodNb~6wX#2Zw zC)Jb7+Jm>4k35d^-;*GJA|Rf$^|n3*;88k(16o%1^1_<|aM4?DX^q+)BxSfm^I)?h z$4gZ1J5udr*}T~DSNjLHSd`}BxH~V1pw(I^LLv%3xCD*+#gAvvaMjrQ%8o+O{hY7k zb2k$Tc%|v53j@7@j%-d{2nT^1vJ{+QMPg=7e^&nxRBq?Y$h6V5uW=datSgfD~ia2 z^&=yCtQKF-Zu=3M(JMAoa|CH)2sLw&jB-xuTl7rxQ}_e!h7^X4FMHHNxu{mYhbFkz zNdQ-tn$$y-iKQuunu=&z^7fxRK1(l$)jO{4s5 
zBiC-sO!^R|1bz)`>Gl99iVLV*A;pd4uczCBI6)uD_G{uhK5mNoX|4*UrW}c%@ClC_ zBeX#9KvJ+NF+@p{RUXt2Y8v5D#N0qlKH;#WEIYceV^i!?YX@bccknGh{hH==B|g@j#9+}E(3^a0X~ujEqoYp8B2c} zx{ui-z`5}ZSJ6z0=?%n*GK-aTP66rfm$pnqgp%@mX=)AP)~#zn-|f6B!7fEeMCY>i z?_Rr+x3Nv7CZv|_gUt3v2BP&E2sBK0z70^ zmy18Sd*O8l_z)1fHcR)M9q7sYf10Z$d5Lci&!D9-SOI~imC;PI8!NpXZQ$wKLv5dXc88NI{W#19+Mdx*OV&vjYb$ykpv7UUsRaRh4BY?ykjfKwKeS@*_8TJqyj#JdtK_|UbrkRF3p--tTr-@GDPl>e z@LmRDS@#yWiwIDEFN_8!j5qFt{pjw6$KU>Gdt|+z*ZBEc`trNL)$6Eie3bXct$E#4 zXK^wF9o;W-#?Tsm`^Tpz3^ET`NGI6^&~OcrKy+8k5sD;1RFsjdqdZYF zgxzB~G&tUW=DY7ta2+gr_&lpvJ;|&GLAWQNG@)Bi5Tv&m7q%~WUv<3W*@XK+;F!eX zHc%M)0<~?3iw=nN+4?cW*lV=p0unaA*-iECcL!vjScMBR@KP-8g;n~wvqtNfb1^6AvQ_$eVk(Ewmq&!_PVU5KalYlywy zorcJBq|bf@sCwm~y^m!$o~4jdMZ4J|+8A%+cFvqRz z1Z^Zm2U4sU@U3^S89KT+XQAMs`Uw@Xq~&vUV-&x*Jr#=T;h@C3gbD(;07Fmz;t(Tq zXZ|jKvy^oJb?_&92k%q1Nl*g72L(>Aa;1S#hVlk7jk!hTeUS$WvK_$Bsuu>tK3p#C zgceLJ8MOyej%WExQsn02uR+n^ZV6M7nOP>(6A(f4TJgD9sNclDfcZ^ZAxqZN#_$EH z375aX7EQM&sp&C?x#c~N1Idu|pD=~62jIR0puaOO@A<~-c$5joK&WH~Xt75=LHnDh z(@Hlok$vq+^Z-CSSN2#s84`7eu%cCn@Ad^Foxy$?l?K;rY3A@W(StnbK ze8{IxUYjqL)9u0jLS=tv_s4^Z&U-pbTW@%}6mXjm`ixB8^g=?|6?Y=tl}inv@N+Z6b4B?6%|rZcAJMg4YKsK$ zwCEi~_W;)~SXYCZ{ghbj0EdKN1*+ZNvn!64)01AM3e6t>#`Z0ni~khLo7s~<<%N+U z2*j4Xap~-zeWC(50A+FX7iWWWkAmY>r_u3!749#fHuK?cPiMo!oK|yY3y$$y2z`(z z%Ku%CdbuV*?EUG5FrfwEt`>^D)_t`ToZzncnnd;Ih&!?Yq>MK+Jw5S|IxRJWqm>lJ z@9N4B`d4m1(boPumgkU=sPZgpJ0U`I$pHw9y=IZJ(2504)3UYK4f5da>XNAK>T<73zcJU~ndGv)-KJ|SIu$P=E zyDfa|pEDdscnOMc0h!Tq*k$j90y^BGc7<$r40DS|zey2xnbbaGdFr(@S4aB3PBQB) zmF{l+d@2m}dOJl(*H$45>AC=)QVCtivN@KjO99U_JobqWDe>a!sKA&KpzDto|x4}RqSM3^MQ(c3viJ?1{_hL?x#_)%tKIdBzQ4ttrd%0 zOScm&)J3i@K&W>>rb@33I?z#MI{mBQ@j$SXOI5!PF5w>2!yo$IEiwzPMuL_LC5}V! zx~vbETv++SYS{>5ldTT46jp@mhpQ7wMH;*Ti!~Yfq083(-l7AOd0+l)Do8@h6RQ+m z$}5=&c191y>tD;4tZP~u*PYf%$3(KA@N0EmKK0_75V+0>740vT_{EP&rWnVfpk_YQ z&|6nr^)MTt&J}v*K8$f5=m#8UrC{mmLv%9$&2DQ z;L$12SE%=E>l(^n$tCrBvhzmo+BUOHt~`|?sBumX@zPjmi*nxo-fg2p!$O3Yyoe*O zr2_GMPDnK##f!JdjdGrKW@GZ!6>y-mxaMZag#uk5#)l~<+E(xC!7fc-mNoEUO?_-x z6f;i}=3EioJ@nr65fk|FS)KP5loxIqQ+jg_&!!S}ne9{~5wKGN9HxKp*s;{iblQ-O zF$E>vMcldVEthi>y8Ndj{FRT7nZV1kN>y5X)Fc08LGL%;=HBAY4{o|+eA2~F-^^Be z1P(*gJ78|xyT3U=umM+3nw<=3+q>g;;qPY1lxAhH+A-jCu{YY|$s2c`Bly7RQD)Q6 zU+in~*tI(@K00)I6(tIuq^^?c#|z3CgS74Fb)n`4Zz7z}ieGNOV)B+;cJLEJ3ILiR zudTPt{vZQQzkC-idvw)!h5^#=F=d6Kq>=f?cv{|9bH$*`jBkl%EQ(slK3Ch|BMe90 z;RIP0N%0qVVnhYX7gt#56Ld*sZhl{PLQ2SbpqTXGcVAZlc7R8~LKIdzZGQ+qs$P0) z1(z!eoV2%ie=XVlru%VIXO46?Tr`8XF8ocW_aw3a*O0-Z8P{>;V>vLd7Rl-i^$M#W zR)9oWyPXi-%b>d&?}m`LGt03DjpLny?@E`v;HjKtWskbma%Mctv-~;7(dotH~oC3E1<<8ul zoxH+KQq=jcr`Y=TF{gKX#55cK1L+Aa>8W>*SO_0oLtOb|bDa~JnOCpD*o34H$Mbe| zaO=XkTc1p$H`94?^q^kjuKE1)q?~#SsZdtK&elg;?}r1Ma*ddxy z-#;8I33^8W)n+_i_i`4IJlQ}+TucfhdDZ~3=&tf5aI=8B&2~ElkIk#w7tXWrxDcY= zh|&145zBOErEUq&_)tOt4C-EP1J58+ftiOm&5CfjsEp|})be6_E`MU)i$M;>t*ypN zXRdD@D{XQdHiC~Xywe#2uD(N_#ewUHomD;Uyc8upQVD`7p;xvO!lRQ~pUE{I{%XX< ztf`zK<1tmq*8wlef{8yGHcr!nnu`2Z%{EqAXS9iNu#eDBA2jPSm59kuFu| z?;OKCUG_Rae>?3c(|*roBR;Aubj?8Z0NH_Dq&BFvPe zwSixi}cKS9ZzZ0!(q?z?qa3Px7v3n&%s7my>>c4!gN)}`Zbhp>zVEsHuK7I0IVSxR@d&tRN3 zPTpkg^{9U8$!ncgU#{UHN0NYPF1wWD-z7FSU0JUpF{Djaa!A$@UkjTw%ocrsKo0hCVbY``K1|Qxc!H+gqJ@d z(ntGgHaBH$&hBWR=eFtVgK`-2kk9H>sUP0H#W z=S=pa*T-`^#4;CZz`nV^4zHP|iK9nYi+`!`9yAqsbeuSE$A#3*JCAs~b*{HSp0ccV zYrLTQBbB^e<@51eHb}Fm334t=8v7bhHtMpgfx#s+Z-!w`XoBG{S~cLfz*O79N6EfVvf<`*Z^ zmaD=;vL2`YvC+bR>u9l{l`ncB&|}_wr|6^9)L&xba+QgdEtB*8qjBmp(KE6a)3*cz!pFk}6#cIDP9=bP6VB4MqVBP_ZUJ zGu55oFHkyMC8t96xl^{X#-3PPryCa;z*reKXMD4`FZIJmFmIBjM$1-)mwu`9CNXN( zAI*@wLksN>#MUAR)1&=TzMhUGuWec7TM4BbA80lBqa)e9%dJ&1w^sj%Sp+qtEN}A& 
zB2DGVf%E4jCb%JJ4{lnw)6{g7KzCNMP$4$2Fu4z9j6oGcaVyVLqgJX9GwU^?f&hX4 z1iNuRMUzPnBql>_#02}aJJu8vNpNry$cXr&Dt51*=E)5tA>Mq$y-3=u!j-M%{a|-e z=;gEbI9<0TMz5t>+wRvlq=H5fk96%?mtjkstj45de=hI6E3TTY($ouLiG<0H#DsX& zTGlvOcg!+mvu5h(em%T+*YtsFhn|SFR2DY?M zlRrf>z4RsT;))m4ZZ)>4yODX9x){+*8)OmuRNV_KJXqP0AqH(M^LEdqBgXpiTNk?~ z@FEW;Z5rms>t^=Hq#ba?YlZMp=dY0wtQL!n(r(jIkjGW8eyii<1rBph42b5}X6=^5 zU~)7U91!>R3}m_`l{4KYx9oS5tZtSH;>}0VqUg;scAu5qCh#X6_86!U<7dR}$pTDh zxN&>Qr=(b1r#rTG+wGrB!qw*f)`pYKGJ>p&fK>78%5|AWhlP~zvDYxyg{s=EWy;du8Wk z&Yb&C+r4+O!FDn}foE6vte>;Q5JE+NFiaFJ+@3kBkNf zf{OjYwTuvixJ8~GDNMK(`>A|$C52Q*(-O2O`zP2v{?ro6VE>vDf{+xTChc1aenF5^x7N-hI%;1;95%;A)$WH*ZDstzdXXUg+-F?<2Ft_;Cp9nn$ob7T2TY2Eu8|jCtyhGgJs*wynrW4=!xPtw)qxzp+cWd4T zdGPmTaB>s;gH(fgG6Qrn=aH+P`9DJ$#qmw!1F+sMa3MYz;JvUw&T8H5MbB|`_wwk* zdip_fN|I>>U_VOgjIV z9)t2Sn8vJ4-Qny^3oql8@JtJlyTys4r=0bJES*$gl?dWcF2iT*Mc0=z=q7c69l>+Uhe%!vC9F^m10FecnJkRlqK(Vo5b>ANvp~p z$(UnPW=yU^EykAO!PXD>ySE<4B~)pS?ky6TAH%i>Ymbay^ov|>=`R&BAOl);9US5S zqfmxQ#0yM1yx0uyD3~GuB$zte_YogKm09v2FZc3QBR4en8!%MO7#B7hXdAv;FWqvM za+gCoJ$7X-uk8KK&oLz269c6ailY`qInW&{-_k$N>USb!0JIq? z$%$cZcrTJTRl-l;%bOSAQV=JVu-Z$Wt)qOCKpjw*u|l%AyYa*=ol;v8hHYq{>h(twE&SfvrMGyy#}qK}>ud;B{5IG7=v zw+?KxKKB!XqF+~*FA>GUIn2lf64u9GKmCF;<1Ys@PH?^gWWtfd=H5h^#z%IX8AAD)A zm9qX@E49kYo_;lE>D&cV?+;!?z;d|np6p~arGh0DM`3C1V8Pcxrku7H!Y(%_)`t|+ zaZ`sEUk_Ut>c$yMV?5?xI8(LvuRNv!;i{LK=i2_r_9SB7h8sAF<2O6|3Jqz25Zup%F#H=FKGETe0chVpLE2rn zw+-=@T~TJtefr`JyRc19sOShvZ6{Zgiv$H= zOVu77(D~=QviV6=%OnV<`2`(CC0}TO+<$^w>%nMm-Qie&6jwMnZSm-#t6pXdlD?%o z%*oxd0w|KX6LD(U7aF4f8|`?HDc(+V|LOiv(xQSK8CcrMS=qd?@vx&GiNFr|xIc$uxP>%y)$=?EkU+ zCtP_js`MzK7VSh38|y#fKMz6gKuoNbXWfd~o&v0J#yUzQ zK8UTbEi?QISM92pL-PU;aJBG1;=j1~qXRv_#BafU)ZiabO;)*sr1}YJm(kyI_z*OO4cv$i?25cL-5;j*j?81;f{YT_H`NHR7^XNq1vI;2 zl&zK_kYN^|V1AwdG>E55^+y<}> z4D{$m1Y^xd#vk|4Hs@K%xP^h8U&2O_r;(D0tLTy)m3l6x(aK6741!Xl8k3oS^6`8m zA9r}*p$un3ivh=IBap2&4Aj~+WFnfN&=2a#9*ufW#QnaNY&`(nje;+p2R}jOF!Kd< z<^i;s*myZRXqVPuXdZ?4Gr-wRoq<04tt(*O8)x%^-n2It-|%?zkJjF9|M&6XworH) zTp9bC|K-7-zJV)p|FY}f0zWVdZkz*iBieUv6=u`IJv~*n1_oesB>T};$8dj``&Wy~ z$=7h$M--p&cNDlibBBUs_^e2o%|j#@?W-@wN53!GeKD*Sc>Pb{(iV79I&e|g(w~-{ zs{KX|V{o_#Fpaz%I3%z_f_XiY{Ipdd3L#ps;Vl~qbRPoSt#^oN=FQy5LG$hd_k%;@ zC#MWLe0ZQ`v*Pra_cZcPi2{f*pD5+LF7r`C5SJJhPjE%900$XX`oMhdvweSFZti1T zp^q^7__Hi6cB5Dgc9UFis}%UPnsAZJk}q1i!M#|v!xOZ^mT0gGTz3Qr zn|8W`KN^O6Q2Lq28MIN)Ozu+nPg~B>3Iaobv)apPB^c$70v{cOd7=|8xwnd*-#PeC zx%qIO?>+B-(8nupwfqv+E6gQ_V5GLJ(6))GD?XLYpTS$)BMys}v-#g(y}FX3C%y3q zWlAVVtRKw)mMV-TH%E>*2l_un(e4woGm_Y#L#1O0z@{^hP9Z-z-v6VN$Lz3I#VOv0vT(+IU84cONui{REZN0w72T)t60De@m$&Ud{??5RuadRtT9s{ zZ5`2ti66ew@WhbzAjku_h22Sqp8bEKr)FdCU-TT>j^1Wdyz=Khz;AHuZw2v7I}jWD zwGA#({ZseFE0(<&Q32vsD(G37<@fpg-f7&kU`@;I^*I?F@G7hoL#MZKa`dkZnZM{S zwqIcoS$T9Tf zW}nPtZ&zsB2I|ZW7>?Az!kx8J5M|tY(>!YJ+cpG%DuJ;*z_H!k%a-z2`vz8=1Ma+l zS3CCH3NSprSP^D8?EV?k6(j-p6#5ZdI>0FR?*T~IQ1ESH;5RW^nxUxN4NhXG0v~e! 
z;ND27vCo>;A-i!$a*m-S1Yr*Qfq`qbcUb+vZOu3(&)>5n(I7d#%rC{G^xn4ig;H-@ z7oW%Thrmgp;2VJ9Y5&3ShS>O_$Ivv83M=Kz6Ps$_k4{_l&vAi+ z-Rc?pno9Y}WxENJk|OJYKEwBf>zur~;AvTvH@^!zK=><$@Z{pD;QTJNspWf+i`M5c z(CDoHIma8<%5l`ATO~&mx0bF;QGMl?%;7EX8J&_=u5}I z_6{*`kf&w7s~VTO(5 z`s+*_LOm{$<&nx4A0osMfPm5r^bD9Q%NHK>{d9cn*ssswuM^{mr9k=_9VqJrRd4cOt5Ph_9wKb{Ch`|F; zn#1SHS<|7!#ZBhZ{kr`hMyFPu`?*F=v&|X~_NTRv^~IA;jspEXjps&krtX0U{1Fh0 zOQ7wKDusM+a{7nrjvh!K&(}H4H;*G5HH-|UQpD72hQ9GHe%VkcFH$P4k_IpGHeSbP zf*P}N%rZN@uzD_BE7b%=AV_H)InP6s=0v7pL^4054om`%lpM@MBFC4&{ zt?)dDixdx}aFF-Ne?X|+z%)00#o8Mk5()BVMCym!ok>LNWa21Bh$?@@-*7ZHJEOPdmRn$ zg0%`|O*Q6yr2wPBcl9t1leVseJ$5No`gs3Rh4WvF#ZJvejog$@bH|-%LU;1_4|Apz z01=h~I#h3Ofj9yoguk#q`WAff7|TM7*|Oji`bFgkeykebuD}{?e40DyYK%wETx0L6 zTjgFxi?#YkJr8rIfLHZCAVGnNAk?E)VoS98!i%E;>#sKwZF|Lb-2yf>PS>Q?wKhk- zo*V*&K$&5ww1`<|H;)`CiFMtY)0mlxOIzOS8=D;2TXM&%^~>i@`&F9?W(;Wp3aCH< zPu?LMYa-QBJlZo|KbuhTYE|}GodzJlmt8T(`2wIp&>j!#xX}I#rV9>$rHmtW`5(r2 z*4^>}Q=cP82RJcIWy3O6;0dY+XacNp&@89x zlexxrx&6M`%7~jl={SGN6t7(xkMDK*dDjRlUBLG)vNJ{mRPelsFU{{hTs^Hwk=bwa z`pxgPy*T7m6_Qe0?J8}T=vs2(yLq*hVUSu4xKGqANgv?%LAL$vgGtL*nAuCe$yFUL z$=pnau{X*GA7dF^OuRbYvgECEb9~9S^oFxAI&(O``%|N%gj)_cC<{)F31B49qZ?Go|^LuwuZ0K6Bl!o~^rMZAHLel>)2{MLq&y#J8*?-B$8&RQnD&a00Quhr7!M z0j+gz(;9WWo|d&L7$x4p`fEg0bmyxZgT_)}%n2PD(k^Zf>U1#fs%|rpDbz9B_31^R zZ82wq52FhjbENvFN6X&NOz#@HWsl!>`S)G@Xsa>%!LZw!?L%r9G8T6N()(cd04w_-9?l7KCPyK9| zX*%ictRGc&%q>l;H9Miql_!%466=>j%ed)bm8i@9#PtOzGX!zAsYYzGOS$DW?RxF$ zc%B3wN@z?OV7)vaW7RUyf9q{`b};e=Csv3pA`j$|=p6~y-PKd0lYf!<8)7mCqt!1K zdX;2W*ePK?s8V9}LiH=~+C@grA+W+esdsOagB9_~(>(S^C5C(C>j3|K$A5{nXVucU z1@7u(DfA0p9?zkF2P`mgp$sVS5A>wa=7Gi}`BL2ZOPJ{e&Ze&wI(S%Uep~qntVv+< z%pS;jm2{Wm!!E!hUw;Pz*#Y$t2pZM&XOTM0z{4oyl}W6A|BLaw?Ejq|GwbVj*jQO< z%G>fON%#kLARBhkE00tPtyfi9V3H9;YM14WSK&#!)J)m`6uEKwVJtH3ObbVQ^FLj% z0PF^11I}D#RcXZaA(Kqjg%e;_Wr2*bHdL;^d$|A$$}I!s|E*l)>>P@fb1?x*jk`uC9FnCwt#6PjoUF?k?e=3 z#h9q18eQ_1YJeb?2Dvj1NUGG4YNFIJGZ=ov^3^#ek3E?4Zm8%l*dE`!Cnj7|>4|@Z)IMdl;$urcJY587Ei-V!L3agZmVUKQv zJo;%fhPO>8{XQcTW#`%DwOcd04l*RVv7n1q>})B@n58&Rk>%_? 
zPsaeGCr4+??@1*yrB-#l3a^r#!GIaos19tbfEEO_MhSa`mvyW3rL4GMuAv06%tROf z_yLFrEc;{Tu!(l;Cj#n$C~|MZlm)!wtsmfYeE+V0wx*0TnK=wm%9i}L++Iq;ELGvZ z|9wfeq$!#`7`v7ZQGk{%V~lWG0E0v3#r~wT^Pc?`3*n(gCH!7V-v$4#p%_5s?f4(t z)RP;mVyZA&Kcs+_NyxB*!=;`KwUA!35+V%m^R8m`IQ~g)rOUrkn~-0YH&2((CxeQR zcg&ew)Cvsb#oI*h=kh91W6Y(v7wA|F1`mj+$+Rd(JWrsDlf5~&O(F$ySm?@`Lx^9v z7g544#Xw&b+ue;>z`!%u2$u|-R5z)7b^6^O%Zl`tPtpM=@);$NoV8%>HK5Er|Dx++<#*N{G63tn zAPJRxdu>(v$9wNT^ak{|`M>7<0qQNRI14IwK)HdzXVT`Ym-D%0Z@kKyq<`ldFy%N* zhZMC9o~28;7M`-dKW~BYy|s&$PgS`M(Ji43Ns$*=i#?}J`ZC#P`-eT{tC#-^18ZmK zhlYKuarj-XP`jzuvp?K4pS-4f9?-T2H=#*yy6j4)6`MmPOwTF2O8oHUL;v%)AZb>P z1we{{LGHUjn^n-?;=B{*pG`e~GdFa*mF=JByMR&4aNr$OP28D@ul!eOWIbd3ckP?< j`Rlo%^+4%{M9)wEyV;jobO?(|f{gNX^>bP0l+XkKPXPk3 literal 0 HcmV?d00001 From dab35b55b564f26f3027b13886ad5fa3a5640a9d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 8 Feb 2023 13:27:35 -0800 Subject: [PATCH 019/137] add requirements file --- docs/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..93120e66c --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +docutils<0.18 From add360dd4f92659943f2c4a60f83b47c07e901d4 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 8 Feb 2023 13:33:17 -0800 Subject: [PATCH 020/137] correct copyright year --- docs/sphinx/user_guide/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 9854f87f8..898eb1f97 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## From 043c85bcb8ece743230219d235750b906ec123f1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 8 Feb 2023 14:50:25 -0800 Subject: [PATCH 021/137] initial part of getting started guide. --- docs/sphinx/user_guide/getting_started.rst | 177 +++++++++++++++++++++ docs/sphinx/user_guide/index.rst | 2 +- 2 files changed, 178 insertions(+), 1 deletion(-) create mode 100644 docs/sphinx/user_guide/getting_started.rst diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst new file mode 100644 index 000000000..7b36d354a --- /dev/null +++ b/docs/sphinx/user_guide/getting_started.rst @@ -0,0 +1,177 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _getting_started-label: + +********************************************* +Getting Started With RAJA Performance Suite +********************************************* + +This section should help get you up and running the RAJA Performannce Suite +quickly. + +============ +Requirements +============ + +The primary requirement for using the RAJA Performance Suite is a C++14 +standard compliant compiler. Different kernel variants use different +programming models like CUDA or HIP and must be supported by the compiler +you chose to build and run them. For the most part, available configuration +options and how to enable or disable them are similar to those in RAJA, +which are described in `RAJA Build Options `_. 
Later in this +section, we describe a few options that are specific to the RAJA Performance +Suite. + +To build the RAJA Performance Suite and run basic kernel variants, you will +need: + +- C++ compiler with C++14 support +- `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. + +================== +Get the Code +================== + +The RAJA Performance Suite project is hosted on GitHub: +`GitHub RAJA Performance Suite project `_. +To get the code, clone the repository into a local working space using the +command:: + + $ git clone --recursive https://github.com/LLNL/RAJAPerf.git + +The ``--recursive`` option above is used to pull RAJA Performance Suite +Git *submodules*, on which it depends, into your local copy of the repository. + +After running the clone command, a copy of the RAJA Performance Suite +repository will reside in the ``RAJAPerf`` subdirectory where you ran the +clone command. You will be on the ``develop`` branch, which is the default +RAJA Performance Suite branch. + +If you do not pass the ``--recursive`` argument to the ``git clone`` +command, you can also type the following commands after cloning:: + + $ cd RAJAPerf + $ git submodule update --init --recursive + +Either way, the end result is the same and you should be good to configure the +code and build it. + +.. note:: * If you switch branches in a RAJA Performance Suite repo (e.g., + you are on a branch, with everything up-to-date, and you run the + command ``git checkout ``, you may need to + run the command ``git submodule update`` to set the Git submodule + versions to what is used by the new branch. + * If the set of submodules in a new branch is different than the + previous branch you were on, you may need to run the command + ``git submodule update --init --recursive`` to pull in the + correct set of submodule and versions. + +.. _getting_started_depend-label: + +================== +Dependencies +================== + +The RAJA Performance Suite has several dependencies that are required based +on how you want to build and use it. The RAJA Performance Suite Git repository +has submodules that contain these dependencies. We recommend using the +submodules which are pinned to specific versions of those libraries for +most usage. + +The most important dependencies are: + +- `RAJA `_ +- `BLT build system `_ +- `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. +- A C++ 14 standard compliant compiler + +Please see `RAJA Dependencies `_ for more information. + +.. _getting_started_build-label: + +================== +Build and Install +================== + +The process to build and install the RAJA Performance Suite is similar to +the process for RAJA. Please see `RAJA Build and Install `_ +for more information. + +When building the RAJA Performance Suite, +RAJA and the RAJA Performance Suite are built together using the same CMake +configuration. For convenience, we include scripts in the ``scripts`` +directory that invoke corresponding configuration files (CMake cache files) +in the RAJA submodule. For example, the ``scripts/lc-builds`` directory +contains scripts that show how we build code for testing on platforms in +the Lawrence Livermore Computing Center. Each build script creates a +descriptively-named build space directory in the top-level RAJA Performance +Suite directory and runs CMake with a configuration appropriate for the +platform and compilers used. 
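If you want to see which of these preconfigured build scripts are available before picking one, it is enough to list the ``scripts/lc-builds`` directory named above; the exact set of script names varies by branch and platform, so no particular listing is assumed here::

   $ ls scripts/lc-builds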
After CMake completes, enter the build directory +and type `make` (or `make -j ` for a parallel build using N processor +cores). If you omit the number of cores, the code will build in parallel +using all available cores on the node you are running on to compile the code. +For example:: + + $ ./scripts/blueos_nvcc_clang.sh 10.2.89 70 10.0.1 + $ cd build_blueos_nvcc10.2.89-sm_70-clang10.0.1 + $ make -j + +will build the code for CPU-GPU execution using the clang 10.0.1 compiler for +the CPU and CUDA 10.2.89 for the GPU. The GPU executable code will target +the CUDA compute architecture ``sm_70``. + +.. note:: The scripts in the ``scripts/lc-builds`` directory contain + helpful examples of running CMake to generate a variety of + build configurations. + +The provided configurations will only build the Performance Suite code by +default; i.e., it will not build the RAJA Performance Suite test codes. If you +want to build the tests, for example, to verify your build is working properly, +just pass the following options to CMake ``-DENABLE_TESTS=On`` and +``-DRAJA_PERFSUITE_ENABLE_TESTS=On``, either on the command line if you run +CMake directly or edit the script you are running to do this. Then, when the +build completes, you can type `make test` to run the RAJA Performance Suite +tests. + +You can also create your own build directory and run CMake with your own +options from there. For example:: + + & mkdir my-build + & cd my-build + & cmake ../ + & make -j + +Building with MPI +----------------- + +Some of the provided configurations will build the Performance Suite with +MPI support enabled. For example:: + + $ ./scripts/blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 70 10.0.1 + $ cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-sm_70-clang10.0.1 + $ make -j + +In general MPI support can be enabled by passing the `-DENABLE_MPI=On` option +to CMake and providing a mpi compiler wrapper via the +``-DMPI_CXX_COMPILER=/path/to/mpic++`` option to CMake in addition to other +CMake options. For example:: + + $ mkdir my-mpi-build + $ cd my-mpi-build + $ cmake -DENABLE_MPI=On -DMPI_CXX_COMPILER=/path/to/mpic++ ../ + $ make -j + +When MPI is enabled, you can run the RAJA Performance Suite in a way that +mimicks how a real application would run, such as by fully utilizing a GPU +or all CPU cores. MPI-enabled execution is supported to generate realistic +performance data with the Suite. + +Building with specific GPU thread-block size tunings +----------------------------------------------------- + diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 898eb1f97..53a620d3e 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -23,4 +23,4 @@ both of which contain code examples. .. toctree:: :maxdepth: 3 -.. 
## getting_started + getting_started From 22b043dfb13d91263110bedc27e4632ad4f7dd0d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 9 Feb 2023 16:29:38 -0800 Subject: [PATCH 022/137] Adding basic user guide content --- docs/index.rst | 62 +++++++++--- docs/sphinx/user_guide/getting_started.rst | 106 ++++++++++++++------- docs/sphinx/user_guide/index.rst | 10 +- docs/sphinx/user_guide/running.rst | 96 +++++++++++++++++++ 4 files changed, 223 insertions(+), 51 deletions(-) create mode 100644 docs/sphinx/user_guide/running.rst diff --git a/docs/index.rst b/docs/index.rst index 969d4fe1a..191025ea5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,9 +12,52 @@ RAJA Performance Suite ######################## ============================= -Background and Motivation +Motivation and Background ============================= +The RAJA Performance Suite is designed to explore performance of loop-based +computational kernels found in HPC applications. Specifically, it can be +used to assess and monitor runtime performance of kernels implemented using +`RAJA `_ C++ performance portability +abstractions. The Suite contains a variety of kernels implemented using +common parallel programming models, such as OpenMP and CUDA. Some important +terminology used in the Suite includes: + + * `Kernel` is a distinct loop-based computation that appears in the Suite in + multiple variants (or implementations), each of which performs the same + computation. + * `Group` is a collection of kernels in the Suite that are grouped together + because they originate from the same source, such as a specific benchmark + suite. + * `Variant` refers to implementations of Suite kernels that share the same + approach/abstraction and programming model, such as baseline OpenMP, RAJA + OpenMP, etc. + * `Tuning` is a particular implementation of a variant of a kernel in the + Suite, such as GPU thread-block size 128, GPU thread-block size 256, etc. + +Each kernel in the Suite appears in multiple RAJA and non-RAJA (i.e., baseline) +variants using parallel programming models that RAJA supports. Some kernels have +multiple tunings of a variant to explore some of the parametrization that the +programming model supports. The kernels originate from various HPC benchmark +suites and applications. For example, the "Stream" group contains kernels from +the Babel Stream benchmark, the "Apps" group contains kernels extracted from +real scientific computing applications, and so forth. + +The Suite can be run as a single process or with multiple processes when +configured with MPI support. When running with multiple MPI ranks, the same +code is executed on all ranks. Ranks are synchronized before and after each +kernel executes to gather timing data to rank zero. Running with MPI in the +same configuration used by an HPC app allows the Suite to generate performance +data that is more relevant for that HPC app than performance data generated +running with a single process. For example, running sequentially with one MPI +rank per core vs running sequentially with a single process yields different +performance results on most multi-core CPUs because bandwidth resources are +exercised differently. + +More information about running the Suite for different types of performance +studies is provided in the +:doc:`RAJA Performance Suite User Guide ` + ================================= Git Repository and Issue Tracking ================================= @@ -35,26 +78,19 @@ interested in improving the Suite and exploring new ways to use it. 
The best way to communicate with us is via our email list: ``raja-dev@llnl.gov`` -A brief description of how to start a contribution to RAJA can be found in -:ref:`contributing-label`. - -=========================================== -RAJA Performance Suite User Documentation -=========================================== +========================================================= +RAJA Performance Suite User and Developer Documentation +========================================================= * :doc:`RAJA Performance Suite User Guide ` -================================================ -RAJA Performance Suite Developer Documentation -================================================ - - * :doc:`RAJA Developer Guide ` + * :doc:`RAJA Performance Suite Developer Guide ` ====================================================== RAJA Copyright and License Information ====================================================== -Please see :ref:`raja-copyright`. +Please see :ref:`rajaperf-copyright`. .. toctree:: :hidden: diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst index 7b36d354a..9716fc33e 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/getting_started.rst @@ -12,9 +12,11 @@ Getting Started With RAJA Performance Suite ********************************************* -This section should help get you up and running the RAJA Performannce Suite +This section should help get you building the RAJA Performance Suite code quickly. +.. _getting_started_reqs-label: + ============ Requirements ============ @@ -34,8 +36,10 @@ need: - C++ compiler with C++14 support - `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. +.. _getting_started_getcode-label: + ================== -Get the Code +Getting the Code ================== The RAJA Performance Suite project is hosted on GitHub: @@ -45,18 +49,22 @@ command:: $ git clone --recursive https://github.com/LLNL/RAJAPerf.git -The ``--recursive`` option above is used to pull RAJA Performance Suite +The ``--recursive`` option is used to pull all RAJA Performance Suite Git *submodules*, on which it depends, into your local copy of the repository. -After running the clone command, a copy of the RAJA Performance Suite +After running the ``git clone`` command, a copy of the RAJA Performance Suite repository will reside in the ``RAJAPerf`` subdirectory where you ran the clone command. You will be on the ``develop`` branch, which is the default -RAJA Performance Suite branch. +RAJA Performance Suite branch. For example:: + + $ cd RAJAPerf + $ git branch | grep \* + * develop If you do not pass the ``--recursive`` argument to the ``git clone`` -command, you can also type the following commands after cloning:: +command, you can also type the following command in the ``RAJAPerf`` +directory after cloning:: - $ cd RAJAPerf $ git submodule update --init --recursive Either way, the end result is the same and you should be good to configure the @@ -78,11 +86,10 @@ code and build it. Dependencies ================== -The RAJA Performance Suite has several dependencies that are required based -on how you want to build and use it. The RAJA Performance Suite Git repository -has submodules that contain these dependencies. We recommend using the -submodules which are pinned to specific versions of those libraries for -most usage. +The RAJA Performance Suite has several dependencies that are required, and +which are contained in the Suite submodules. 
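One way to confirm exactly which versions of those submodule dependencies your working copy is pinned to is the standard ``git submodule status`` command. The output format shown below is only a sketch; the actual submodule paths and commit hashes depend on the branch or release you have checked out::

   $ git submodule status
    <commit-sha> <submodule-path> (<version-tag>)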
For most usage, we recommend +using the submodules which are pinned to specific versions of those libraries +for each branch/release. The most important dependencies are: @@ -91,7 +98,8 @@ The most important dependencies are: - `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. - A C++ 14 standard compliant compiler -Please see `RAJA Dependencies `_ for more information. +Please see `RAJA Dependencies `_ for more information about +RAJA dependencies. .. _getting_started_build-label: @@ -99,24 +107,24 @@ Please see `RAJA Dependencies `_ for more information. When building the RAJA Performance Suite, RAJA and the RAJA Performance Suite are built together using the same CMake configuration. For convenience, we include scripts in the ``scripts`` -directory that invoke corresponding configuration files (CMake cache files) +directory that invoke associated configuration files (CMake cache files) in the RAJA submodule. For example, the ``scripts/lc-builds`` directory contains scripts that show how we build code for testing on platforms in -the Lawrence Livermore Computing Center. Each build script creates a +the Computing Center at Lawrence Livermore National Laboratory. Each build +script creates a descriptively-named build space directory in the top-level RAJA Performance Suite directory and runs CMake with a configuration appropriate for the -platform and compilers used. After CMake completes, enter the build directory -and type `make` (or `make -j ` for a parallel build using N processor -cores). If you omit the number of cores, the code will build in parallel -using all available cores on the node you are running on to compile the code. -For example:: +platform and specified compiler(s). After CMake completes, enter the build +directory and type ``make`` (or ``make -j `` or ``make -j`` for a parallel +build using N processor cores, or all available processor cores on a node, +respectively). For example:: $ ./scripts/blueos_nvcc_clang.sh 10.2.89 70 10.0.1 $ cd build_blueos_nvcc10.2.89-sm_70-clang10.0.1 @@ -130,15 +138,6 @@ the CUDA compute architecture ``sm_70``. helpful examples of running CMake to generate a variety of build configurations. -The provided configurations will only build the Performance Suite code by -default; i.e., it will not build the RAJA Performance Suite test codes. If you -want to build the tests, for example, to verify your build is working properly, -just pass the following options to CMake ``-DENABLE_TESTS=On`` and -``-DRAJA_PERFSUITE_ENABLE_TESTS=On``, either on the command line if you run -CMake directly or edit the script you are running to do this. Then, when the -build completes, you can type `make test` to run the RAJA Performance Suite -tests. - You can also create your own build directory and run CMake with your own options from there. For example:: @@ -147,10 +146,35 @@ options from there. For example:: & cmake ../ & make -j +When no CMake test options are provided, only the RAJA Performance Suite code +will be built. If you want to build both the Suite tests and RAJA tests (to +verify that everything is built properly), pass the following options to +CMake: ``-DENABLE_TESTS=On`` and ``-DRAJA_PERFSUITE_ENABLE_TESTS=On``. This +can be done on the command line if you run CMake directly or by editing the +build script you are using. If you want to build the Suite tests, but not +RAJA tests, pass the two CMake options above plus the option +``-DRAJA_ENABLE_TESTS=Off``. 
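As a concrete illustration of the test-related options just described, a configuration that builds the Suite tests but not the RAJA tests could be generated roughly as follows; the build directory name is illustrative, and any other options you need (compilers, programming model back-ends, etc.) would be added to the same ``cmake`` line::

   $ mkdir my-test-build
   $ cd my-test-build
   $ cmake -DENABLE_TESTS=On -DRAJA_PERFSUITE_ENABLE_TESTS=On -DRAJA_ENABLE_TESTS=Off ..
   $ make -j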
+ +In any case, after the build completes, you can type `make test` to run the +tests you have chosen to build and see the results. + +.. note:: Which kernel variants that can be run depend on which programming + model features have been enabled for a build. By default, only + *sequential* CPU RAJA and baseline variants will be built. To + additionally enable OpenMP variants, for example, you must pass the + ``DENABLE_OPENMP=On`` option to CMake. Similarly, for CUDA, HIP, + and other programming model variants. + +.. important:: For GPU-enabled builds, only one GPU back-end can be enabled + in a single executable. However, CPU and GPU enabled execution + can be enabled in a single executable. For example, one can + enable CPU sequential, OpenMP, and CUDA GPU variants in a build. + Similarly for HIP GPU variants. + Building with MPI ----------------- -Some of the provided configurations will build the Performance Suite with +Some provided configurations will build the Performance Suite with MPI support enabled. For example:: $ ./scripts/blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 70 10.0.1 @@ -158,20 +182,34 @@ MPI support enabled. For example:: $ make -j In general MPI support can be enabled by passing the `-DENABLE_MPI=On` option -to CMake and providing a mpi compiler wrapper via the +to CMake and providing a MPI compiler wrapper via the ``-DMPI_CXX_COMPILER=/path/to/mpic++`` option to CMake in addition to other CMake options. For example:: $ mkdir my-mpi-build $ cd my-mpi-build - $ cmake -DENABLE_MPI=On -DMPI_CXX_COMPILER=/path/to/mpic++ ../ + $ cmake ``. For example:: + + $ mkdir my-gpu-build + $ cd my-gpu-build + $ cmake \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ + .. + $ make -j + diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 53a620d3e..f7c86d90e 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -12,15 +12,17 @@ RAJA Performance Suite User Guide ################################### If you have some familiarity with RAJA and want to get up and running quickly, -check out :ref:`getting_started-label`. This guide contains information -about accessing the RAJA Performance Suite code, building it, and basic usage. +check out :ref:`getting_started-label`, which contains information about +getting the RAJA Performance Suite code, how to build it, how to run it, etc. -If you want information about RAJA, please see the +If you want more information about RAJA, please see the `RAJA User Guide `_ -That Guidecontains a detailed discussion of RAJA features and a RAJA tutorial, +That Guide contains a detailed discussion of RAJA features and a RAJA tutorial, both of which contain code examples. .. toctree:: :maxdepth: 3 getting_started + running +.. ## output diff --git a/docs/sphinx/user_guide/running.rst b/docs/sphinx/user_guide/running.rst new file mode 100644 index 000000000..4edce4ce6 --- /dev/null +++ b/docs/sphinx/user_guide/running.rst @@ -0,0 +1,96 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _running-label: + +********************************************* +Running the RAJA Performance Suite +********************************************* + +This section describes how to run the Suite and which execution options are +available. 
+ +After the Suite is compiled, the executable will be located in the ``bin`` +subdirectory of the build space directory. The executable will be able to run +all kernels and variants that have been built depending on which programming +model back-ends have been enabled via CMake options. + +To execute the Suite in its default mode, run the executable with no +command-line arguments:: + + $ ./bin/raja-perf.exe + +This will run all kernels and variants that have been built in their default +configurations. Some information describing how the Suite will run along with +some information about each kernel will appear on the screen. More information +about kernel details will also appear in a run report file generated in your +run directory after Suite execution completes. + +.. note:: You can pass the ``--dryrun`` command-line option to the executable + to see a summary of how the Suite will execute without actually + running it. + +The Suite can be run in a variety of ways that are determined by the options +passed to the executable. For example, you can run or exclude subsets of +kernels, variants, or groups. You can also pass other options to set problem +sizes, number of times each kernel is run (sampled), etc. The idea is to build +the code once and use scripts or other means to run the Suite in different +ways for analyses you want to perform. + +All options appear in a *long form* with a double hyphen prefix (i.e., '--'). +Commonly used options are also available in a one or two character *short form* +with a single hyphen prefix (i.e., '-') for convenience. To see available +options along with a brief description of each, pass the `--help` or `-h` +option to the executable:: + + $ ./bin/raja-perf.exe --help + +or + + $ ./bin/raja-perf.exe -h + +.. note:: To see all available Suite options, pass the `--help` or `-h` + option to the executable. + +Lastly, the program will emit a summary of command-line arguments it was given +if the input contains something that the code does not know how to parse. +The ill-formed input will be noted in the summary. Hopefully, this makes +it easy for users to correct erroneous usage, such as mis-spelled option names. + +================== +Running with MPI +================== + +Running the Suite with MPI is just like running any other MPI application. +For example:: + + $ srun -n 2 ./bin/raja-perf.exe + +will run the entire Suite (all kernels and variants) in their default +configurations on each of 2 MPI ranks. + +The kernel information output shows how +each kernel is run on each rank. The total problem size across all MPI ranks +can be calculated by multiplying the number of MPI ranks by the problem +size in the kernel information. Timing is reported on rank 0 and is gathered +by doing an MPI barrier, starting the timer, running the kernel repetitions, +doing an MPI barrier, and then stopping the timer. + +====================== +OpenMP target offload +====================== + +The OpenMP target offload variants of the kernels in the Suite are +considered a work-in-progress since the RAJA OpenMP target offload back-end +is a work-in-progress. If you configure them to build, they can be run with +the executable `./bin/raja-perf-omptarget.exe` which is distinct from the one +described above. At the time the OpenMP target offload variants were +developed, it was not possible for them to co-exist in the same executable +as the CUDA variants, for example. 
In the future, the build system may +be reworked so that the OpenMP target variants can be run from the same +executable as the other variants. From 0cb7a35239acb97e948a57c04669bb16017f84e9 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Feb 2023 14:52:03 -0800 Subject: [PATCH 023/137] Clean up text and clarify points. --- docs/index.rst | 72 ++++++++------- docs/sphinx/user_guide/getting_started.rst | 102 ++++++++++----------- docs/sphinx/user_guide/index.rst | 13 ++- 3 files changed, 92 insertions(+), 95 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 191025ea5..f716cdf24 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,43 +16,49 @@ Motivation and Background ============================= The RAJA Performance Suite is designed to explore performance of loop-based -computational kernels found in HPC applications. Specifically, it can be +computational kernels found in HPC applications. Specifically, it is used to assess and monitor runtime performance of kernels implemented using -`RAJA `_ C++ performance portability -abstractions. The Suite contains a variety of kernels implemented using -common parallel programming models, such as OpenMP and CUDA. Some important -terminology used in the Suite includes: +`RAJA C++ performance portability abstractions `_. +The Suite contains a variety of kernels implemented using common parallel +programming models, such as OpenMP and CUDA. Some important +terminology used in the Suite implementation and discussion includes: - * `Kernel` is a distinct loop-based computation that appears in the Suite in + * **Kernel** is a distinct loop-based computation that appears in the Suite in multiple variants (or implementations), each of which performs the same computation. - * `Group` is a collection of kernels in the Suite that are grouped together - because they originate from the same source, such as a specific benchmark - suite. - * `Variant` refers to implementations of Suite kernels that share the same - approach/abstraction and programming model, such as baseline OpenMP, RAJA - OpenMP, etc. - * `Tuning` is a particular implementation of a variant of a kernel in the - Suite, such as GPU thread-block size 128, GPU thread-block size 256, etc. - -Each kernel in the Suite appears in multiple RAJA and non-RAJA (i.e., baseline) -variants using parallel programming models that RAJA supports. Some kernels have -multiple tunings of a variant to explore some of the parametrization that the -programming model supports. The kernels originate from various HPC benchmark -suites and applications. For example, the "Stream" group contains kernels from -the Babel Stream benchmark, the "Apps" group contains kernels extracted from -real scientific computing applications, and so forth. + * **Group** is a subset of kernels in the Suite that originated from the + same source, such as a specific benchmark suite. + * **Variant** refers to implementations of Suite kernels that share the same + implementation approach and programming model, such as *baseline OpenMP*, + *RAJA OpenMP*, etc. + * **Tuning** refers to an implementation of kernels with a particular + execution parameterization, such as GPU thread-block size 128, GPU + thread-block size 256, etc. + +The kernels in the Suite originate from various HPC benchmark suites and +applications. For example, the "Stream" group contains kernels from the Babel +Stream benchmark, the "Apps" group contains kernels extracted from +real scientific computing applications, and so forth. 
Each kernel in the Suite +appears in multiple RAJA and non-RAJA (i.e., *baseline*) variants that use +parallel programming models supported by RAJA. Some kernels have multiple +tunings of a variant to explore the performance implications of options that +a programming model supports. + +.. note:: Available variants for a kernel do not need to include all possible + variants in the Suite. In some cases, a kernel appears only in the + subset of variants that makes sense for the particular kernel. The Suite can be run as a single process or with multiple processes when -configured with MPI support. When running with multiple MPI ranks, the same -code is executed on all ranks. Ranks are synchronized before and after each -kernel executes to gather timing data to rank zero. Running with MPI in the -same configuration used by an HPC app allows the Suite to generate performance -data that is more relevant for that HPC app than performance data generated -running with a single process. For example, running sequentially with one MPI +configured with MPI support. When running with multiple MPI processes, the same +code is executed on each rank. Ranks are synchronized before and after each +kernel executes to gather timing data to rank zero. Running with multiple +MPI processes helps the Suite generate performance data that is more +realistic for HPC applications than performance data generated running with +a single process. For example, running sequentially with one MPI rank per core vs running sequentially with a single process yields different performance results on most multi-core CPUs because bandwidth resources are -exercised differently. +exercised differently. Similarly, for GPU systems where multiple MPI ranks +are necessary to fully utilize GPU compute resources. More information about running the Suite for different types of performance studies is provided in the @@ -62,10 +68,10 @@ studies is provided in the Git Repository and Issue Tracking ================================= -The main interaction hub for the RAJA Performance Suite is -`GitHub `_ There you will find the Git -source code repository, issue tracker, release history, and other information -about the project. +The main interaction hub for the RAJA Performance Suite is its +`GitHub project `_. There you will find +the Git source code repository, issue tracker, release history, and other +information about the project. ================================ Communicating with the RAJA Team diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst index 9716fc33e..6ee0fa3f9 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/getting_started.rst @@ -12,8 +12,8 @@ Getting Started With RAJA Performance Suite ********************************************* -This section should help get you building the RAJA Performance Suite code -quickly. +This section should help you build the RAJA Performance Suite code so you can +start running it. .. _getting_started_reqs-label: @@ -21,21 +21,19 @@ quickly. Requirements ============ -The primary requirement for using the RAJA Performance Suite is a C++14 -standard compliant compiler. Different kernel variants use different -programming models like CUDA or HIP and must be supported by the compiler -you chose to build and run them. For the most part, available configuration -options and how to enable or disable them are similar to those in RAJA, -which are described in `RAJA Build Options `_. 
Later in this -section, we describe a few options that are specific to the RAJA Performance -Suite. - -To build the RAJA Performance Suite and run basic kernel variants, you will -need: +The primary requirement for building the RAJA Performance Suite are: - C++ compiler with C++14 support - `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. +Different kernel variants use different +programming models, like CUDA or HIP. Those models must be supported by the +compiler you are using to be able to build and run the corresponding variants. +For the most part, available configuration options and how to enable or +disable them are similar to `RAJA build options `_. + +Later, we discuss options that are specific to the RAJA Performance Suite. + .. _getting_started_getcode-label: ================== @@ -49,7 +47,7 @@ command:: $ git clone --recursive https://github.com/LLNL/RAJAPerf.git -The ``--recursive`` option is used to pull all RAJA Performance Suite +The ``--recursive`` option is required to pull all RAJA Performance Suite Git *submodules*, on which it depends, into your local copy of the repository. After running the ``git clone`` command, a copy of the RAJA Performance Suite @@ -73,9 +71,9 @@ code and build it. .. note:: * If you switch branches in a RAJA Performance Suite repo (e.g., you are on a branch, with everything up-to-date, and you run the command ``git checkout ``, you may need to - run the command ``git submodule update`` to set the Git submodule - versions to what is used by the new branch. - * If the set of submodules in a new branch is different than the + run the command ``git submodule update`` to set the Git *submodule + versions* to what is used by the new branch. + * If the *set of submodules* in a new branch is different than the previous branch you were on, you may need to run the command ``git submodule update --init --recursive`` to pull in the correct set of submodule and versions. @@ -89,16 +87,14 @@ Dependencies The RAJA Performance Suite has several dependencies that are required, and which are contained in the Suite submodules. For most usage, we recommend using the submodules which are pinned to specific versions of those libraries -for each branch/release. +in each branch or release. The most important dependencies are: - `RAJA `_ - `BLT build system `_ -- `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. -- A C++ 14 standard compliant compiler -Please see `RAJA Dependencies `_ for more information about +RAJA also contains dependencies. Please see `RAJA Dependencies `_ for more information about RAJA dependencies. .. _getting_started_build-label: @@ -113,18 +109,27 @@ for more information. When building the RAJA Performance Suite, RAJA and the RAJA Performance Suite are built together using the same CMake -configuration. For convenience, we include scripts in the ``scripts`` -directory that invoke associated configuration files (CMake cache files) -in the RAJA submodule. For example, the ``scripts/lc-builds`` directory -contains scripts that show how we build code for testing on platforms in -the Computing Center at Lawrence Livermore National Laboratory. Each build -script creates a -descriptively-named build space directory in the top-level RAJA Performance -Suite directory and runs CMake with a configuration appropriate for the -platform and specified compiler(s). 
After CMake completes, enter the build -directory and type ``make`` (or ``make -j `` or ``make -j`` for a parallel -build using N processor cores, or all available processor cores on a node, -respectively). For example:: +configuration. The basic process for generating a build space and configuration +is to create a build directory and run CMake from there. For example:: + + $ pwd + path/to/RAJAPerf + $ mkdir my-build + $ cd my-build + $ cmake .. + $ make -j + +For convenience, we include scripts in the ``scripts`` directory for various +build configurations that invoke associated *host-config* files (CMake cache +files) in the RAJA submodule. For example, the ``scripts/lc-builds`` directory +contains scripts that can be used to generate build configurations for machines +in the Computing Center at Lawrence Livermore National Laboratory. These +scripts are designed to be run in the top-level RAJAPerf directory. Each build +script creates a descriptively-named build space directory and runs CMake with +a configuration appropriate for the platform and specified compiler(s). After +CMake completes, enter the build directory and type ``make`` +(or ``make -j `` or ``make -j`` for a parallel build using N processor +cores, or all available processor cores on a node, respectively). For example:: $ ./scripts/blueos_nvcc_clang.sh 10.2.89 70 10.0.1 $ cd build_blueos_nvcc10.2.89-sm_70-clang10.0.1 @@ -138,14 +143,6 @@ the CUDA compute architecture ``sm_70``. helpful examples of running CMake to generate a variety of build configurations. -You can also create your own build directory and run CMake with your own -options from there. For example:: - - & mkdir my-build - & cd my-build - & cmake ../ - & make -j - When no CMake test options are provided, only the RAJA Performance Suite code will be built. If you want to build both the Suite tests and RAJA tests (to verify that everything is built properly), pass the following options to @@ -153,10 +150,8 @@ CMake: ``-DENABLE_TESTS=On`` and ``-DRAJA_PERFSUITE_ENABLE_TESTS=On``. This can be done on the command line if you run CMake directly or by editing the build script you are using. If you want to build the Suite tests, but not RAJA tests, pass the two CMake options above plus the option -``-DRAJA_ENABLE_TESTS=Off``. - -In any case, after the build completes, you can type `make test` to run the -tests you have chosen to build and see the results. +``-DRAJA_ENABLE_TESTS=Off``. In any case, after the build completes, you can +type `make test` to run the tests you have chosen to build and see the results. .. note:: Which kernel variants that can be run depend on which programming model features have been enabled for a build. By default, only @@ -174,14 +169,16 @@ tests you have chosen to build and see the results. Building with MPI ----------------- -Some provided configurations will build the Performance Suite with -MPI support enabled. For example:: +Earlier, we mentioned that the Suite can be built with MPI enabled and +described why this is useful. Some configuration scripts we provide will +configure a build for the Performance Suite with MPI support enabled. 
+For example:: $ ./scripts/blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 70 10.0.1 $ cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-sm_70-clang10.0.1 $ make -j -In general MPI support can be enabled by passing the `-DENABLE_MPI=On` option +In general, MPI support can be enabled by passing the `-DENABLE_MPI=On` option to CMake and providing a MPI compiler wrapper via the ``-DMPI_CXX_COMPILER=/path/to/mpic++`` option to CMake in addition to other CMake options. For example:: @@ -193,20 +190,15 @@ CMake options. For example:: .. $ make -j -When MPI is enabled, you can run the RAJA Performance Suite in a way that -mimics how a real application would run, such as by fully utilizing a GPU -or all CPU cores. MPI-enabled execution is supported to generate realistic -performance data with the Suite. - Building with specific GPU thread-block size tunings ----------------------------------------------------- If desired, you can build a version of the RAJA Performance Suite code with -multiple GPU kernel versions that will run with different GPU thread-block +multiple versions of GPU kernels that will run with different GPU thread-block sizes. The CMake option for this is ``-DRAJA_PERFSUITE_GPU_BLOCKSIZES=``. For example:: - $ mkdir my-gpu-build + $ mkdir my-gnu-build $ cd my-gpu-build $ cmake \ -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index f7c86d90e..e65fd9132 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -11,14 +11,13 @@ RAJA Performance Suite User Guide ################################### -If you have some familiarity with RAJA and want to get up and running quickly, -check out :ref:`getting_started-label`, which contains information about -getting the RAJA Performance Suite code, how to build it, how to run it, etc. +If you have some familiarity with RAJA and want to get up and running with +the RAJA Performance Suite quickly, check out :ref:`getting_started-label`, +which contains information about getting the code, building it, running it, etc. -If you want more information about RAJA, please see the -`RAJA User Guide `_ -That Guide contains a detailed discussion of RAJA features and a RAJA tutorial, -both of which contain code examples. +If you want to learn more about RAJA, please see the +`RAJA User Guide `_, which contains a detailed discussion of RAJA features and a RAJA +tutorial, both of which contain code examples. .. toctree:: :maxdepth: 3 From f09f0caeffee0b069fce8633100b7657abc66b72 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Feb 2023 15:48:17 -0800 Subject: [PATCH 024/137] section clean up. --- docs/sphinx/user_guide/getting_started.rst | 109 +++++++++++---------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst index 6ee0fa3f9..48f40896f 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/getting_started.rst @@ -26,13 +26,11 @@ The primary requirement for building the RAJA Performance Suite are: - C++ compiler with C++14 support - `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. -Different kernel variants use different -programming models, like CUDA or HIP. Those models must be supported by the -compiler you are using to be able to build and run the corresponding variants. 
For the most part, available configuration options and how to enable or disable them are similar to `RAJA build options `_. -Later, we discuss options that are specific to the RAJA Performance Suite. +Later in this section, we discuss options that are specific to the +RAJA Performance Suite. .. _getting_started_getcode-label: @@ -40,7 +38,7 @@ Later, we discuss options that are specific to the RAJA Performance Suite. Getting the Code ================== -The RAJA Performance Suite project is hosted on GitHub: +The RAJA Performance Suite code is hosted on the `GitHub RAJA Performance Suite project `_. To get the code, clone the repository into a local working space using the command:: @@ -48,7 +46,7 @@ command:: $ git clone --recursive https://github.com/LLNL/RAJAPerf.git The ``--recursive`` option is required to pull all RAJA Performance Suite -Git *submodules*, on which it depends, into your local copy of the repository. +Git *submodules* on which it depends into your local copy of the repository. After running the ``git clone`` command, a copy of the RAJA Performance Suite repository will reside in the ``RAJAPerf`` subdirectory where you ran the @@ -65,18 +63,20 @@ directory after cloning:: $ git submodule update --init --recursive -Either way, the end result is the same and you should be good to configure the +Either way, the result is the same and you should be good to configure the code and build it. -.. note:: * If you switch branches in a RAJA Performance Suite repo (e.g., - you are on a branch, with everything up-to-date, and you run the - command ``git checkout ``, you may need to +.. note:: * If you are in your local copy of the RAJA Performance Suite repo + and you switch to a different repo branch (e.g., you run the + command ``git checkout ``), you may need to run the command ``git submodule update`` to set the Git *submodule - versions* to what is used by the new branch. + versions* to what is used by the new branch. To see if this is + required, the ``git status`` command will indicate whether the + submodules are at the proper versions. * If the *set of submodules* in a new branch is different than the previous branch you were on, you may need to run the command - ``git submodule update --init --recursive`` to pull in the - correct set of submodule and versions. + ``git submodule update --init --recursive`` (described above) to + pull in the correct set of submodule and versions. .. _getting_started_depend-label: @@ -84,8 +84,8 @@ code and build it. Dependencies ================== -The RAJA Performance Suite has several dependencies that are required, and -which are contained in the Suite submodules. For most usage, we recommend +The RAJA Performance Suite has several required dependencies. These are +contained in the Suite Git submodules. So for most usage, we recommend using the submodules which are pinned to specific versions of those libraries in each branch or release. @@ -94,8 +94,8 @@ The most important dependencies are: - `RAJA `_ - `BLT build system `_ -RAJA also contains dependencies. Please see `RAJA Dependencies `_ for more information about -RAJA dependencies. +RAJA also contains dependencies, which are discussed in +`RAJA Dependencies `_. .. _getting_started_build-label: @@ -104,13 +104,12 @@ Build and Install ================== The build and install process for the RAJA Performance Suite is similar to -the process for RAJA. Please see `RAJA Build and Install `_ -for more information. +the process for RAJA, which is described in `RAJA Build and Install `_. 
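As one concrete example of those RAJA-style configuration options, enabling the OpenMP kernel variants on top of a default build should only require the ``-DENABLE_OPENMP=On`` switch that the kernel-variant note elsewhere in this guide refers to, assuming your compiler provides OpenMP support; the build directory name here is illustrative::

   $ mkdir my-omp-build
   $ cd my-omp-build
   $ cmake -DENABLE_OPENMP=On ..
   $ make -j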
-When building the RAJA Performance Suite, -RAJA and the RAJA Performance Suite are built together using the same CMake -configuration. The basic process for generating a build space and configuration -is to create a build directory and run CMake from there. For example:: +When building the RAJA Performance Suite, RAJA and the RAJA Performance Suite +are built together using the same CMake configuration. The basic process for +generating a build space and configuration is to create a build directory and +run CMake in it. For example:: $ pwd path/to/RAJAPerf @@ -119,19 +118,21 @@ is to create a build directory and run CMake from there. For example:: $ cmake .. $ make -j -For convenience, we include scripts in the ``scripts`` directory for various -build configurations that invoke associated *host-config* files (CMake cache -files) in the RAJA submodule. For example, the ``scripts/lc-builds`` directory -contains scripts that can be used to generate build configurations for machines -in the Computing Center at Lawrence Livermore National Laboratory. These -scripts are designed to be run in the top-level RAJAPerf directory. Each build +For convenience and informational purposes, we maintain scripts in the +``scripts`` directory for various build configurations. These scripts invoke +associated *host-config* files (CMake cache files) in the RAJA submodule. For +example, the ``scripts/lc-builds`` directory contains scripts that we use +during development to generate build configurations for machines +in the Livermore Computing Center at Lawrence Livermore National Laboratory. +These scripts are designed to be run in the top-level RAJAPerf directory. Each script creates a descriptively-named build space directory and runs CMake with -a configuration appropriate for the platform and specified compiler(s). After -CMake completes, enter the build directory and type ``make`` -(or ``make -j `` or ``make -j`` for a parallel build using N processor -cores, or all available processor cores on a node, respectively). For example:: +a configuration appropriate for the platform and specified compiler(s). To +compile the code after CMake completes, enter the build directory and type +``make`` (or ``make -j `` or ``make -j`` for a parallel build using N +processor cores, or all available processor cores on a node, respectively). +For example:: - $ ./scripts/blueos_nvcc_clang.sh 10.2.89 70 10.0.1 + $ ./scripts/lc-builds/blueos_nvcc_clang.sh 10.2.89 70 10.0.1 $ cd build_blueos_nvcc10.2.89-sm_70-clang10.0.1 $ make -j @@ -151,41 +152,43 @@ can be done on the command line if you run CMake directly or by editing the build script you are using. If you want to build the Suite tests, but not RAJA tests, pass the two CMake options above plus the option ``-DRAJA_ENABLE_TESTS=Off``. In any case, after the build completes, you can -type `make test` to run the tests you have chosen to build and see the results. +type ``make test`` to run the tests you have built and see the results. -.. note:: Which kernel variants that can be run depend on which programming - model features have been enabled for a build. By default, only - *sequential* CPU RAJA and baseline variants will be built. To - additionally enable OpenMP variants, for example, you must pass the - ``DENABLE_OPENMP=On`` option to CMake. Similarly, for CUDA, HIP, - and other programming model variants. +.. note:: Which kernel variants that can be run depends on which programming + model features have been enabled in a build configuration. 
By + default, only *sequential* CPU RAJA and baseline variants will be + built. To additionally enable OpenMP variants, for example, you must + pass the ``DENABLE_OPENMP=On`` option to CMake. Similar options will + enable other variants for CUDA, HIP, and other programming models. -.. important:: For GPU-enabled builds, only one GPU back-end can be enabled - in a single executable. However, CPU and GPU enabled execution - can be enabled in a single executable. For example, one can - enable CPU sequential, OpenMP, and CUDA GPU variants in a build. - Similarly for HIP GPU variants. +.. note:: For GPU-enabled builds, only one GPU back-end can be enabled in a + single executable. However, CPU and GPU enabled execution can be + enabled in a single executable. For example, one can enable CPU + sequential, OpenMP, and CUDA GPU variants in a build. Similarly + for HIP GPU variants. Building with MPI ----------------- Earlier, we mentioned that the Suite can be built with MPI enabled and described why this is useful. Some configuration scripts we provide will -configure a build for the Performance Suite with MPI support enabled. -For example:: +configure a build with MPI support enabled. For example:: - $ ./scripts/blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 70 10.0.1 + $ ./scripts/lc-builds/lc-blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 70 10.0.1 $ cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-sm_70-clang10.0.1 $ make -j +This will configure a build to use the *rolling release* of the Spectrum MPI +implementation for an appropriate Livermore Computing system. + In general, MPI support can be enabled by passing the `-DENABLE_MPI=On` option to CMake and providing a MPI compiler wrapper via the -``-DMPI_CXX_COMPILER=/path/to/mpic++`` option to CMake in addition to other -CMake options. For example:: +``-DMPI_CXX_COMPILER=/path/to/mpic++`` option to CMake, in addition to other +necessary CMake options. For example:: $ mkdir my-mpi-build $ cd my-mpi-build - $ cmake \ -DENABLE_MPI=On -DMPI_CXX_COMPILER=/path/to/mpic++ \ .. $ make -j @@ -205,3 +208,5 @@ sizes. The CMake option for this is .. $ make -j +will build versions of GPU kernels that use 64, 128, 256, 512, and 1024 threads +per GPU thread-block. From 96585a6c368237ba964b40de16fa868e967253d8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 17 Feb 2023 10:07:22 -0800 Subject: [PATCH 025/137] Use RAJA::allocate_aligned_type for all host memory --- src/common/DataUtils.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index ad607e413..197737057 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -84,21 +84,20 @@ void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) */ void allocData(Int_ptr& ptr, int len) { - // Should we do this differently for alignment?? If so, change dealloc() - ptr = new Int_type[len]; + ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, + len*sizeof(Int_type)); } void allocData(Real_ptr& ptr, int len) { - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, + len*sizeof(Real_type)); } void allocData(Complex_ptr& ptr, int len) { - // Should we do this differently for alignment?? 
If so, change dealloc() - ptr = new Complex_type[len]; + ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, + len*sizeof(Complex_type)); } @@ -108,8 +107,8 @@ void allocData(Complex_ptr& ptr, int len) void deallocData(Int_ptr& ptr) { if (ptr) { - delete [] ptr; - ptr = 0; + RAJA::free_aligned(ptr); + ptr = nullptr; } } @@ -117,15 +116,15 @@ void deallocData(Real_ptr& ptr) { if (ptr) { RAJA::free_aligned(ptr); - ptr = 0; + ptr = nullptr; } } void deallocData(Complex_ptr& ptr) { if (ptr) { - delete [] ptr; - ptr = 0; + RAJA::free_aligned(ptr); + ptr = nullptr; } } From 6bc8c51a902dac06f195a3ff14afcc71c1d3377b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 17 Feb 2023 10:31:09 -0800 Subject: [PATCH 026/137] Add alignment arg to Data utils Add helper functions in KernelBase that call the Data utils to avoid having to change allocations everywhere. --- src/apps/HALOEXCHANGE.cpp | 62 +++++++++++------------------- src/apps/HALOEXCHANGE.hpp | 17 +++++++++ src/apps/HALOEXCHANGE_FUSED.cpp | 62 +++++++++++------------------- src/apps/HALOEXCHANGE_FUSED.hpp | 17 +++++++++ src/common/DataUtils.cpp | 51 +++++++++++++------------ src/common/DataUtils.hpp | 48 ++++++++++++----------- src/common/KernelBase.cpp | 2 +- src/common/KernelBase.hpp | 67 +++++++++++++++++++++++++++++++++ 8 files changed, 198 insertions(+), 128 deletions(-) diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 0d8eac44f..1a8c1272a 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -19,28 +19,6 @@ namespace rajaperf namespace apps { -namespace { - -void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); -void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); -void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); -void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); - -} - - HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE, params) { @@ -172,14 +150,17 @@ struct Extent Index_type k_max; }; +} + // // Function to generate index lists for packing. // -void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE::create_pack_lists( + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) { std::vector pack_index_list_extents(num_neighbors); @@ -304,9 +285,10 @@ void create_pack_lists(std::vector& pack_index_lists, // // Function to destroy packing index lists. // -void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE::destroy_pack_lists( + std::vector& pack_index_lists, + const Index_type num_neighbors, + VariantID vid) { (void) vid; @@ -318,11 +300,12 @@ void destroy_pack_lists(std::vector& pack_index_lists, // // Function to generate index lists for unpacking. 
// -void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE::create_unpack_lists( + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) { std::vector unpack_index_list_extents(num_neighbors); @@ -447,9 +430,10 @@ void create_unpack_lists(std::vector& unpack_index_lists, // // Function to destroy unpacking index lists. // -void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE::destroy_unpack_lists( + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid) { (void) vid; @@ -458,7 +442,5 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, } } -} // end namespace - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index 50854612e..1f21d9616 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -126,6 +126,23 @@ class HALOEXCHANGE : public KernelBase std::vector m_pack_index_list_lengths; std::vector m_unpack_index_lists; std::vector m_unpack_index_list_lengths; + + void create_pack_lists(std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + void destroy_pack_lists(std::vector& pack_index_lists, + const Index_type num_neighbors, + VariantID vid); + void create_unpack_lists(std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + void destroy_unpack_lists(std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid); }; } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index b7c0bfd84..5c986df75 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -19,28 +19,6 @@ namespace rajaperf namespace apps { -namespace { - -void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); -void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); -void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); -void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); - -} - - HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE_FUSED, params) { @@ -172,14 +150,17 @@ struct Extent Index_type k_max; }; +} + // // Function to generate index lists for packing. 
// -void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE_FUSED::create_pack_lists( + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) { std::vector pack_index_list_extents(num_neighbors); @@ -304,9 +285,10 @@ void create_pack_lists(std::vector& pack_index_lists, // // Function to destroy packing index lists. // -void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE_FUSED::destroy_pack_lists( + std::vector& pack_index_lists, + const Index_type num_neighbors, + VariantID vid) { (void) vid; @@ -318,11 +300,12 @@ void destroy_pack_lists(std::vector& pack_index_lists, // // Function to generate index lists for unpacking. // -void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE_FUSED::create_unpack_lists( + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) { std::vector unpack_index_list_extents(num_neighbors); @@ -447,9 +430,10 @@ void create_unpack_lists(std::vector& unpack_index_lists, // // Function to destroy unpacking index lists. // -void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) +void HALOEXCHANGE_FUSED::destroy_unpack_lists( + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid) { (void) vid; @@ -458,7 +442,5 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, } } -} // end namespace - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index f9ec78d8d..b0af7e60e 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -170,6 +170,23 @@ class HALOEXCHANGE_FUSED : public KernelBase std::vector m_pack_index_list_lengths; std::vector m_unpack_index_lists; std::vector m_unpack_index_list_lengths; + + void create_pack_lists(std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + void destroy_pack_lists(std::vector& pack_index_lists, + const Index_type num_neighbors, + VariantID vid); + void create_unpack_lists(std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + void destroy_unpack_lists(std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid); }; } // end namespace apps diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 197737057..0d78f8915 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -16,6 +16,9 @@ namespace rajaperf { +namespace detail +{ + static int data_init_count = 0; /* @@ -36,45 +39,42 @@ void incDataInitCount() /* - * Allocate and initialize aligned integer data arrays. + * Allocate and initialize aligned data arrays. 
*/ -void allocAndInitData(Int_ptr& ptr, int len, VariantID vid) +void allocAndInitData(Int_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len); + allocData(ptr, len, align); initData(ptr, len, vid); } -/* - * Allocate and initialize aligned data arrays. - */ -void allocAndInitData(Real_ptr& ptr, int len, VariantID vid ) +void allocAndInitData(Real_ptr& ptr, int len, int align, VariantID vid ) { - allocData(ptr, len); + allocData(ptr, len, align); initData(ptr, len, vid); } -void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, +void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, VariantID vid) { - allocData(ptr, len); + allocData(ptr, len, align); initDataConst(ptr, len, val, vid); } -void allocAndInitDataRandSign(Real_ptr& ptr, int len, VariantID vid) +void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len); + allocData(ptr, len, align); initDataRandSign(ptr, len, vid); } -void allocAndInitDataRandValue(Real_ptr& ptr, int len, VariantID vid) +void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len); + allocData(ptr, len, align); initDataRandValue(ptr, len, vid); } -void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) +void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len); + allocData(ptr, len, align); initData(ptr, len, vid); } @@ -82,22 +82,22 @@ void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) /* * Allocate data arrays of given type. */ -void allocData(Int_ptr& ptr, int len) +void allocData(Int_ptr& ptr, int len, int align) { - ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Int_type)); + ptr = RAJA::allocate_aligned_type( + align, len*sizeof(Int_type)); } -void allocData(Real_ptr& ptr, int len) +void allocData(Real_ptr& ptr, int len, int align) { - ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + ptr = RAJA::allocate_aligned_type( + align, len*sizeof(Real_type)); } -void allocData(Complex_ptr& ptr, int len) +void allocData(Complex_ptr& ptr, int len, int align) { - ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Complex_type)); + ptr = RAJA::allocate_aligned_type( + align, len*sizeof(Complex_type)); } @@ -328,6 +328,7 @@ void initData(Real_type& d, VariantID vid) incDataInitCount(); } +} // closing brace for detail namespace /* * Calculate and return checksum for data arrays. diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index d3ca53cfd..5251e59b9 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -31,6 +31,8 @@ namespace rajaperf { +namespace detail +{ /*! * Reset counter for data initialization. @@ -48,16 +50,16 @@ void incDataInitCount(); * * Array is initialized using method initData(Int_ptr& ptr...) below. */ -void allocAndInitData(Int_ptr& ptr, int len, - VariantID vid = NumVariants); +void allocAndInitData(Int_ptr& ptr, int len, int align, + VariantID vid); /*! * \brief Allocate and initialize aligned Real_type data array. * * Array is initialized using method initData(Real_ptr& ptr...) below. */ -void allocAndInitData(Real_ptr& ptr, int len, - VariantID vid = NumVariants); +void allocAndInitData(Real_ptr& ptr, int len, int align, + VariantID vid); /*! * \brief Allocate and initialize aligned Real_type data array. @@ -65,16 +67,16 @@ void allocAndInitData(Real_ptr& ptr, int len, * Array entries are initialized using the method * initDataConst(Real_ptr& ptr...) 
below. */ -void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid = NumVariants); +void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, + VariantID vid); /*! * \brief Allocate and initialize aligned Real_type data array with random sign. * * Array is initialized using method initDataRandSign(Real_ptr& ptr...) below. */ -void allocAndInitDataRandSign(Real_ptr& ptr, int len, - VariantID vid = NumVariants); +void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, + VariantID vid); /*! * \brief Allocate and initialize aligned Real_type data array with random @@ -82,23 +84,23 @@ void allocAndInitDataRandSign(Real_ptr& ptr, int len, * * Array is initialized using method initDataRandValue(Real_ptr& ptr...) below. */ -void allocAndInitDataRandValue(Real_ptr& ptr, int len, - VariantID vid = NumVariants); +void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, + VariantID vid); /*! * \brief Allocate and initialize aligned Complex_type data array. */ -void allocAndInitData(Complex_ptr& ptr, int len, - VariantID vid = NumVariants); +void allocAndInitData(Complex_ptr& ptr, int len, int align, + VariantID vid); /*! * \brief Allocate data arrays. */ -void allocData(Int_ptr& ptr, int len); +void allocData(Int_ptr& ptr, int len, int align); /// -void allocData(Real_ptr& ptr, int len); +void allocData(Real_ptr& ptr, int len, int align); /// -void allocData(Complex_ptr& ptr, int len); +void allocData(Complex_ptr& ptr, int len, int align); /*! * \brief Free data arrays. @@ -118,7 +120,7 @@ void deallocData(Complex_ptr& ptr); * a value > 1, one to a value < -1. */ void initData(Int_ptr& ptr, int len, - VariantID vid = NumVariants); + VariantID vid); /*! * \brief Initialize Real_type data array. @@ -128,7 +130,7 @@ void initData(Int_ptr& ptr, int len, * and the order in which this method is called. */ void initData(Real_ptr& ptr, int len, - VariantID vid = NumVariants); + VariantID vid); /*! * \brief Initialize Real_type data array. @@ -136,7 +138,7 @@ void initData(Real_ptr& ptr, int len, * Array entries are set to given constant value. */ void initDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid = NumVariants); + VariantID vid); /*! * \brief Initialize Real_type data array with random sign. @@ -145,7 +147,7 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val, * initData(Real_ptr& ptr...) above, but with random sign. */ void initDataRandSign(Real_ptr& ptr, int len, - VariantID vid = NumVariants); + VariantID vid); /*! * \brief Initialize Real_type data array with random values. @@ -153,7 +155,7 @@ void initDataRandSign(Real_ptr& ptr, int len, * Array entries are initialized with random values in the interval [0.0, 1.0]. */ void initDataRandValue(Real_ptr& ptr, int len, - VariantID vid = NumVariants); + VariantID vid); /*! * \brief Initialize Complex_type data array. @@ -162,7 +164,7 @@ void initDataRandValue(Real_ptr& ptr, int len, * method allocAndInitData(Real_ptr& ptr...) above. */ void initData(Complex_ptr& ptr, int len, - VariantID vid = NumVariants); + VariantID vid); /*! * \brief Initialize Real_type scalar data. @@ -171,7 +173,9 @@ void initData(Complex_ptr& ptr, int len, * initData(Real_ptr& ptr...) above. */ void initData(Real_type& d, - VariantID vid = NumVariants); + VariantID vid); + +} // closing brace for detail namespace /*! * \brief Calculate and return checksum for data arrays. 
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 764770ca7..c620c4880 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -160,7 +160,7 @@ void KernelBase::execute(VariantID vid, size_t tune_idx) resetTimer(); - resetDataInitCount(); + detail::resetDataInitCount(); this->setUp(vid, tune_idx); this->runKernel(vid, tune_idx); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 885650336..d58384066 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -188,6 +188,73 @@ class KernelBase #endif } + // run_params.getMemoryAlignment(); } + int getDataAlignment() const { return RAJA::DATA_ALIGN; } + template + void allocData(T*& ptr, int len) + { + rajaperf::detail::allocData(ptr, len, getDataAlignment()); + } + template + void allocAndInitData(T*& ptr, int len, + VariantID vid = NumVariants) + { + rajaperf::detail::allocAndInitData(ptr, len, getDataAlignment(), vid); + } + template + void allocAndInitDataConst(T*& ptr, int len, T val, + VariantID vid = NumVariants) + { + rajaperf::detail::allocAndInitDataConst(ptr, len, getDataAlignment(), val, vid); + } + template + void allocAndInitDataRandSign(T*& ptr, int len, + VariantID vid = NumVariants) + { + rajaperf::detail::allocAndInitDataRandSign(ptr, len, getDataAlignment(), vid); + } + template + void allocAndInitDataRandValue(T*& ptr, int len, + VariantID vid = NumVariants) + { + rajaperf::detail::allocAndInitDataRandValue(ptr, len, getDataAlignment(), vid); + } + template + void deallocData(T*& ptr) + { + rajaperf::detail::deallocData(ptr); + } + template + void initData(T*& ptr, int len, + VariantID vid = NumVariants) + { + rajaperf::detail::initData(ptr, len, vid); + } + template + void initDataConst(T*& ptr, int len, T val, + VariantID vid = NumVariants) + { + rajaperf::detail::initDataConst(ptr, len, val, vid); + } + template + void initDataRandSign(T*& ptr, int len, + VariantID vid = NumVariants) + { + rajaperf::detail::initDataRandSign(ptr, len, vid); + } + template + void initDataRandValue(T*& ptr, int len, + VariantID vid = NumVariants) + { + rajaperf::detail::initDataRandValue(ptr, len, vid); + } + template + void initData(T& d, + VariantID vid = NumVariants) + { + rajaperf::detail::initData(d, vid); + } + void startTimer() { synchronize(); From ec074ce025f45ecb68ff9efcb93555e4cae04a96 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 17 Feb 2023 14:30:18 -0800 Subject: [PATCH 027/137] Add vid to allocation and deallocation --- src/algorithm/MEMCPY.cpp | 4 +-- src/algorithm/MEMSET.cpp | 2 +- src/algorithm/REDUCE_SUM.cpp | 2 +- src/algorithm/SCAN.cpp | 4 +-- src/algorithm/SORT.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 4 +-- src/apps/CONVECTION3DPA.cpp | 12 +++---- src/apps/DEL_DOT_VEC_2D.cpp | 10 +++--- src/apps/DIFFUSION3DPA.cpp | 10 +++--- src/apps/ENERGY.cpp | 38 +++++++++++----------- src/apps/FIR.cpp | 4 +-- src/apps/HALOEXCHANGE.cpp | 8 ++--- src/apps/HALOEXCHANGE_FUSED.cpp | 8 ++--- src/apps/LTIMES.cpp | 6 ++-- src/apps/LTIMES_NOVIEW.cpp | 6 ++-- src/apps/MASS3DPA.cpp | 10 +++--- src/apps/NODAL_ACCUMULATION_3D.cpp | 4 +-- src/apps/PRESSURE.cpp | 18 +++++----- src/apps/VOL3D.cpp | 8 ++--- src/apps/WIP-COUPLE.cpp | 10 +++--- src/basic/DAXPY.cpp | 6 ++-- src/basic/DAXPY_ATOMIC.cpp | 6 ++-- src/basic/IF_QUAD.cpp | 10 +++--- src/basic/INDEXLIST.cpp | 4 +-- src/basic/INDEXLIST_3LOOP.cpp | 4 +-- src/basic/INIT3.cpp | 10 +++--- src/basic/INIT_VIEW1D.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 6 ++-- 
src/basic/MULADDSUB.cpp | 10 +++--- src/basic/PI_ATOMIC.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 2 +- src/basic/REDUCE_STRUCT.cpp | 4 +-- src/common/DataUtils.cpp | 36 +++++++++++++------- src/common/DataUtils.hpp | 18 ++++++---- src/common/KernelBase.hpp | 28 ++++++++-------- src/lcals/DIFF_PREDICT.cpp | 4 +-- src/lcals/EOS.cpp | 8 ++--- src/lcals/FIRST_DIFF.cpp | 4 +-- src/lcals/FIRST_MIN.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 4 +-- src/lcals/GEN_LIN_RECUR.cpp | 8 ++--- src/lcals/HYDRO_1D.cpp | 6 ++-- src/lcals/HYDRO_2D.cpp | 22 ++++++------- src/lcals/INT_PREDICT.cpp | 18 +++++----- src/lcals/PLANCKIAN.cpp | 10 +++--- src/lcals/TRIDIAG_ELIM.cpp | 8 ++--- src/polybench/POLYBENCH_2MM.cpp | 10 +++--- src/polybench/POLYBENCH_3MM.cpp | 14 ++++---- src/polybench/POLYBENCH_ADI.cpp | 8 ++--- src/polybench/POLYBENCH_ATAX.cpp | 8 ++--- src/polybench/POLYBENCH_FDTD_2D.cpp | 8 ++--- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 4 +-- src/polybench/POLYBENCH_GEMM.cpp | 6 ++-- src/polybench/POLYBENCH_GEMVER.cpp | 18 +++++----- src/polybench/POLYBENCH_GESUMMV.cpp | 8 ++--- src/polybench/POLYBENCH_HEAT_3D.cpp | 8 ++--- src/polybench/POLYBENCH_JACOBI_1D.cpp | 8 ++--- src/polybench/POLYBENCH_JACOBI_2D.cpp | 8 ++--- src/polybench/POLYBENCH_MVT.cpp | 10 +++--- src/stream/ADD.cpp | 6 ++-- src/stream/COPY.cpp | 4 +-- src/stream/DOT.cpp | 4 +-- src/stream/MUL.cpp | 4 +-- src/stream/TRIAD.cpp | 6 ++-- 65 files changed, 288 insertions(+), 268 deletions(-) diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index fcc1af34f..583a19dea 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -71,8 +71,8 @@ void MEMCPY::updateChecksum(VariantID vid, size_t tune_idx) void MEMCPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); + deallocData(m_x, vid); + deallocData(m_y, vid); } } // end namespace algorithm diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 98fc2e4db..fdc98b3fe 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -72,7 +72,7 @@ void MEMSET::updateChecksum(VariantID vid, size_t tune_idx) void MEMSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); + deallocData(m_x, vid); } } // end namespace algorithm diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 8ccd2bbab..f2f2b25d2 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -72,7 +72,7 @@ void REDUCE_SUM::updateChecksum(VariantID vid, size_t tune_idx) void REDUCE_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); + deallocData(m_x, vid); } } // end namespace algorithm diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 7a4d9091c..7b2933084 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -75,8 +75,8 @@ void SCAN::updateChecksum(VariantID vid, size_t tune_idx) void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); + deallocData(m_x, vid); + deallocData(m_y, vid); } } // end namespace algorithm diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index ce059ad75..049c03304 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -60,7 +60,7 @@ void SORT::updateChecksum(VariantID vid, size_t tune_idx) void SORT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); + deallocData(m_x, vid); } } // end namespace 
algorithm diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 65ef046ff..96d79a7df 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -62,8 +62,8 @@ void SORTPAIRS::updateChecksum(VariantID vid, size_t tune_idx) void SORTPAIRS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_i); + deallocData(m_x, vid); + deallocData(m_i, vid); } } // end namespace algorithm diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index 83010024b..46eb341af 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -90,12 +90,12 @@ void CONVECTION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx { (void) vid; - deallocData(m_B); - deallocData(m_Bt); - deallocData(m_G); - deallocData(m_D); - deallocData(m_X); - deallocData(m_Y); + deallocData(m_B, vid); + deallocData(m_Bt, vid); + deallocData(m_G, vid); + deallocData(m_D, vid); + deallocData(m_X, vid); + deallocData(m_Y, vid); } } // end namespace apps diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 04c2d29f0..9fe3c3e85 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -96,11 +96,11 @@ void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx { (void) vid; - deallocData(m_x); - deallocData(m_y); - deallocData(m_xdot); - deallocData(m_ydot); - deallocData(m_div); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_xdot, vid); + deallocData(m_ydot, vid); + deallocData(m_div, vid); } } // end namespace apps diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index b59d21ad1..3b3a2cb31 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -90,11 +90,11 @@ void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) { (void) vid; - deallocData(m_B); - deallocData(m_G); - deallocData(m_D); - deallocData(m_X); - deallocData(m_Y); + deallocData(m_B, vid); + deallocData(m_G, vid); + deallocData(m_D, vid); + deallocData(m_X, vid); + deallocData(m_Y, vid); } } // end namespace apps diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 61799e150..8e77961b6 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -86,10 +86,10 @@ void ENERGY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) allocAndInitData(m_qq_old, getActualProblemSize(), vid); allocAndInitData(m_vnewc, getActualProblemSize(), vid); - initData(m_rho0); - initData(m_e_cut); - initData(m_emin); - initData(m_q_cut); + initData(m_rho0, vid); + initData(m_e_cut, vid); + initData(m_emin, vid); + initData(m_q_cut, vid); } void ENERGY::updateChecksum(VariantID vid, size_t tune_idx) @@ -102,21 +102,21 @@ void ENERGY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_e_new); - deallocData(m_e_old); - deallocData(m_delvc); - deallocData(m_p_new); - deallocData(m_p_old); - deallocData(m_q_new); - deallocData(m_q_old); - deallocData(m_work); - deallocData(m_compHalfStep); - deallocData(m_pHalfStep); - deallocData(m_bvc); - deallocData(m_pbvc); - deallocData(m_ql_old); - deallocData(m_qq_old); - deallocData(m_vnewc); + deallocData(m_e_new, vid); + deallocData(m_e_old, vid); + deallocData(m_delvc, vid); + deallocData(m_p_new, vid); + deallocData(m_p_old, vid); + deallocData(m_q_new, vid); + deallocData(m_q_old, vid); + deallocData(m_work, vid); + deallocData(m_compHalfStep, vid); + deallocData(m_pHalfStep, vid); + deallocData(m_bvc, vid); + 
deallocData(m_pbvc, vid); + deallocData(m_ql_old, vid); + deallocData(m_qq_old, vid); + deallocData(m_vnewc, vid); } } // end namespace apps diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 75c0e803a..792f015d0 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -77,8 +77,8 @@ void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_in); - deallocData(m_out); + deallocData(m_in, vid); + deallocData(m_out, vid); } } // end namespace apps diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 1a8c1272a..53ec0ecc1 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -120,7 +120,7 @@ void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) void HALOEXCHANGE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l]); + deallocData(m_buffers[l], vid); } m_buffers.clear(); @@ -133,7 +133,7 @@ void HALOEXCHANGE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) m_pack_index_lists.clear(); for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v]); + deallocData(m_vars[v], vid); } m_vars.clear(); } @@ -293,7 +293,7 @@ void HALOEXCHANGE::destroy_pack_lists( (void) vid; for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l]); + deallocData(pack_index_lists[l], vid); } } @@ -438,7 +438,7 @@ void HALOEXCHANGE::destroy_unpack_lists( (void) vid; for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l]); + deallocData(unpack_index_lists[l], vid); } } diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 5c986df75..5486c3645 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -120,7 +120,7 @@ void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l]); + deallocData(m_buffers[l], vid); } m_buffers.clear(); @@ -133,7 +133,7 @@ void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune m_pack_index_lists.clear(); for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v]); + deallocData(m_vars[v], vid); } m_vars.clear(); } @@ -293,7 +293,7 @@ void HALOEXCHANGE_FUSED::destroy_pack_lists( (void) vid; for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l]); + deallocData(pack_index_lists[l], vid); } } @@ -438,7 +438,7 @@ void HALOEXCHANGE_FUSED::destroy_unpack_lists( (void) vid; for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l]); + deallocData(unpack_index_lists[l], vid); } } diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index cad55c72e..b920631dd 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -99,9 +99,9 @@ void LTIMES::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_phidat); - deallocData(m_elldat); - deallocData(m_psidat); + deallocData(m_phidat, vid); + deallocData(m_elldat, vid); + deallocData(m_psidat, vid); } } // end namespace apps diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 9064f7c11..2f8dd4b40 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -98,9 +98,9 @@ void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) { (void) vid; - deallocData(m_phidat); - deallocData(m_elldat); - 
deallocData(m_psidat); + deallocData(m_phidat, vid); + deallocData(m_elldat, vid); + deallocData(m_psidat, vid); } } // end namespace apps diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index ab4a5095f..107f74408 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -86,11 +86,11 @@ void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_B); - deallocData(m_Bt); - deallocData(m_D); - deallocData(m_X); - deallocData(m_Y); + deallocData(m_B, vid); + deallocData(m_Bt, vid); + deallocData(m_D, vid); + deallocData(m_X, vid); + deallocData(m_Y, vid); } } // end namespace apps diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index c041d2dfa..7ed2f0399 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -89,8 +89,8 @@ void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t { (void) vid; - deallocData(m_x); - deallocData(m_vol); + deallocData(m_x, vid); + deallocData(m_vol, vid); } } // end namespace apps diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index ea32d1b19..18979f3bd 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -66,10 +66,10 @@ void PRESSURE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) allocAndInitData(m_e_old, getActualProblemSize(), vid); allocAndInitData(m_vnewc, getActualProblemSize(), vid); - initData(m_cls); - initData(m_p_cut); - initData(m_pmin); - initData(m_eosvmax); + initData(m_cls, vid); + initData(m_p_cut, vid); + initData(m_pmin, vid); + initData(m_eosvmax, vid); } void PRESSURE::updateChecksum(VariantID vid, size_t tune_idx) @@ -81,11 +81,11 @@ void PRESSURE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_compression); - deallocData(m_bvc); - deallocData(m_p_new); - deallocData(m_e_old); - deallocData(m_vnewc); + deallocData(m_compression, vid); + deallocData(m_bvc, vid); + deallocData(m_p_new, vid); + deallocData(m_e_old, vid); + deallocData(m_vnewc, vid); } } // end namespace apps diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 9593b2735..a1097163a 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -96,10 +96,10 @@ void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); - deallocData(m_z); - deallocData(m_vol); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_z, vid); + deallocData(m_vol, vid); } } // end namespace apps diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index 33e505231..73c8fecc9 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -193,11 +193,11 @@ void COUPLE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_t0); - deallocData(m_t1); - deallocData(m_t2); - deallocData(m_denac); - deallocData(m_denlw); + deallocData(m_t0, vid); + deallocData(m_t1, vid); + deallocData(m_t2, vid); + deallocData(m_denac, vid); + deallocData(m_denlw, vid); } } // end namespace apps diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index f9f3c9d2d..67f4b0eb0 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -63,7 +63,7 @@ void DAXPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); - initData(m_a); + initData(m_a, vid); } void DAXPY::updateChecksum(VariantID vid, size_t tune_idx) @@ -74,8 
+74,8 @@ void DAXPY::updateChecksum(VariantID vid, size_t tune_idx) void DAXPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); + deallocData(m_x, vid); + deallocData(m_y, vid); } } // end namespace basic diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index 1b9017273..e58516d2f 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -63,7 +63,7 @@ void DAXPY_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); - initData(m_a); + initData(m_a, vid); } void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) @@ -74,8 +74,8 @@ void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) void DAXPY_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); + deallocData(m_x, vid); + deallocData(m_y, vid); } } // end namespace basic diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 57ef34f7a..b0fe3469b 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -81,11 +81,11 @@ void IF_QUAD::updateChecksum(VariantID vid, size_t tune_idx) void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_a); - deallocData(m_b); - deallocData(m_c); - deallocData(m_x1); - deallocData(m_x2); + deallocData(m_a, vid); + deallocData(m_b, vid); + deallocData(m_c, vid); + deallocData(m_x1, vid); + deallocData(m_x2, vid); } } // end namespace basic diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index df523fbf6..f5614a7f2 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -71,8 +71,8 @@ void INDEXLIST::updateChecksum(VariantID vid, size_t tune_idx) void INDEXLIST::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_list); + deallocData(m_x, vid); + deallocData(m_list, vid); } } // end namespace basic diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index e7d4215fa..3ddb3fc0c 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -80,8 +80,8 @@ void INDEXLIST_3LOOP::updateChecksum(VariantID vid, size_t tune_idx) void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_list); + deallocData(m_x, vid); + deallocData(m_list, vid); } } // end namespace basic diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 130fbc3b4..b2be64f84 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -78,11 +78,11 @@ void INIT3::updateChecksum(VariantID vid, size_t tune_idx) void INIT3::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_out1); - deallocData(m_out2); - deallocData(m_out3); - deallocData(m_in1); - deallocData(m_in2); + deallocData(m_out1, vid); + deallocData(m_out2, vid); + deallocData(m_out3, vid); + deallocData(m_in1, vid); + deallocData(m_in2, vid); } } // end namespace basic diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index dd52b057e..de34c5a28 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -74,7 +74,7 @@ void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tune_idx) void INIT_VIEW1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_a); + deallocData(m_a, vid); } } // 
end namespace basic diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 2dcda4f4a..fe1867698 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -74,7 +74,7 @@ void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tune_idx) void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_a); + deallocData(m_a, vid); } } // end namespace basic diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 98cd878ce..afbaaf0c2 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -78,9 +78,9 @@ void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tune_idx) { void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void)vid; - deallocData(m_A); - deallocData(m_B); - deallocData(m_C); + deallocData(m_A, vid); + deallocData(m_B, vid); + deallocData(m_C, vid); } } // end namespace basic diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 1d5f1bfcc..4ae8d6868 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -78,11 +78,11 @@ void MULADDSUB::updateChecksum(VariantID vid, size_t tune_idx) void MULADDSUB::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_out1); - deallocData(m_out2); - deallocData(m_out3); - deallocData(m_in1); - deallocData(m_in2); + deallocData(m_out1, vid); + deallocData(m_out2, vid); + deallocData(m_out3, vid); + deallocData(m_in1, vid); + deallocData(m_in2, vid); } } // end namespace basic diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 5810f7598..e1f93dd2f 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -76,7 +76,7 @@ void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_pi); + deallocData(m_pi, vid); } } // end namespace basic diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index d5cf9f4c2..975bf8f24 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -86,7 +86,7 @@ void REDUCE3_INT::updateChecksum(VariantID vid, size_t tune_idx) void REDUCE3_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_vec); + deallocData(m_vec, vid); } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 4479d832e..5edaadede 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -92,8 +92,8 @@ void REDUCE_STRUCT::updateChecksum(VariantID vid, size_t tune_idx) void REDUCE_STRUCT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); + deallocData(m_x, vid); + deallocData(m_y, vid); } } // end namespace basic diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 0d78f8915..a2f3a959a 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -43,38 +43,38 @@ void incDataInitCount() */ void allocAndInitData(Int_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align); + allocData(ptr, len, align, vid); initData(ptr, len, vid); } void allocAndInitData(Real_ptr& ptr, int len, int align, VariantID vid ) { - allocData(ptr, len, align); + allocData(ptr, len, align, vid); initData(ptr, len, vid); } void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, VariantID vid) { - allocData(ptr, 
len, align); + allocData(ptr, len, align, vid); initDataConst(ptr, len, val, vid); } void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align); + allocData(ptr, len, align, vid); initDataRandSign(ptr, len, vid); } void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align); + allocData(ptr, len, align, vid); initDataRandValue(ptr, len, vid); } void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align); + allocData(ptr, len, align, vid); initData(ptr, len, vid); } @@ -82,20 +82,26 @@ void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) /* * Allocate data arrays of given type. */ -void allocData(Int_ptr& ptr, int len, int align) +void allocData(Int_ptr& ptr, int len, int align, + VariantID vid) { + (void)vid; ptr = RAJA::allocate_aligned_type( align, len*sizeof(Int_type)); } -void allocData(Real_ptr& ptr, int len, int align) +void allocData(Real_ptr& ptr, int len, int align, + VariantID vid) { + (void)vid; ptr = RAJA::allocate_aligned_type( align, len*sizeof(Real_type)); } -void allocData(Complex_ptr& ptr, int len, int align) +void allocData(Complex_ptr& ptr, int len, int align, + VariantID vid) { + (void)vid; ptr = RAJA::allocate_aligned_type( align, len*sizeof(Complex_type)); } @@ -104,24 +110,30 @@ void allocData(Complex_ptr& ptr, int len, int align) /* * Free data arrays of given type. */ -void deallocData(Int_ptr& ptr) +void deallocData(Int_ptr& ptr, + VariantID vid) { + (void)vid; if (ptr) { RAJA::free_aligned(ptr); ptr = nullptr; } } -void deallocData(Real_ptr& ptr) +void deallocData(Real_ptr& ptr, + VariantID vid) { + (void)vid; if (ptr) { RAJA::free_aligned(ptr); ptr = nullptr; } } -void deallocData(Complex_ptr& ptr) +void deallocData(Complex_ptr& ptr, + VariantID vid) { + (void)vid; if (ptr) { RAJA::free_aligned(ptr); ptr = nullptr; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 5251e59b9..186969e14 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -96,20 +96,26 @@ void allocAndInitData(Complex_ptr& ptr, int len, int align, /*! * \brief Allocate data arrays. */ -void allocData(Int_ptr& ptr, int len, int align); +void allocData(Int_ptr& ptr, int len, int align, + VariantID vid); /// -void allocData(Real_ptr& ptr, int len, int align); +void allocData(Real_ptr& ptr, int len, int align, + VariantID vid); /// -void allocData(Complex_ptr& ptr, int len, int align); +void allocData(Complex_ptr& ptr, int len, int align, + VariantID vid); /*! * \brief Free data arrays. */ -void deallocData(Int_ptr& ptr); +void deallocData(Int_ptr& ptr, + VariantID vid); /// -void deallocData(Real_ptr& ptr); +void deallocData(Real_ptr& ptr, + VariantID vid); /// -void deallocData(Complex_ptr& ptr); +void deallocData(Complex_ptr& ptr, + VariantID vid); /*! 
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index d58384066..ca09c04ed 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -191,66 +191,68 @@ class KernelBase // run_params.getMemoryAlignment(); } int getDataAlignment() const { return RAJA::DATA_ALIGN; } template - void allocData(T*& ptr, int len) + void allocData(T*& ptr, int len, + VariantID vid) { - rajaperf::detail::allocData(ptr, len, getDataAlignment()); + rajaperf::detail::allocData(ptr, len, getDataAlignment(), vid); } template void allocAndInitData(T*& ptr, int len, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::allocAndInitData(ptr, len, getDataAlignment(), vid); } template void allocAndInitDataConst(T*& ptr, int len, T val, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::allocAndInitDataConst(ptr, len, getDataAlignment(), val, vid); } template void allocAndInitDataRandSign(T*& ptr, int len, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::allocAndInitDataRandSign(ptr, len, getDataAlignment(), vid); } template void allocAndInitDataRandValue(T*& ptr, int len, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::allocAndInitDataRandValue(ptr, len, getDataAlignment(), vid); } template - void deallocData(T*& ptr) + void deallocData(T*& ptr, + VariantID vid) { - rajaperf::detail::deallocData(ptr); + rajaperf::detail::deallocData(ptr, vid); } template void initData(T*& ptr, int len, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::initData(ptr, len, vid); } template void initDataConst(T*& ptr, int len, T val, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::initDataConst(ptr, len, val, vid); } template void initDataRandSign(T*& ptr, int len, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::initDataRandSign(ptr, len, vid); } template void initDataRandValue(T*& ptr, int len, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::initDataRandValue(ptr, len, vid); } template void initData(T& d, - VariantID vid = NumVariants) + VariantID vid) { rajaperf::detail::initData(d, vid); } diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 785aef275..57bb9fb39 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -74,8 +74,8 @@ void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) void DIFF_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_px); - deallocData(m_cx); + deallocData(m_px, vid); + deallocData(m_cx, vid); } } // end namespace lcals diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index db9e86a75..69ffa4bc3 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -85,10 +85,10 @@ void EOS::updateChecksum(VariantID vid, size_t tune_idx) void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); - deallocData(m_z); - deallocData(m_u); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_z, vid); + deallocData(m_u, vid); } } // end namespace lcals diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 9d60d68bb..df13c2f39 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -75,8 +75,8 @@ void FIRST_DIFF::updateChecksum(VariantID vid, size_t tune_idx) void FIRST_DIFF::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); + deallocData(m_x, vid); + deallocData(m_y, vid); } 
} // end namespace lcals diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 9858f91d3..427cbe0a6 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -82,7 +82,7 @@ void FIRST_MIN::updateChecksum(VariantID vid, size_t tune_idx) void FIRST_MIN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); + deallocData(m_x, vid); } } // end namespace lcals diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index bd652bd76..d5559cd6c 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -74,8 +74,8 @@ void FIRST_SUM::updateChecksum(VariantID vid, size_t tune_idx) void FIRST_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); + deallocData(m_x, vid); + deallocData(m_y, vid); } } // end namespace lcals diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index e8985d77a..340219129 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -83,10 +83,10 @@ void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tune_idx) void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_b5); - deallocData(m_stb5); - deallocData(m_sa); - deallocData(m_sb); + deallocData(m_b5, vid); + deallocData(m_stb5, vid); + deallocData(m_sa, vid); + deallocData(m_sb, vid); } } // end namespace lcals diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index ef8f12eb2..16a251e2d 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -83,9 +83,9 @@ void HYDRO_1D::updateChecksum(VariantID vid, size_t tune_idx) void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); - deallocData(m_z); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_z, vid); } } // end namespace lcals diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index c23d1e8c6..0920de8c1 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -103,17 +103,17 @@ void HYDRO_2D::updateChecksum(VariantID vid, size_t tune_idx) void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_zrout); - deallocData(m_zzout); - deallocData(m_za); - deallocData(m_zb); - deallocData(m_zm); - deallocData(m_zp); - deallocData(m_zq); - deallocData(m_zr); - deallocData(m_zu); - deallocData(m_zv); - deallocData(m_zz); + deallocData(m_zrout, vid); + deallocData(m_zzout, vid); + deallocData(m_za, vid); + deallocData(m_zb, vid); + deallocData(m_zm, vid); + deallocData(m_zp, vid); + deallocData(m_zq, vid); + deallocData(m_zr, vid); + deallocData(m_zu, vid); + deallocData(m_zv, vid); + deallocData(m_zz, vid); } } // end namespace lcals diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index e2d8c93c2..1e7375752 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -65,14 +65,14 @@ void INT_PREDICT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) m_px_initval = 1.0; allocAndInitDataConst(m_px, m_array_length, m_px_initval, vid); - initData(m_dm22); - initData(m_dm23); - initData(m_dm24); - initData(m_dm25); - initData(m_dm26); - initData(m_dm27); - initData(m_dm28); - initData(m_c0); + initData(m_dm22, vid); + initData(m_dm23, vid); + initData(m_dm24, vid); + initData(m_dm25, vid); + initData(m_dm26, vid); + initData(m_dm27, vid); + initData(m_dm28, vid); + initData(m_c0, vid); } void 
INT_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) @@ -87,7 +87,7 @@ void INT_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) void INT_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_px); + deallocData(m_px, vid); } } // end namespace lcals diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index fd97ecf3e..da178a407 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -74,11 +74,11 @@ void PLANCKIAN::updateChecksum(VariantID vid, size_t tune_idx) void PLANCKIAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); - deallocData(m_u); - deallocData(m_v); - deallocData(m_w); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_u, vid); + deallocData(m_v, vid); + deallocData(m_w, vid); } } // end namespace lcals diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 369b3d864..98278cbcf 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -75,10 +75,10 @@ void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t tune_idx) void TRIDIAG_ELIM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_xout); - deallocData(m_xin); - deallocData(m_y); - deallocData(m_z); + deallocData(m_xout, vid); + deallocData(m_xin, vid); + deallocData(m_y, vid); + deallocData(m_z, vid); } } // end namespace lcals diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index e755b09f9..80136cd49 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -102,11 +102,11 @@ void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_tmp); - deallocData(m_A); - deallocData(m_B); - deallocData(m_C); - deallocData(m_D); + deallocData(m_tmp, vid); + deallocData(m_A, vid); + deallocData(m_B, vid); + deallocData(m_C, vid); + deallocData(m_D, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 66062f9c2..e1dad595c 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -112,13 +112,13 @@ void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_A); - deallocData(m_B); - deallocData(m_C); - deallocData(m_D); - deallocData(m_E); - deallocData(m_F); - deallocData(m_G); + deallocData(m_A, vid); + deallocData(m_B, vid); + deallocData(m_C, vid); + deallocData(m_D, vid); + deallocData(m_E, vid); + deallocData(m_F, vid); + deallocData(m_G, vid); } } // end namespace basic diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index e3fe802d7..b513bdebc 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -85,10 +85,10 @@ void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_U); - deallocData(m_V); - deallocData(m_P); - deallocData(m_Q); + deallocData(m_U, vid); + deallocData(m_V, vid); + deallocData(m_P, vid); + deallocData(m_Q, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index b8c79fa90..4c159d3a7 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ 
b/src/polybench/POLYBENCH_ATAX.cpp @@ -88,10 +88,10 @@ void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_tmp); - deallocData(m_x); - deallocData(m_y); - deallocData(m_A); + deallocData(m_tmp, vid); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_A, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 863b19c15..37dd1f9f5 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -106,10 +106,10 @@ void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_fict); - deallocData(m_ex); - deallocData(m_ey); - deallocData(m_hz); + deallocData(m_fict, vid); + deallocData(m_ex, vid); + deallocData(m_ey, vid); + deallocData(m_hz, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 794da77cf..9770821b0 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -81,8 +81,8 @@ void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_pin); - deallocData(m_pout); + deallocData(m_pin, vid); + deallocData(m_pout, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 4af192fc3..e080c6df5 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -92,9 +92,9 @@ void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_A); - deallocData(m_B); - deallocData(m_C); + deallocData(m_A, vid); + deallocData(m_B, vid); + deallocData(m_C, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 106e40a29..99e16324f 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -108,15 +108,15 @@ void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_A); - deallocData(m_u1); - deallocData(m_v1); - deallocData(m_u2); - deallocData(m_v2); - deallocData(m_w); - deallocData(m_x); - deallocData(m_y); - deallocData(m_z); + deallocData(m_A, vid); + deallocData(m_u1, vid); + deallocData(m_v1, vid); + deallocData(m_u2, vid); + deallocData(m_v2, vid); + deallocData(m_w, vid); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_z, vid); } } // end namespace basic diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 73d2e2ab4..fdf07a58f 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -82,10 +82,10 @@ void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x); - deallocData(m_y); - deallocData(m_A); - deallocData(m_B); + deallocData(m_x, vid); + deallocData(m_y, vid); + deallocData(m_A, vid); + 
deallocData(m_B, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 7f46f47a8..be6c0e218 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -94,10 +94,10 @@ void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_A); - deallocData(m_B); - deallocData(m_Ainit); - deallocData(m_Binit); + deallocData(m_A, vid); + deallocData(m_B, vid); + deallocData(m_Ainit, vid); + deallocData(m_Binit, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 12d2243bc..925cd2682 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -91,10 +91,10 @@ void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_A); - deallocData(m_B); - deallocData(m_Ainit); - deallocData(m_Binit); + deallocData(m_A, vid); + deallocData(m_B, vid); + deallocData(m_Ainit, vid); + deallocData(m_Binit, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 4ec1def5f..e1b62d364 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -93,10 +93,10 @@ void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_A); - deallocData(m_B); - deallocData(m_Ainit); - deallocData(m_Binit); + deallocData(m_A, vid); + deallocData(m_B, vid); + deallocData(m_Ainit, vid); + deallocData(m_Binit, vid); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index f4f03c631..e58065f28 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -87,11 +87,11 @@ void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tune_idx) void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_x1); - deallocData(m_x2); - deallocData(m_y1); - deallocData(m_y2); - deallocData(m_A); + deallocData(m_x1, vid); + deallocData(m_x2, vid); + deallocData(m_y1, vid); + deallocData(m_y2, vid); + deallocData(m_A, vid); } } // end namespace polybench diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 3d482c80f..3ca91bed5 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -75,9 +75,9 @@ void ADD::updateChecksum(VariantID vid, size_t tune_idx) void ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_a); - deallocData(m_b); - deallocData(m_c); + deallocData(m_a, vid); + deallocData(m_b, vid); + deallocData(m_c, vid); } } // end namespace stream diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index dd447ad71..9cbfcbff2 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -74,8 +74,8 @@ void COPY::updateChecksum(VariantID vid, size_t tune_idx) void COPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_a); - deallocData(m_c); + deallocData(m_a, vid); + deallocData(m_c, vid); } } // end namespace stream diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index c3dba74f1..cc32be5f2 100644 
--- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -77,8 +77,8 @@ void DOT::updateChecksum(VariantID vid, size_t tune_idx) void DOT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_a); - deallocData(m_b); + deallocData(m_a, vid); + deallocData(m_b, vid); } } // end namespace stream diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index ff7598965..74ce32cb0 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -75,8 +75,8 @@ void MUL::updateChecksum(VariantID vid, size_t tune_idx) void MUL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_b); - deallocData(m_c); + deallocData(m_b, vid); + deallocData(m_c, vid); } } // end namespace stream diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 34cd06a49..4790707bb 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -80,9 +80,9 @@ void TRIAD::updateChecksum(VariantID vid, size_t tune_idx) void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_a); - deallocData(m_b); - deallocData(m_c); + deallocData(m_a, vid); + deallocData(m_b, vid); + deallocData(m_c, vid); } } // end namespace stream From a29c01e691a1399392186297d96156ad6b31e0eb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 17 Feb 2023 15:29:32 -0800 Subject: [PATCH 028/137] Add memory alignment to RunParams class --- src/common/KernelBase.hpp | 3 +-- src/common/RunParams.cpp | 1 + src/common/RunParams.hpp | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index ca09c04ed..ca4b4249f 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -188,8 +188,7 @@ class KernelBase #endif } - // run_params.getMemoryAlignment(); } - int getDataAlignment() const { return RAJA::DATA_ALIGN; } + int getDataAlignment() const { return run_params.getMemoryAlignment(); } template void allocData(T*& ptr, int len, VariantID vid) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index e32bd95da..a9c19e589 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -34,6 +34,7 @@ RunParams::RunParams(int argc, char** argv) size_meaning(SizeMeaning::Unset), size(0.0), size_factor(0.0), + memory_alignment(RAJA::DATA_ALIGN), gpu_block_sizes(), pf_tol(0.1), checkrun_reps(1), diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index b9e808533..b91df16b4 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -125,6 +125,8 @@ class RunParams { double getSizeFactor() const { return size_factor; } + int getMemoryAlignment() const { return memory_alignment; } + size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const { @@ -232,6 +234,7 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multipier (input option) */ + int memory_alignment; std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ double pf_tol; /*!< pct RAJA variant run time can exceed base for From 53b87ac890f43f64440e2d0dab7899c6694a742d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 17 Feb 2023 16:04:01 -0800 Subject: [PATCH 029/137] Add --align,--memory_alignment arguments --- src/common/RunParams.cpp | 34 ++++++++++++++++++++++++++++++++++ src/common/RunParams.hpp | 4 ++-- 2 files changed, 36 insertions(+), 2 
deletions(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index a9c19e589..8e818ff34 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -99,6 +99,7 @@ void RunParams::print(std::ostream& str) const str << "\n size_meaning = " << SizeMeaningToStr(getSizeMeaning()); str << "\n size = " << size; str << "\n size_factor = " << size_factor; + str << "\n memory_alignment = " << memory_alignment; str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { str << "\n\t" << gpu_block_sizes[j]; @@ -317,6 +318,33 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--align") || + opt == std::string("--memory_alignment") ) { + + i++; + if ( i < argc ) { + long long align = ::atoll( argv[i] ); + long long min_align = alignof(std::max_align_t); + if ( align < min_align ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_align + << std::endl; + input_state = BadInput; + } else if ( (align & (align-1)) != 0 ) { + getCout() << "\nBad input:" + << " must give " << opt << " a power of 2" + << std::endl; + input_state = BadInput; + } else { + memory_alignment = align; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_block_size") ) { bool got_someting = false; @@ -591,6 +619,12 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --size 1000000 (runs kernels with size ~1,000,000)\n\n"; + str << "\t --memory_alignment, --align [default is RAJA::DATA_ALIGN]\n" + << "\t (minimum memory alignment for host allocations)\n" + << "\t (must be a power of 2 at least as large as the default alignment)\n"; + str << "\t\t Example...\n" + << "\t\t --align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; + str << "\t --gpu_block_size [no default]\n" << "\t (block sizes to run for all GPU kernels)\n" << "\t (GPU kernels not supporting gpu_block_size will be skipped)\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index b91df16b4..ff3b8ad9a 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -125,7 +125,7 @@ class RunParams { double getSizeFactor() const { return size_factor; } - int getMemoryAlignment() const { return memory_alignment; } + size_t getMemoryAlignment() const { return memory_alignment; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -234,7 +234,7 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multipier (input option) */ - int memory_alignment; + size_t memory_alignment; std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ double pf_tol; /*!< pct RAJA variant run time can exceed base for From 6fe25d9de63f1670f5f4e63414fff9766aff4ed4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 17 Feb 2023 17:21:12 -0800 Subject: [PATCH 030/137] Fix cuda, hip, and ompt compile --- src/common/CudaDataUtils.hpp | 2 +- src/common/HipDataUtils.hpp | 2 +- src/common/OpenMPTargetDataUtils.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 53f10f6c0..ea0097d96 100644 --- a/src/common/CudaDataUtils.hpp +++ 
b/src/common/CudaDataUtils.hpp @@ -122,7 +122,7 @@ void initCudaDeviceData(T& dptr, const T hptr, int len) len * sizeof(typename std::remove_pointer::type), cudaMemcpyHostToDevice ) ); - incDataInitCount(); + detail::incDataInitCount(); } /*! diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 570f9619c..f69680934 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -109,7 +109,7 @@ void initHipDeviceData(T& dptr, const T hptr, int len) len * sizeof(typename std::remove_pointer::type), hipMemcpyHostToDevice ) ); - incDataInitCount(); + detail::incDataInitCount(); } /*! diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 633f7ec51..d122e12a5 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -49,7 +49,7 @@ void initOpenMPDeviceData(T& dptr, const T hptr, int len, int did, int hid) { copyOpenMPDeviceData(dptr, hptr, len, did, hid); - incDataInitCount(); + detail::incDataInitCount(); } /*! From 13855fe69d569421ce59446d3234353acb5cc0ea Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Feb 2023 13:03:24 -0800 Subject: [PATCH 031/137] Use data alignment and -align as the naming convention for alignment and as the short argument form --- src/common/KernelBase.hpp | 2 +- src/common/RunParams.cpp | 14 +++++++------- src/common/RunParams.hpp | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index ca4b4249f..d01a3f788 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -188,7 +188,7 @@ class KernelBase #endif } - int getDataAlignment() const { return run_params.getMemoryAlignment(); } + int getDataAlignment() const { return run_params.getDataAlignment(); } template void allocData(T*& ptr, int len, VariantID vid) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 8e818ff34..527e962a2 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -34,7 +34,7 @@ RunParams::RunParams(int argc, char** argv) size_meaning(SizeMeaning::Unset), size(0.0), size_factor(0.0), - memory_alignment(RAJA::DATA_ALIGN), + data_alignment(RAJA::DATA_ALIGN), gpu_block_sizes(), pf_tol(0.1), checkrun_reps(1), @@ -99,7 +99,7 @@ void RunParams::print(std::ostream& str) const str << "\n size_meaning = " << SizeMeaningToStr(getSizeMeaning()); str << "\n size = " << size; str << "\n size_factor = " << size_factor; - str << "\n memory_alignment = " << memory_alignment; + str << "\n data_alignment = " << data_alignment; str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { str << "\n\t" << gpu_block_sizes[j]; @@ -318,8 +318,8 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } - } else if ( opt == std::string("--align") || - opt == std::string("--memory_alignment") ) { + } else if ( opt == std::string("-align") || + opt == std::string("--data_alignment") ) { i++; if ( i < argc ) { @@ -336,7 +336,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) << std::endl; input_state = BadInput; } else { - memory_alignment = align; + data_alignment = align; } } else { getCout() << "\nBad input:" @@ -619,11 +619,11 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --size 1000000 (runs kernels with size ~1,000,000)\n\n"; - str << "\t --memory_alignment, --align [default is RAJA::DATA_ALIGN]\n" + str << "\t --data_alignment, -align [default is 
RAJA::DATA_ALIGN]\n" << "\t (minimum memory alignment for host allocations)\n" << "\t (must be a power of 2 at least as large as the default alignment)\n"; str << "\t\t Example...\n" - << "\t\t --align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; + << "\t\t -align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; str << "\t --gpu_block_size [no default]\n" << "\t (block sizes to run for all GPU kernels)\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index ff3b8ad9a..806faea85 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -125,7 +125,7 @@ class RunParams { double getSizeFactor() const { return size_factor; } - size_t getMemoryAlignment() const { return memory_alignment; } + size_t getDataAlignment() const { return data_alignment; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -234,7 +234,7 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multipier (input option) */ - size_t memory_alignment; + size_t data_alignment; std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ double pf_tol; /*!< pct RAJA variant run time can exceed base for From 29d460479b03ec1c0f41cbb1f99994a379378d80 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 21 Feb 2023 14:23:00 -0800 Subject: [PATCH 032/137] Clean up pass. --- .../{getting_started.rst => build.rst} | 18 +-- docs/sphinx/user_guide/index.rst | 11 +- docs/sphinx/user_guide/output.rst | 16 +++ docs/sphinx/user_guide/run.rst | 109 ++++++++++++++++++ docs/sphinx/user_guide/running.rst | 96 --------------- 5 files changed, 140 insertions(+), 110 deletions(-) rename docs/sphinx/user_guide/{getting_started.rst => build.rst} (95%) create mode 100644 docs/sphinx/user_guide/output.rst create mode 100644 docs/sphinx/user_guide/run.rst delete mode 100644 docs/sphinx/user_guide/running.rst diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/build.rst similarity index 95% rename from docs/sphinx/user_guide/getting_started.rst rename to docs/sphinx/user_guide/build.rst index 48f40896f..8b6774f24 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/build.rst @@ -6,16 +6,16 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _getting_started-label: +.. _build-label: ********************************************* -Getting Started With RAJA Performance Suite +Building the RAJA Performance Suite ********************************************* -This section should help you build the RAJA Performance Suite code so you can +This section will help you build the RAJA Performance Suite code so you can start running it. -.. _getting_started_reqs-label: +.. _build_reqs-label: ============ Requirements @@ -32,7 +32,7 @@ disable them are similar to `RAJA build options `_ RAJA also contains dependencies, which are discussed in -`RAJA Dependencies `_. +`RAJA Dependencies `_. -.. _getting_started_build-label: +.. _build_build-label: ================== Build and Install ================== The build and install process for the RAJA Performance Suite is similar to -the process for RAJA, which is described in `RAJA Build and Install `_. +the process for RAJA, which is described in `RAJA Build and Install `_. 
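+For illustration only, here is a minimal out-of-source configure/build sketch.
+The repository URL, compiler, and options shown are placeholders; use the
+CMake options appropriate for your platform and the back-ends you want, as
+described in the RAJA build documentation::
+
+   $ git clone --recursive <RAJA Performance Suite repository URL>
+   $ cd RAJAPerf
+   $ mkdir build && cd build
+   $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++ ..
+   $ make -j
+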
When building the RAJA Performance Suite, RAJA and the RAJA Performance Suite are built together using the same CMake configuration. The basic process for diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index e65fd9132..0bd7d5570 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -12,8 +12,9 @@ RAJA Performance Suite User Guide ################################### If you have some familiarity with RAJA and want to get up and running with -the RAJA Performance Suite quickly, check out :ref:`getting_started-label`, -which contains information about getting the code, building it, running it, etc. +the RAJA Performance Suite quickly, check out :ref:`build-label` and +:ref:`run-label` which contain information about getting the code, building +it, and running it. If you want to learn more about RAJA, please see the `RAJA User Guide `_, which contains a detailed discussion of RAJA features and a RAJA @@ -22,6 +23,6 @@ tutorial, both of which contain code examples. .. toctree:: :maxdepth: 3 - getting_started - running -.. ## output + build + run + output diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst new file mode 100644 index 000000000..4bff5b3a3 --- /dev/null +++ b/docs/sphinx/user_guide/output.rst @@ -0,0 +1,16 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _output-label: + +********************************************* +RAJA Performance Suite Output +********************************************* + +This section describes the contents of output files generated by the Suite +when it is run. diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst new file mode 100644 index 000000000..43abad4e2 --- /dev/null +++ b/docs/sphinx/user_guide/run.rst @@ -0,0 +1,109 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _run-label: + +********************************************* +Running the RAJA Performance Suite +********************************************* + +This section describes how to run the Suite and which execution options are +available. + +After the Suite code is compiled, following the instructions provided in +:ref:`build-label`, the executable will reside in the ``bin`` subdirectory +of the build space. The executable will be able to run all kernels and +variants that have been built depending on which CMake options were specified +to configure the build. + +To run the Suite in its default mode, type the executable name with no +command-line arguments:: + + $ ./bin/raja-perf.exe + +This will run all kernels and variants that have been built in their default +configurations. Information describing how the Suite will run along with +some information about each kernel will appear on the screen. More information +about kernel and execution details will also appear in a run report files +generated in the run directory after Suite execution completes. + +.. note:: * You can pass the ``--dryrun`` command-line option to the executable + to see a summary of how the Suite will execute without actually + running it. 
+ * You can choose the directory for output file names as well as + output file names using command line options. + +The Suite can be run in a variety of ways determined by the command-line +options passed to the executable. For example, you can run or exclude subsets +of kernels, variants, or groups. You can also pass options to set problem +sizes, number of times each kernel is run (sampled), and many other run +parameters.The goal is to build the code once and use scripts or other means +to run the Suite in different ways for analyses you want to perform. + +Each option appears in a *long form* with a double hyphen prefix (i.e., '--'). +Commonly used options are also available in a one or two character *short form* +with a single hyphen prefix (i.e., '-') for convenience. To see available +options along with a brief description of each, pass the `--help` or `-h` +option to the executable:: + + $ ./bin/raja-perf.exe --help + +or + + $ ./bin/raja-perf.exe -h + +.. note:: To see all available Suite execution options, pass the `--help` or + `-h` option to the executable. + +Lastly, the program will report specific errors if given incorrect input, such +as an option that requires a value and no value is provided. It will also emit +a summary of command-line arguments it was given if the input contains +something that the code does not know how to parse. + +.. note: The Suite executable will attempt to provide helpful information + if it is given incorrect input, such as command-line arguments that + it does not know how to parse. Ill-formed input will be noted in + screen output, hopefully making it easy for users to correct erroneous + usage, such as mis-spelled option names. + +.. _run_mpi-label: + +================== +Running with MPI +================== + +Running the Suite with MPI is just like running any other MPI application. +For example:: + + $ srun -n 2 ./bin/raja-perf.exe + +will run the entire Suite (all kernels and variants) in their default +configurations on each of 2 MPI ranks. + +The kernel information output shows how each kernel is run on each rank. +Timing is reported on rank 0 and is gathered by invoking an MPI barrier, +starting a timer, running the kernel, invoking an MPI barrier, and then +stopping the timer. Total problem size across all MPI ranks can be +calculated, if desired, by multiplying the number of MPI ranks by the problem +size reported in the kernel information. + +.. _run_omptarget-label: + +====================== +OpenMP target offload +====================== + +OpenMP target offload variants of the kernels in the Suite are +considered a work-in-progress since the RAJA OpenMP target offload back-end +is a work-in-progress. If you configure them to build, they can be run with +the executable `./bin/raja-perf-omptarget.exe` which is distinct from the one +described above. When the OpenMP target offload variants were developed, it +was not possible for them to co-exist in the same executable as CUDA +variants, for example. In the future, the build system may be reworked so +that the OpenMP target variants can be run from the same executable as the +other variants. diff --git a/docs/sphinx/user_guide/running.rst b/docs/sphinx/user_guide/running.rst deleted file mode 100644 index 4edce4ce6..000000000 --- a/docs/sphinx/user_guide/running.rst +++ /dev/null @@ -1,96 +0,0 @@ -.. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -.. ## and RAJA Performance Suite project contributors. -.. ## See the RAJAPerf/LICENSE file for details. -.. 
## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _running-label: - -********************************************* -Running the RAJA Performance Suite -********************************************* - -This section describes how to run the Suite and which execution options are -available. - -After the Suite is compiled, the executable will be located in the ``bin`` -subdirectory of the build space directory. The executable will be able to run -all kernels and variants that have been built depending on which programming -model back-ends have been enabled via CMake options. - -To execute the Suite in its default mode, run the executable with no -command-line arguments:: - - $ ./bin/raja-perf.exe - -This will run all kernels and variants that have been built in their default -configurations. Some information describing how the Suite will run along with -some information about each kernel will appear on the screen. More information -about kernel details will also appear in a run report file generated in your -run directory after Suite execution completes. - -.. note:: You can pass the ``--dryrun`` command-line option to the executable - to see a summary of how the Suite will execute without actually - running it. - -The Suite can be run in a variety of ways that are determined by the options -passed to the executable. For example, you can run or exclude subsets of -kernels, variants, or groups. You can also pass other options to set problem -sizes, number of times each kernel is run (sampled), etc. The idea is to build -the code once and use scripts or other means to run the Suite in different -ways for analyses you want to perform. - -All options appear in a *long form* with a double hyphen prefix (i.e., '--'). -Commonly used options are also available in a one or two character *short form* -with a single hyphen prefix (i.e., '-') for convenience. To see available -options along with a brief description of each, pass the `--help` or `-h` -option to the executable:: - - $ ./bin/raja-perf.exe --help - -or - - $ ./bin/raja-perf.exe -h - -.. note:: To see all available Suite options, pass the `--help` or `-h` - option to the executable. - -Lastly, the program will emit a summary of command-line arguments it was given -if the input contains something that the code does not know how to parse. -The ill-formed input will be noted in the summary. Hopefully, this makes -it easy for users to correct erroneous usage, such as mis-spelled option names. - -================== -Running with MPI -================== - -Running the Suite with MPI is just like running any other MPI application. -For example:: - - $ srun -n 2 ./bin/raja-perf.exe - -will run the entire Suite (all kernels and variants) in their default -configurations on each of 2 MPI ranks. - -The kernel information output shows how -each kernel is run on each rank. The total problem size across all MPI ranks -can be calculated by multiplying the number of MPI ranks by the problem -size in the kernel information. Timing is reported on rank 0 and is gathered -by doing an MPI barrier, starting the timer, running the kernel repetitions, -doing an MPI barrier, and then stopping the timer. - -====================== -OpenMP target offload -====================== - -The OpenMP target offload variants of the kernels in the Suite are -considered a work-in-progress since the RAJA OpenMP target offload back-end -is a work-in-progress. 
If you configure them to build, they can be run with -the executable `./bin/raja-perf-omptarget.exe` which is distinct from the one -described above. At the time the OpenMP target offload variants were -developed, it was not possible for them to co-exist in the same executable -as the CUDA variants, for example. In the future, the build system may -be reworked so that the OpenMP target variants can be run from the same -executable as the other variants. From 3fb5db2427ee94711d9593a1bc8b45d9b8b30bbf Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 21 Feb 2023 15:57:15 -0800 Subject: [PATCH 033/137] new section --- docs/sphinx/user_guide/output.rst | 150 +++++++++++++++++++++++++++++- docs/sphinx/user_guide/run.rst | 2 - 2 files changed, 147 insertions(+), 5 deletions(-) diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst index 4bff5b3a3..02a6f64c4 100644 --- a/docs/sphinx/user_guide/output.rst +++ b/docs/sphinx/user_guide/output.rst @@ -1,4 +1,4 @@ -.. ## +. ## .. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. @@ -12,5 +12,149 @@ RAJA Performance Suite Output ********************************************* -This section describes the contents of output files generated by the Suite -when it is run. +This section describes the contents of output files generated by the Suite. + +When the Suite is run, several output files are generated that contain data +describing the run. By default the files be placed in the directory where the +executable is invoked, and the file names will contain the prefix +``RAJAPerf-`` and a string indicating the contents. + +.. note:: You can provide command-line options to place the output files in a + different directory and/or give them a different file name prefix. + Such options and syntax are described in the Suite help output:: + + $ ./bin/raja-perf.exe -h + +Currently, there are five output files generated that provide information +described below. All output files are plain text files. Other than the +checksum file, all file contents are in 'csv' format for easy processing by +common tools for generating plots, etc. + + * **Timing** -- execution time (sec.) of each loop kernel and variant run. + * **Checksum** -- checksum values for each loop kernel and variant run to + ensure that they are producing correct results. Typically, a checksum + difference of ~1e-10 or less indicates that results generated by a kernel + variant match a reference variant. + * **Speedup** -- run time speedup of each kernel variant with respect to a + reference variant. The reference variant can be set with a command-line + option. If not specified, the first variant of a kernel that is run will + be used as the reference. The reference variant used is noted in the file. + * **Figure of Merit (FOM)** -- basic statistics about speedup of a RAJA + variant vs. baseline for each programming model run. When the execution + time of a RAJA variant differs from the corresponding baseline variant + by more than some tolerance, this is noted in the file with ``OVER_TOL``. + The default tolerance is 10% and can be changed via a command-line option. + * **Kernel** -- basic information about each kernel that is run, which is + the same for each variant of a kernel that is run. Kernel information + is described in more detail in the next section. + +.. 
_output_kerninfo-label: + +=========================== +Kernel information output +=========================== + +Information about kernels run when the RAJA Performance Suite executes is +placed in the ``RAJAPerf-kernels.csv`` file (unless the file prefix name is +changed by the user). This information is reported for rank zero when running +with multiple MPI processes. When running with more than one MPI rank, +information can be easily aggregated across all ranks if needed. For example, +the total aggregate problem size is the number of ranks times the problem size +shown in the kernel information. + +Information reported in the file for each kernel is: + + * **Name** -- full kernel name, format is group name followed by the kernel + name, separated by an underscore. + * **Feature** -- RAJA features exercised in RAJA variants of kernel. + * **Problem size** -- Size of the problem represented by a kernel. Please see section below. + * **Reps** -- Number of times a kernel runs in a single pass through the + Suite. + * **Iterations/rep** -- Sum of sizes of all parallel iteration spaces for all loops run in a single kernel execution. + * **Kernels/rep** -- total number of loop structures run (or GPU kernels + launched) in each kernel repetition. + * **Bytes/rep** -- Total number of bytes read from and written to memory for + each repetition of kernel. + * **FLOPs/rep** -- Total number of floating point operations executed for + each repetition of kernel. Currently, we count arithmetic operations + (+, -, *, /) and functions, such as exp, sin, etc. as one FLOP. We do not + currently count operations like abs and comparisons (<, >, etc.) in the + FLOP count. So these numbers are rough estimates. For actual FLOP counts, + a performance analysis tool should be used. + +.. _output_probsize-label: + +============================ +Notes about *problem size* +============================ + +This section describes how the Suite calculates problem sizes and the +rationale behind it. + + * Problem size is always reported per process/MPI rank. To get the total + problem size across all ranks when running with MPI, multiply the problem + size by the number of MPI ranks. + * The Suite uses three notions of problem size for each kernel: *default*, + *target*, and *actual*. Default is the problem size defined for a kernel + and the size that is run if no run time options are provided to run a + different size. Target is the desired problem size to run based on default + settings and alterations to those if input is provided to change the + default. Actual is the problem size that is run based on how each kernel + calculates it based on defaults and run time input. + * **The concept of problem size is subjective and can be interpreted + differently depending on the kernel structure and what one is trying to + measure.** For example, problem size could refer to the amount of data + needed to be stored in memory to run the problem, or it could refer to + the amount of parallel work that is possible, etc. + * We employ the following, admittedly loose definition, which depends on the + particular kernel structure. Of all *loop structures* (e.g., single loop, + nested loops, etc.) that are run for a kernel (note that some kernels run + multiple loops, possibly with different sizes or loop structures), problem + size refers to the size of the data set required to generate the kernel + result. 
The interpretation of this and the definition of problem size for + each kernel in the suite is determined by the kernel developer and team + discussion. +
+Here are a few examples to give a better sense of how we determine problem +size for various kernels in the Suite. +
+Vector addition:: + + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } +
+The problem size for this kernel is N, the loop length. Note that this happens +to match the size of the vectors a, b, c and the total amount of parallel work +in the kernel. This is common for simple, data parallel kernels. +
+Matrix-vector multiplication:: + + for (int r = 0; r < N_r; ++r) { + b[r] = 0; + for (int c = 0; c < N_c; ++c) { + b[r] += A[r][c] * x[c]; + } + } +
+The problem size is N_r * N_c, the size of the matrix. Note that this matches +the total size of the problem iteration space, but the total amount of +parallel work is N_r, the number of rows in the matrix and the length of the +vector b. +
+Matrix-matrix multiplication:: + + for (int i = 0; i < N_i; ++i) { + for (int j = 0; j < N_j; ++j) { + A[i][j] = 0; + for (int k = 0; k < N_k; ++k) { + A[i][j] += B[i][k] * C[k][j]; + } + } + } +
+Here, we are multiplying matrix B (N_i x N_k) and matrix C (N_k x N_j) and +storing the result in matrix A (N_i x N_j). Problem size could be chosen to be +the maximum number of entries in matrix B or C. We choose the size of matrix +A (N_i * N_j), which is more closely aligned with the number of independent +operations (i.e., the amount of parallel work) in the kernels. diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index 43abad4e2..f7947c484 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -35,8 +35,6 @@ generated in the run directory after Suite execution completes. .. note:: * You can pass the ``--dryrun`` command-line option to the executable to see a summary of how the Suite will execute without actually running it. - * You can choose the directory for output file names as well as - output file names using command line options. The Suite can be run in a variety of ways determined by the command-line options passed to the executable. For example, you can run or exclude subsets
For example, you can run or exclude subsets From 05166b22dccd5da4f0860d26543f2fc70e7d1515 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Feb 2023 16:12:18 -0800 Subject: [PATCH 034/137] Partial conversion to DataSpaces --- src/common/DataUtils.cpp | 28 +++--- src/common/DataUtils.hpp | 182 ++++++++++++++++++++++++++++++++--- src/common/KernelBase.hpp | 71 +++++++++++--- src/common/RAJAPerfSuite.cpp | 70 ++++---------- src/common/RAJAPerfSuite.hpp | 58 +++++------ 5 files changed, 283 insertions(+), 126 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 0474bcaf3..07802111a 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -46,38 +46,38 @@ void incDataInitCount() */ void allocAndInitData(Int_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align, vid); + allocHostData(ptr, len, align, vid); initData(ptr, len, vid); } void allocAndInitData(Real_ptr& ptr, int len, int align, VariantID vid ) { - allocData(ptr, len, align, vid); + allocHostData(ptr, len, align, vid); initData(ptr, len, vid); } void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, VariantID vid) { - allocData(ptr, len, align, vid); + allocHostData(ptr, len, align, vid); initDataConst(ptr, len, val, vid); } void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align, vid); + allocHostData(ptr, len, align, vid); initDataRandSign(ptr, len, vid); } void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align, vid); + allocHostData(ptr, len, align, vid); initDataRandValue(ptr, len, vid); } void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) { - allocData(ptr, len, align, vid); + allocHostData(ptr, len, align, vid); initData(ptr, len, vid); } @@ -85,7 +85,7 @@ void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) /* * Allocate data arrays of given type. */ -void allocData(Int_ptr& ptr, int len, int align, +void allocHostData(Int_ptr& ptr, int len, int align, VariantID vid) { (void)vid; @@ -93,7 +93,7 @@ void allocData(Int_ptr& ptr, int len, int align, align, len*sizeof(Int_type)); } /// -void allocData(Index_type*& ptr, int len, int align, +void allocHostData(Index_type*& ptr, int len, int align, VariantID vid) { (void)vid; @@ -101,7 +101,7 @@ void allocData(Index_type*& ptr, int len, int align, align, len*sizeof(Index_type)); } -void allocData(Real_ptr& ptr, int len, int align, +void allocHostData(Real_ptr& ptr, int len, int align, VariantID vid) { (void)vid; @@ -109,7 +109,7 @@ void allocData(Real_ptr& ptr, int len, int align, align, len*sizeof(Real_type)); } -void allocData(Complex_ptr& ptr, int len, int align, +void allocHostData(Complex_ptr& ptr, int len, int align, VariantID vid) { (void)vid; @@ -121,7 +121,7 @@ void allocData(Complex_ptr& ptr, int len, int align, /* * Free data arrays of given type. 
*/ -void deallocData(Int_ptr& ptr, +void deallocHostData(Int_ptr& ptr, VariantID vid) { (void)vid; @@ -131,7 +131,7 @@ void deallocData(Int_ptr& ptr, } } -void deallocData(Index_type*& ptr, +void deallocHostData(Index_type*& ptr, VariantID vid) { (void)vid; @@ -141,7 +141,7 @@ void deallocData(Index_type*& ptr, } } -void deallocData(Real_ptr& ptr, +void deallocHostData(Real_ptr& ptr, VariantID vid) { (void)vid; @@ -151,7 +151,7 @@ void deallocData(Real_ptr& ptr, } } -void deallocData(Complex_ptr& ptr, +void deallocHostData(Complex_ptr& ptr, VariantID vid) { (void)vid; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 18b6aa087..ad58201db 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -45,6 +45,164 @@ void resetDataInitCount(); void incDataInitCount(); + + +/*! + * \brief Allocate data array (ptr). + */ +template +void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID vid) +{ + switch (dataSpace) { + case dataSpace::Host: + case dataSpace::Omp: + case dataSpace::CudaHost: + case dataSpace::HipHost: + { + allocHostData(ptr, len, align, vid); + } break; + + case dataSpace::OmpTarget: + { + allocOpenMPDeviceData(ptr, len, did); + } break; + + case dataSpace::CudaPinned: + { + allocCudaPinnedData(ptr, len, vid); + } break; + case dataSpace::CudaManaged: + { + allocCudaManagedData(ptr, len, vid); + } break; + case dataSpace::CudaDevice: + { + allocCudaDeviceData(ptr, len, vid); + } break; + + case dataSpace::HipHostAdviseFine: + { + allocHostData(ptr, len, align, vid); + adviseHipFineData(ptr, len, vid); + } break; + case dataSpace::HipHostAdviseCoarse: + { + allocHostData(ptr, len, align, vid); + adviseHipCoarseData(ptr, len, vid); + } break; + case dataSpace::HipPinned: + { + allocHipPinnedData(ptr, len, vid); + } break; + case dataSpace::HipPinnedFine: + { + allocHipPinnedFineData(ptr, len, vid); + } break; + case dataSpace::HipPinnedCoarse: + { + allocHipPinnedCoarseData(ptr, len, vid); + } break; + case dataSpace::HipManaged: + { + allocHipManagedData(ptr, len, vid); + } break; + case dataSpace::HipManagedAdviseFine: + { + allocHipManagedData(ptr, len, vid); + adviseHipFineData(ptr, len, vid); + } break; + case dataSpace::HipManagedAdviseCoarse: + { + allocHipManagedData(ptr, len, vid); + adviseHipCoarseData(ptr, len, vid); + } break; + case dataSpace::HipDevice: + { + allocHipDeviceData(ptr, len, vid); + } break; + case dataSpace::HipDeviceFine: + { + allocHipDeviceFineData(ptr, len, vid); + } break; + + default: + { + throw std::invalid_argument("allocData : Unknown memory type"); + } break; + } +} + +/*! + * \brief Deallocate data array (ptr). 
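+ *
+ * A minimal usage sketch pairing this with allocData() above (the len, align,
+ * and vid values are placeholders supplied by the calling kernel):
+ *
+ * \code
+ *   Real_ptr ptr = nullptr;
+ *   allocData(DataSpace::Host, ptr, len, align, vid);
+ *   // ... use ptr ...
+ *   deallocData(DataSpace::Host, ptr, vid);
+ * \endcode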
+ */ +template +void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) +{ + switch (dataSpace) { + case dataSpace::Host: + case dataSpace::Omp: + case dataSpace::CudaHost: + case dataSpace::HipHost: + case dataSpace::HipHostAdviseFine: + case dataSpace::HipHostAdviseCoarse: + { + deallocHostData(ptr, vid); + } break; + + case dataSpace::OmpTarget: + { + deallocOpenMPDeviceData(ptr, did); + } break; + + case dataSpace::CudaPinned: + { + deallocCudaPinnedData(ptr, vid); + } break; + case dataSpace::CudaManaged: + { + deallocCudaManagedData(ptr, vid); + } break; + case dataSpace::CudaDevice: + { + deallocCudaDeviceData(ptr, vid); + } break; + + case dataSpace::HipPinned: + { + deallocHipPinnedData(ptr, vid); + } break; + case dataSpace::HipPinnedFine: + { + deallocHipPinnedFineData(ptr, vid); + } break; + case dataSpace::HipPinnedCoarse: + { + deallocHipPinnedCoarseData(ptr, vid); + } break; + case dataSpace::HipManaged: + case dataSpace::HipManagedAdviseFine: + case dataSpace::HipManagedAdviseCoarse: + { + deallocHipManagedData(ptr, vid); + } break; + case dataSpace::HipDevice: + { + deallocHipDeviceData(ptr, vid); + } break; + case dataSpace::HipDeviceFine: + { + deallocHipDeviceFineData(ptr, vid); + } break; + + default: + { + throw std::invalid_argument("deallocData : Unknown memory type"); + } break; + } +} + + + /*! * \brief Allocate and initialize Int_type data array. * @@ -96,32 +254,24 @@ void allocAndInitData(Complex_ptr& ptr, int len, int align, /*! * \brief Allocate data arrays. */ -void allocData(Int_ptr& ptr, int len, int align, - VariantID vid); +void allocHostData(Int_ptr& ptr, int len, int align, VariantID vid); /// -void allocData(Index_type*& ptr, int len, int align, - VariantID vid); +void allocHostData(Index_type*& ptr, int len, int align, VariantID vid); /// -void allocData(Real_ptr& ptr, int len, int align, - VariantID vid); +void allocHostData(Real_ptr& ptr, int len, int align, VariantID vid); /// -void allocData(Complex_ptr& ptr, int len, int align, - VariantID vid); +void allocHostData(Complex_ptr& ptr, int len, int align, VariantID vid); /*! * \brief Free data arrays. */ -void deallocData(Int_ptr& ptr, - VariantID vid); +void deallocHostData(Int_ptr& ptr, VariantID vid); /// -void deallocData(Index_type*& ptr, - VariantID vid); +void deallocHostData(Index_type*& ptr, VariantID vid); /// -void deallocData(Real_ptr& ptr, - VariantID vid); +void deallocHostData(Real_ptr& ptr, VariantID vid); /// -void deallocData(Complex_ptr& ptr, - VariantID vid); +void deallocHostData(Complex_ptr& ptr, VariantID vid); /*! 
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index b9dcf10c7..8b663fbb0 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -191,71 +191,120 @@ class KernelBase } int getDataAlignment() const { return run_params.getDataAlignment(); } + + DataSpace getSeqDataSpace() const { return DataSpace::Host; } // run_params.getSeqDataSpace(); + DataSpace getOmpDataSpace() const { return DataSpace::Omp; } // run_params.getOmpDataSpace(); + DataSpace getOmpTargetDataSpace() const { return DataSpace::OmpTarget; } // run_params.getOmpTargetDataSpace(); + DataSpace getCudaDataSpace() const { return DataSpace::CudaDevice; } // run_params.getCudaDataSpace(); + DataSpace getHipDataSpace() const { return DataSpace::HipDevice; } // run_params.getHipDataSpace(); + DataSpace getKokkosDataSpace() const { return DataSpace::Host; } // run_params.getKokkosDataSpace(); + + DataSpace getDataSpace(VariantID vid) const + { + switch ( vid ) { + + case Base_Seq : + case Lambda_Seq : + case RAJA_Seq : + return getSeqDataSpace(); + + case Base_OpenMP : + case Lambda_OpenMP : + case RAJA_OpenMP : + return getOmpDataSpace(); + + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + return getOmpTargetDataSpace(); + + case Base_CUDA : + case Lambda_CUDA : + case RAJA_CUDA : + return getCudaDataSpace(); + + case Base_HIP : + case Lambda_HIP : + case RAJA_HIP : + return getHipDataSpace(); + + case Kokkos_Lambda : + return getKokkosDataSpace(); + + default: + throw std::invalid_argument("getDataSpace : Unknown variant id"); + } + } + template void allocData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocData(ptr, len, getDataAlignment(), vid); + rajaperf::detail::allocData( + ptr, len, getDataAlignment(), getDataSpace(vid)); } template void allocAndInitData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitData(ptr, len, getDataAlignment(), vid); + rajaperf::detail::allocAndInitData( + ptr, len, getDataAlignment(), getDataSpace(vid)); } template void allocAndInitDataConst(T*& ptr, int len, T val, VariantID vid) { - rajaperf::detail::allocAndInitDataConst(ptr, len, getDataAlignment(), val, vid); + rajaperf::detail::allocAndInitDataConst( + ptr, len, getDataAlignment(), val, getDataSpace(vid)); } template void allocAndInitDataRandSign(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitDataRandSign(ptr, len, getDataAlignment(), vid); + rajaperf::detail::allocAndInitDataRandSign( + ptr, len, getDataAlignment(), getDataSpace(vid)); } template void allocAndInitDataRandValue(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitDataRandValue(ptr, len, getDataAlignment(), vid); + rajaperf::detail::allocAndInitDataRandValue( + ptr, len, getDataAlignment(), getDataSpace(vid)); } template void deallocData(T*& ptr, VariantID vid) { - rajaperf::detail::deallocData(ptr, vid); + rajaperf::detail::deallocData(ptr, getDataSpace(vid)); } template void initData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::initData(ptr, len, vid); + rajaperf::detail::initData(ptr, len, getDataSpace(vid)); } template void initDataConst(T*& ptr, int len, T val, VariantID vid) { - rajaperf::detail::initDataConst(ptr, len, val, vid); + rajaperf::detail::initDataConst(ptr, len, val, getDataSpace(vid)); } template void initDataRandSign(T*& ptr, int len, VariantID vid) { - rajaperf::detail::initDataRandSign(ptr, len, vid); + rajaperf::detail::initDataRandSign(ptr, len, getDataSpace(vid)); } template void initDataRandValue(T*& ptr, int len, VariantID vid) { - 
rajaperf::detail::initDataRandValue(ptr, len, vid); + rajaperf::detail::initDataRandValue(ptr, len, getDataSpace(vid)); } template void initData(T& d, VariantID vid) { - rajaperf::detail::initData(d, vid); + rajaperf::detail::initData(d, getDataSpace(vid)); } #if defined(RAJA_ENABLE_CUDA) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index ce716fc2d..c073e3c37 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -321,7 +321,7 @@ static const std::string FeatureNames [] = /*! ******************************************************************************* * - * \brief Array of names for each Cuda Memory Space in suite. + * \brief Array of names for each Memory Space in suite. * * IMPORTANT: This is only modified when a new memory space is added to the suite. * @@ -330,45 +330,30 @@ static const std::string FeatureNames [] = * ******************************************************************************* */ -static const std::string CudaDataSpaceNames [] = +static const std::string DataSpaceNames [] = { - std::string("Host"), - std::string("Pinned"), - std::string("Managed"), - std::string("Device"), - - std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... -}; // END VariantNames + std::string("Omp"), + std::string("OmpTarget"), -/*! - ******************************************************************************* - * - * \brief Array of names for each Hip Memory Space in suite. - * - * IMPORTANT: This is only modified when a new memory space is added to the suite. - * - * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH - * ENUM OF HIPDATA IDS IN HEADER FILE!!! - * - ******************************************************************************* - */ -static const std::string HipDataSpaceNames [] = -{ + std::string("CudaHost"), + std::string("CudaPinned"), + std::string("CudaManaged"), + std::string("CudaDevice"), - std::string("Host"), - std::string("HostAdviseFine"), - std::string("HostAdviseCoarse"), - std::string("Pinned"), - std::string("PinnedFine"), - std::string("PinnedCoarse"), - std::string("Managed"), - std::string("ManagedAdviseFine"), - std::string("ManagedAdviseCoarse"), - std::string("Device"), - std::string("DeviceFine"), + std::string("HipHost"), + std::string("HipHostAdviseFine"), + std::string("HipHostAdviseCoarse"), + std::string("HipPinned"), + std::string("HipPinnedFine"), + std::string("HipPinnedCoarse"), + std::string("HipManaged"), + std::string("HipManagedAdviseFine"), + std::string("HipManagedAdviseCoarse"), + std::string("HipDevice"), + std::string("HipDeviceFine"), std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... @@ -571,22 +556,9 @@ const std::string& getFeatureName(FeatureID fid) * ******************************************************************************* */ -const std::string& getCudaDataSpaceName(CudaDataSpace cd) -{ - return CudaDataSpaceNames[static_cast(cd)]; -} - - -/* - ******************************************************************************* - * - * Return memory space name associated with HipDataSpace enum value. 
- * - ******************************************************************************* - */ -const std::string& getHipDataSpaceName(HipDataSpace hd) +const std::string& getDataSpaceName(DataSpace ds) { - return HipDataSpaceNames[static_cast(hd)]; + return DataSpaceNames[static_cast(ds)]; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index af54e8180..0f9279688 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -236,7 +236,7 @@ enum FeatureID { /*! ******************************************************************************* * - * \brief Enumeration defining unique id for each (CUDA) Data memory space + * \brief Enumeration defining unique id for each Data memory space * used in suite. * * IMPORTANT: This is only modified when a new memory space is used in suite. @@ -246,44 +246,30 @@ enum FeatureID { * ******************************************************************************* */ -enum struct CudaDataSpace { +enum struct DataSpace { Host = 0, - Pinned, - Managed, - Device, - NumSpaces // Keep this one last and NEVER comment out (!!) - -}; - - -/*! - ******************************************************************************* - * - * \brief Enumeration defining unique id for each (HIP) Data memory space - * used in suite. - * - * IMPORTANT: This is only modified when a new memory space is used in suite. - * - * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH - * ARRAY OF MEMORY SPACE NAMES IN IMPLEMENTATION FILE!!! - * - ******************************************************************************* - */ -enum struct HipDataSpace { - - Host = 0, - HostAdviseFine, - HostAdviseCoarse, - Pinned, - PinnedFine, - PinnedCoarse, - Managed, - ManagedAdviseFine, - ManagedAdviseCoarse, - Device, - DeviceFine, + Omp, + + OmpTarget, + + CudaHost, + CudaPinned, + CudaManaged, + CudaDevice, + + HipHost, + HipHostAdviseFine, + HipHostAdviseCoarse, + HipPinned, + HipPinnedFine, + HipPinnedCoarse, + HipManaged, + HipManagedAdviseFine, + HipManagedAdviseCoarse, + HipDevice, + HipDeviceFine, NumSpaces // Keep this one last and NEVER comment out (!!) From 9cae50874198347b207dc313826c2f467a5be3a6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 22 Feb 2023 15:03:17 -0800 Subject: [PATCH 035/137] Add sections to dev guide to mirror RAJA. --- docs/sphinx/dev_guide/branch_development.rst | 18 ++++++++ .../sphinx/dev_guide/build_configurations.rst | 18 ++++++++ docs/sphinx/dev_guide/ci.rst | 41 +++++++++++++++++++ docs/sphinx/dev_guide/contributing.rst | 39 ++++++++++++++++++ docs/sphinx/dev_guide/index.rst | 14 +++++-- docs/sphinx/dev_guide/release_process.rst | 23 +++++++++++ docs/sphinx/user_guide/run.rst | 4 +- 7 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 docs/sphinx/dev_guide/branch_development.rst create mode 100644 docs/sphinx/dev_guide/build_configurations.rst create mode 100644 docs/sphinx/dev_guide/ci.rst create mode 100644 docs/sphinx/dev_guide/contributing.rst create mode 100644 docs/sphinx/dev_guide/release_process.rst diff --git a/docs/sphinx/dev_guide/branch_development.rst b/docs/sphinx/dev_guide/branch_development.rst new file mode 100644 index 000000000..6f0a4ad0d --- /dev/null +++ b/docs/sphinx/dev_guide/branch_development.rst @@ -0,0 +1,18 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_branching-label: + +******************************************** +RAJA Performance Suite Branch Development +******************************************** + +The RAJA Performance Suite manages repository branches and uses branch +development in a manner identical to the RAJA project. + +Please see `RAJA Branch Development `_ for details. diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst new file mode 100644 index 000000000..a9a2a27ff --- /dev/null +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -0,0 +1,18 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _build_config-label: + +******************************************** +RAJA Performance Suite Build Configurations +******************************************** + +Build configurations for the RAJA Performance Suite are maintained and +used similarly to RAJA + +Please see `RAJA Build Configurations `_ for details. diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst new file mode 100644 index 000000000..e96feb06d --- /dev/null +++ b/docs/sphinx/dev_guide/ci.rst @@ -0,0 +1,41 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _ci-label: + +************************************************************ +RAJA Performance Suite Continuous Integration (CI) Testing +************************************************************ + +The RAJA Performance Suite uses the same continuous integration tools as RAJA. + +One important difference to note is that the RAJA Performance Suite inherits +most of its support for GitLab CI testing from its RAJA submodule. As a result, +RAJA submodules that support RAJA GitLab CI, such as +`Uberenv `_ and +`RADIUSS Spack Configs `_, +do not appear in the RAJA Performance Suite repository. However, the +RAJA Performance Suite does include files that are specific to the project +and play the same roles and follow the structure as the similarly named +files in the RAJA repository. + +Files that support Azure Pipelines testing for the RAJA Performance Suite +are also maintained in the project repository. + +Please see `RAJA Continuous Integration Testing `_ for more information. + +.. _ci_tasks-label: + +****************************************************** +Continuous Integration (CI) Testing Maintenance Tasks +****************************************************** + +Tasks for maintaining continuous integration in the RAJA Performance Suite +are similar to those for RAJA. Please see `RAJA Continuous Integration Testing +Maintenance Tasks `_ for more information. + diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst new file mode 100644 index 000000000..a2a1f9be1 --- /dev/null +++ b/docs/sphinx/dev_guide/contributing.rst @@ -0,0 +1,39 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_contributing-label: + +******************************************** +Contributing to the RAJA Performance Suite +******************************************** +
+Similar to RAJA, the RAJA Performance Suite is a collaborative open-source +software project and we encourage contributions from anyone who wants to add +features or improve its capabilities. This section describes the following: +
+ * GitHub project access + * How to develop a RAJA *pull request* (PR) contribution. + * Requirements that must be met for a PR to be merged. +
+We assume contributors are familiar with +`Git `_, which we use for source code version control, +and `GitHub `_, which is where our project is hosted. +
+.. important:: * Before a PR can be merged into the RAJA Performance Suite, + all test checks must pass and the PR must be approved by at + least one member of the core RAJA team. + +
+Access to the RAJA Performance Suite GitHub project and acceptance of +contributions to it are managed identically to the RAJA project. In the +RAJA Developer Guide, you will find information on accessing the RAJA GitHub +project, restrictions on a fork of the repository, and contributions and pull +requests. Please follow the RAJA processes for contributions to the +RAJA Performance Suite. +
+Please see `Contributing to RAJA `_ for more information. diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index 2a6bebebd..73ca99a01 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -6,6 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## +.. developer_guide: ######################################## RAJA Performance Suite Developer Guide ######################################## @@ -14,10 +15,17 @@ RAJA Performance Suite Developer Guide RAJA Performance Suite development follows nearly identical processes as RAJA development, which are documented in the `RAJA Developer Guide `_ +This guide is structured the same as the RAJA Developer Guide and refers to +sections in that guide for development topics. It also contains information about +differences in development and maintenance specific to the RAJA Performance +Suite. .. toctree:: :maxdepth: 1 -.. ## ci_tasks + contributing + branch_development + build_configurations + ci + release_process diff --git a/docs/sphinx/dev_guide/release_process.rst b/docs/sphinx/dev_guide/release_process.rst new file mode 100644 index 000000000..75b910935 --- /dev/null +++ b/docs/sphinx/dev_guide/release_process.rst @@ -0,0 +1,23 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## +
+.. _release-label: +
+************************************************************ +RAJA Performance Suite Release Process +************************************************************ +
+The RAJA Performance Suite uses the same release process as RAJA. In addition, +RAJA Performance Suite releases are part of a coordinated process with other +**RAJA Portability Suite** projects. Specifically, when new releases of those +projects are done, a release of the RAJA Performance Suite follows soon after. +
+Please see `RAJA Portability Suite Releases `_ for more information. +
+The version naming scheme for the RAJA Performance Suite is also the same +as for RAJA Portability Suite projects. 
Please see `RAJA Release Version Naming `_ for details. diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index f7947c484..f8f26bd3f 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -32,9 +32,7 @@ some information about each kernel will appear on the screen. More information about kernel and execution details will also appear in a run report files generated in the run directory after Suite execution completes. -.. note:: * You can pass the ``--dryrun`` command-line option to the executable - to see a summary of how the Suite will execute without actually - running it. +.. note:: You can pass the ``--dryrun`` command-line option to the executable to see a summary of how the Suite will execute without actually running it. The Suite can be run in a variety of ways determined by the command-line options passed to the executable. For example, you can run or exclude subsets From 56bd05a342243a19aa24cf2c80fe1058ca577aad Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Feb 2023 16:14:46 -0800 Subject: [PATCH 036/137] more partial use of DataSpaces --- src/common/DataUtils.cpp | 4 +- src/common/DataUtils.hpp | 390 +++++++++++++++++++++++++---------- src/common/RAJAPerfSuite.cpp | 2 - src/common/RAJAPerfSuite.hpp | 2 - 4 files changed, 282 insertions(+), 116 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 07802111a..dd051d2ce 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -361,8 +361,6 @@ void initData(Real_type& d, VariantID vid) incDataInitCount(); } -} // closing brace for detail namespace - /* * Calculate and return checksum for data arrays. */ @@ -432,4 +430,6 @@ long double calcChecksum(const Complex_ptr ptr, int len, return tchk; } +} // closing brace for detail namespace + } // closing brace for rajaperf namespace diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index ad58201db..b34cb3e07 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -45,6 +45,154 @@ void resetDataInitCount(); void incDataInitCount(); +/*! + * \brief Allocate data arrays. + */ +void allocHostData(Int_ptr& ptr, int len, int align, VariantID vid); +/// +void allocHostData(Index_type*& ptr, int len, int align, VariantID vid); +/// +void allocHostData(Real_ptr& ptr, int len, int align, VariantID vid); +/// +void allocHostData(Complex_ptr& ptr, int len, int align, VariantID vid); + +/*! + * \brief Free data arrays. + */ +void deallocHostData(Int_ptr& ptr, VariantID vid); +/// +void deallocHostData(Index_type*& ptr, VariantID vid); +/// +void deallocHostData(Real_ptr& ptr, VariantID vid); +/// +void deallocHostData(Complex_ptr& ptr, VariantID vid); + + +/*! + * \brief Initialize Int_type data array. + * + * Array entries are randomly initialized to +/-1. + * Then, two randomly-chosen entries are reset, one to + * a value > 1, one to a value < -1. + */ +void initData(Int_ptr& ptr, int len, + VariantID vid); + +/*! + * \brief Initialize Real_type data array. + * + * Array entries are set (non-randomly) to positive values + * in the interval (0.0, 1.0) based on their array position (index) + * and the order in which this method is called. + */ +void initData(Real_ptr& ptr, int len, + VariantID vid); + +/*! + * \brief Initialize Real_type data array. + * + * Array entries are set to given constant value. + */ +void initDataConst(Real_ptr& ptr, int len, Real_type val, + VariantID vid); + +/*! + * \brief Initialize Real_type data array with random sign. 
+ * + * Array entries are initialized in the same way as the method + * initData(Real_ptr& ptr...) above, but with random sign. + */ +void initDataRandSign(Real_ptr& ptr, int len, + VariantID vid); + +/*! + * \brief Initialize Real_type data array with random values. + * + * Array entries are initialized with random values in the interval [0.0, 1.0]. + */ +void initDataRandValue(Real_ptr& ptr, int len, + VariantID vid); + +/*! + * \brief Initialize Complex_type data array. + * + * Real and imaginary array entries are initialized in the same way as the + * method allocAndInitData(Real_ptr& ptr...) above. + */ +void initData(Complex_ptr& ptr, int len, + VariantID vid); + +/*! + * \brief Initialize Real_type scalar data. + * + * Data is set similarly to an array enttry in the method + * initData(Real_ptr& ptr...) above. + */ +void initData(Real_type& d, + VariantID vid); + + +/*! + * \brief Calculate and return checksum for data arrays. + * + * Checksums are computed as a weighted sum of array entries, + * where weight is a simple function of elemtn index. + * + * Checksumn is multiplied by given scale factor. + */ +long double calcChecksum(Int_ptr d, int len, + Real_type scale_factor = 1.0); +/// +long double calcChecksum(Real_ptr d, int len, + Real_type scale_factor = 1.0); +/// +long double calcChecksum(Complex_ptr d, int len, + Real_type scale_factor = 1.0); + +/*! + * \brief Get an host accessible data space for this dataSpace. + * + * Intended to be a space that is quick to copy to from the given space if + * the given space is not accessible on the Host. + */ +DataSpace hostAccessibleDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case dataSpace::Host: + case dataSpace::Omp: + case dataSpace::CudaPinned: + case dataSpace::HipHostAdviseFine: + case dataSpace::HipHostAdviseCoarse: + case dataSpace::HipPinned: + case dataSpace::HipPinnedFine: + case dataSpace::HipPinnedCoarse: + return dataSpace; + + case dataSpace::OmpTarget: + return dataSpace::Host; + + case dataSpace::CudaManaged: + case dataSpace::CudaDevice: + return dataSpace::CudaPinned; + + case dataSpace::HipManaged: + case dataSpace::HipManagedAdviseFine: + case dataSpace::HipManagedAdviseCoarse: + return dataSpace; + + case dataSpace::HipDevice: + case dataSpace::HipDeviceFine: + return dataSpace::HipPinned; + + default: + { + throw std::invalid_argument("accessibleDataSpace : Unknown memory type"); + } break; + } +} + +} // closing brace for detail namespace + /*! @@ -56,8 +204,6 @@ void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID vid) switch (dataSpace) { case dataSpace::Host: case dataSpace::Omp: - case dataSpace::CudaHost: - case dataSpace::HipHost: { allocHostData(ptr, len, align, vid); } break; @@ -141,8 +287,6 @@ void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) switch (dataSpace) { case dataSpace::Host: case dataSpace::Omp: - case dataSpace::CudaHost: - case dataSpace::HipHost: case dataSpace::HipHostAdviseFine: case dataSpace::HipHostAdviseCoarse: { @@ -201,23 +345,94 @@ void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) } } +/*! + * \brief Copy data from one array to another. 
+ */ +template +void copyData(DataSpace dst_dataSpace, T* dst_ptr, + DataSpace src_dataSpace, const T* src_ptr, + int len, VariantID vid) +{ + + switch (dst_dataSpace) { + case dataSpace::Host: + case dataSpace::Omp: + case dataSpace::HipHostAdviseFine: + case dataSpace::HipHostAdviseCoarse: + { + copyHostData(dst_ptr, src_ptr, sizeof(T)*len, vid); + } break; + + case dataSpace::OmpTarget: + { + copyOpenMPDeviceData(dst_ptr, src_ptr, sizeof(T)*len, did); + } break; + + case dataSpace::CudaPinned: + case dataSpace::CudaManaged: + case dataSpace::CudaDevice: + { + copyCudaData(dst_ptr, src_ptr, sizeof(T)*len, vid); + } break; + + case dataSpace::HipPinned: + case dataSpace::HipPinnedFine: + case dataSpace::HipPinnedCoarse: + case dataSpace::HipManaged: + case dataSpace::HipManagedAdviseFine: + case dataSpace::HipManagedAdviseCoarse: + case dataSpace::HipDevice: + case dataSpace::HipDeviceFine: + { + copyHipData(dst_ptr, src_ptr, sizeof(T)*len, vid); + } break; + default: + { + throw std::invalid_argument("copyData : Unknown memory type"); + } break; + } +} /*! - * \brief Allocate and initialize Int_type data array. - * - * Array is initialized using method initData(Int_ptr& ptr...) below. + * \brief Move data array into new dataSpace. */ -void allocAndInitData(Int_ptr& ptr, int len, int align, - VariantID vid); +template +void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, + T*& ptr, int len, int align, VariantID vid) +{ + if (new_dataSpace != old_dataSpace) { + + T* new_ptr = nullptr; + + detail::allocData(new_dataSpace, new_ptr, len, align, vid); + + detail::copyData(new_dataSpace, new_ptr, old_dataSpace, ptr, len, vid); + + detail::deallocData(old_dataSpace, ptr, len, vid); + + ptr = new_ptr; + } +} /*! - * \brief Allocate and initialize aligned Real_type data array. - * - * Array is initialized using method initData(Real_ptr& ptr...) below. + * \brief Allocate and initialize data array. */ -void allocAndInitData(Real_ptr& ptr, int len, int align, - VariantID vid); +template +void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align, + VariantID vid) +{ + DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); + + detail::allocData(init_dataSpace, ptr, len, align, vid); + + detail::initData(ptr, len, vid); + + if (init_dataSpace != dataSpace) { + detail::moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + } +} + /*! * \brief Allocate and initialize aligned Real_type data array. @@ -245,117 +460,72 @@ void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, VariantID vid); -/*! - * \brief Allocate and initialize aligned Complex_type data array. - */ -void allocAndInitData(Complex_ptr& ptr, int len, int align, - VariantID vid); -/*! - * \brief Allocate data arrays. +/* + * Allocate and initialize aligned data arrays. */ -void allocHostData(Int_ptr& ptr, int len, int align, VariantID vid); -/// -void allocHostData(Index_type*& ptr, int len, int align, VariantID vid); -/// -void allocHostData(Real_ptr& ptr, int len, int align, VariantID vid); -/// -void allocHostData(Complex_ptr& ptr, int len, int align, VariantID vid); +void allocAndInitData(Int_ptr& ptr, int len, int align, VariantID vid) +{ + allocHostData(ptr, len, align, vid); + initData(ptr, len, vid); +} -/*! - * \brief Free data arrays. 
- */ -void deallocHostData(Int_ptr& ptr, VariantID vid); -/// -void deallocHostData(Index_type*& ptr, VariantID vid); -/// -void deallocHostData(Real_ptr& ptr, VariantID vid); -/// -void deallocHostData(Complex_ptr& ptr, VariantID vid); +void allocAndInitData(Real_ptr& ptr, int len, int align, VariantID vid ) +{ + allocHostData(ptr, len, align, vid); + initData(ptr, len, vid); +} +void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) +{ + allocHostData(ptr, len, align, vid); + initData(ptr, len, vid); +} -/*! - * \brief Initialize Int_type data array. - * - * Array entries are randomly initialized to +/-1. - * Then, two randomly-chosen entries are reset, one to - * a value > 1, one to a value < -1. - */ -void initData(Int_ptr& ptr, int len, - VariantID vid); +void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, + VariantID vid) +{ + allocHostData(ptr, len, align, vid); + initDataConst(ptr, len, val, vid); +} -/*! - * \brief Initialize Real_type data array. - * - * Array entries are set (non-randomly) to positive values - * in the interval (0.0, 1.0) based on their array position (index) - * and the order in which this method is called. - */ -void initData(Real_ptr& ptr, int len, - VariantID vid); +void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, VariantID vid) +{ + allocHostData(ptr, len, align, vid); + initDataRandSign(ptr, len, vid); +} -/*! - * \brief Initialize Real_type data array. - * - * Array entries are set to given constant value. - */ -void initDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid); +void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, VariantID vid) +{ + allocHostData(ptr, len, align, vid); + initDataRandValue(ptr, len, vid); +} -/*! - * \brief Initialize Real_type data array with random sign. - * - * Array entries are initialized in the same way as the method - * initData(Real_ptr& ptr...) above, but with random sign. - */ -void initDataRandSign(Real_ptr& ptr, int len, - VariantID vid); -/*! - * \brief Initialize Real_type data array with random values. - * - * Array entries are initialized with random values in the interval [0.0, 1.0]. - */ -void initDataRandValue(Real_ptr& ptr, int len, - VariantID vid); -/*! - * \brief Initialize Complex_type data array. - * - * Real and imaginary array entries are initialized in the same way as the - * method allocAndInitData(Real_ptr& ptr...) above. - */ -void initData(Complex_ptr& ptr, int len, - VariantID vid); -/*! - * \brief Initialize Real_type scalar data. - * - * Data is set similarly to an array enttry in the method - * initData(Real_ptr& ptr...) above. +/* + * Calculate and return checksum for arrays. */ -void initData(Real_type& d, - VariantID vid); +template +long double calcChecksum(DataSpace dataSpace, const T* ptr, int len, + Real_type scale_factor, VariantID vid) +{ + const T* check_ptr = ptr; -} // closing brace for detail namespace + DataSpace check_dataSpace = hostAccessibleDataSpace(dataSpace); + if (check_dataSpace != dataSpace) { + allocData(check_dataSpace, check_ptr, len, vid); + } -/*! - * \brief Calculate and return checksum for data arrays. - * - * Checksums are computed as a weighted sum of array entries, - * where weight is a simple function of elemtn index. - * - * Checksumn is multiplied by given scale factor. 
- */ -long double calcChecksum(Int_ptr d, int len, - Real_type scale_factor = 1.0); -/// -long double calcChecksum(Real_ptr d, int len, - Real_type scale_factor = 1.0); -/// -long double calcChecksum(Complex_ptr d, int len, - Real_type scale_factor = 1.0); + auto val = detail::calcChecksum(check_ptr, len, scale_factor); + + if (check_dataSpace != dataSpace) { + deallocData(check_dataSpace, check_ptr, vid); + } + return val; +} /*! * \brief Holds a RajaPool object and provides access to it via a diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index c073e3c37..ec0aedb8e 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -338,12 +338,10 @@ static const std::string DataSpaceNames [] = std::string("OmpTarget"), - std::string("CudaHost"), std::string("CudaPinned"), std::string("CudaManaged"), std::string("CudaDevice"), - std::string("HipHost"), std::string("HipHostAdviseFine"), std::string("HipHostAdviseCoarse"), std::string("HipPinned"), diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 0f9279688..1ff430f66 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -254,12 +254,10 @@ enum struct DataSpace { OmpTarget, - CudaHost, CudaPinned, CudaManaged, CudaDevice, - HipHost, HipHostAdviseFine, HipHostAdviseCoarse, HipPinned, From eef5e6d998d4aefab391e454b017659f66b16255 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 23 Feb 2023 12:06:49 -0800 Subject: [PATCH 037/137] Proofread and new content. --- docs/sphinx/dev_guide/contributing.rst | 13 +++++++++---- docs/sphinx/dev_guide/index.rst | 12 ++++++------ docs/sphinx/dev_guide/structure.rst | 25 +++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 docs/sphinx/dev_guide/structure.rst diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index a2a1f9be1..534599b74 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -12,9 +12,15 @@ Contributing to the RAJA Performance Suite ******************************************** -Similar to RAJA, the RAJA Performance Suite is a collaborative open-source -software project and we encourage contributions from anyone who wants to add -features or improve its capabilities. This section describes the following: +The most important things to know about making a contribution to the +RAJA Performance Suite involve the organization of the source code and +how it works, especially when adding a new Kernel or Variant. These items +are described in detail in :ref:`structure-label`. + +At a more basic level, the RAJA Performance Suite is a collaborative +open-source software project, similar to RAJA, and we encourage contributions +from anyone who wants to add features or improve its capabilities. This +section describes the following: * GitHub project access * How to develop a RAJA *pull request* (PR) contribution. @@ -28,7 +34,6 @@ and `GitHub `_, which is where our project is hosted. all test checks must pass and the PR must be approved by at least one member of the core RAJA team. - Access to the RAJA Performance Suite GitHub project and acceptance of contributions to it are managed identically to the RAJA project. 
In the RAJA Developer Guide, you will find information on accessing the RAJA GitHub diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index 73ca99a01..94e872212 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -12,14 +12,14 @@ RAJA Performance Suite Developer Guide ######################################## -RAJA Performance Suite development follows nearly identical processes as -RAJA development, which are documented in the +The RAJA Performance Suite project follows nearly identical development +processes as RAJA, which are documented in the `RAJA Developer Guide `_ -This guide is structured the same as the RRAJA Developer Guide and refers to -sections in that on development topics. It also contains information about -differences in development and maintenance specific to the RAJA Performance -Suite. +This guide is structured similarly to the RAJA Developer Guide and refers to +sections in that on various development topics to avoid redundancy. It does +contains information about differences in development and maintenance +practices that are specific to the RAJA Performance Suite. .. toctree:: :maxdepth: 1 diff --git a/docs/sphinx/dev_guide/structure.rst b/docs/sphinx/dev_guide/structure.rst new file mode 100644 index 000000000..142fbe6b8 --- /dev/null +++ b/docs/sphinx/dev_guide/structure.rst @@ -0,0 +1,25 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _structure-label: + +************************************************************************ +RAJA Performance Suite Structure: Adding Kernels, Variants, and Tunings +************************************************************************ + +This section describes how to add new Kernels, Variants, and Tunings to the +Suite. The discussion should make clear the organization of the code and +how it works, which is useful to understand when making a contribution. + +It is important to note that Group and Feature modifications are not required +unless a new Group or exercised RAJA Feature is added when a new Kernel is +introduced. + +It is also essential that the appropriate targets are updated in the +appropriate ``CMakeLists.txt`` files when files are added to the Suite so +that they will be compiled. From 40e6fac393834b2d698699d652220310f5fe4676 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 23 Feb 2023 13:47:26 -0800 Subject: [PATCH 038/137] Clean up and improve readability --- docs/sphinx/dev_guide/branch_development.rst | 7 ++-- .../sphinx/dev_guide/build_configurations.rst | 4 +-- docs/sphinx/dev_guide/ci.rst | 23 ++++++------ docs/sphinx/dev_guide/contributing.rst | 35 +++++++------------ docs/sphinx/dev_guide/index.rst | 4 +-- docs/sphinx/dev_guide/release_process.rst | 11 +++--- 6 files changed, 34 insertions(+), 50 deletions(-) diff --git a/docs/sphinx/dev_guide/branch_development.rst b/docs/sphinx/dev_guide/branch_development.rst index 6f0a4ad0d..8d2e04437 100644 --- a/docs/sphinx/dev_guide/branch_development.rst +++ b/docs/sphinx/dev_guide/branch_development.rst @@ -12,7 +12,6 @@ RAJA Performance Suite Branch Development ******************************************** -The RAJA Performance Suite manages repository branches and uses branch -development in a manner identical to the RAJA project. - -Please see `RAJA Branch Development `_ for details. 
+The RAJA Performance Suite project manages repository branches and does branch +development in a manner identical to the RAJA project. Please see +`RAJA Branch Development `_ for more information. diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst index a9a2a27ff..7ce70decf 100644 --- a/docs/sphinx/dev_guide/build_configurations.rst +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -13,6 +13,4 @@ RAJA Performance Suite Build Configurations ******************************************** Build configurations for the RAJA Performance Suite are maintained and -used similarly to RAJA - -Please see `RAJA Build Configurations `_ for details. +used similarly to RAJA. Please see `RAJA Build Configurations `_ for details. diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index e96feb06d..2db276d6e 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -12,22 +12,21 @@ RAJA Performance Suite Continuous Integration (CI) Testing ************************************************************ -The RAJA Performance Suite uses the same continuous integration tools as RAJA. +The RAJA Performance Suite project uses the same continuous integration tools +as the RAJA project, namely Azure Pipelines and GitLab CI in the Livermore +Computing Collaboration Zone (LC CZ). Please see `RAJA Continuous Integration Testing `_ for more information. One important difference to note is that the RAJA Performance Suite inherits most of its support for GitLab CI testing from its RAJA submodule. As a result, -RAJA submodules that support RAJA GitLab CI, such as +submodules that support RAJA GitLab CI, such as `Uberenv `_ and `RADIUSS Spack Configs `_, -do not appear in the RAJA Performance Suite repository. However, the -RAJA Performance Suite does include files that are specific to the project -and play the same roles and follow the structure as the similarly named -files in the RAJA repository. +do not appear in the RAJA Performance Suite repository. -Files that support Azure Pipelines testing for the RAJA Performance Suite -are also maintained in the project repository. - -Please see `RAJA Continuous Integration Testing `_ for more information. +The RAJA Performance Suite project does include files that support GitLab and +Azure Pipelines CI testing that are specific to the project. These file are +similar to those in the RAJA project and play the same roles and follow the +same structure as in the RAJA project. Such files are described in `RAJA Continuous Integration Testing `_. .. _ci_tasks-label: @@ -36,6 +35,6 @@ Continuous Integration (CI) Testing Maintenance Tasks ****************************************************** Tasks for maintaining continuous integration in the RAJA Performance Suite -are similar to those for RAJA. Please see `RAJA Continuous Integration Testing -Maintenance Tasks `_ for more information. +project are similar to those for RAJA. Please see +`RAJA Continuous Integration Testing Maintenance Tasks `_ for details. 
diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index 534599b74..74f86d3cd 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -12,33 +12,22 @@ Contributing to the RAJA Performance Suite ******************************************** -The most important things to know about making a contribution to the -RAJA Performance Suite involve the organization of the source code and -how it works, especially when adding a new Kernel or Variant. These items -are described in detail in :ref:`structure-label`. - -At a more basic level, the RAJA Performance Suite is a collaborative -open-source software project, similar to RAJA, and we encourage contributions -from anyone who wants to add features or improve its capabilities. This -section describes the following: - - * GitHub project access - * How to develop a RAJA *pull request* (PR) contribution. - * Requirements that must be met for a PR to be merged. - -We assume contributors are familiar with -`Git `_, which we use for source code version control, -and `GitHub `_, which is where our project is hosted. - -.. important:: * Before a PR can be merged into the RAJA Performance Suite, - all test checks must pass and the PR must be approved by at - least one member of the core RAJA team. +Like RAJA, the RAJA Performance Suite is a collaborative open-source software +project. We encourage contributions from anyone who wants to add features +or improve RAJA Performance Suite capabilities. Contributors should be familiar +with `Git `_, which is the tool we use for source code +version control, and `GitHub `_, which is where the +project is hosted. Access to the RAJA Performance Suite GitHub project and acceptance of contributions to it are managed identically to the RAJA project. In the RAJA Developer Guide, you will find information on accessing the RAJA GitHub project, restrictions on a fork of the repository, contributions and pull requests. Please follow the RAJA processes for contributions to the -RAJA Performance Suite. +RAJA Performance Suite. Please see `Contributing to RAJA `_ for details. + +The most important things to know about making a contribution to the +RAJA Performance Suite involve the organization of the source code and +how it works, especially when adding a new Kernel or Variant. These items +are described in detail in :ref:`structure-label`. -Please see `Contributing to RAJA `_ for more information. diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index 94e872212..11f572a4a 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -17,8 +17,8 @@ processes as RAJA, which are documented in the `RAJA Developer Guide `_ This guide is structured similarly to the RAJA Developer Guide and refers to -sections in that on various development topics to avoid redundancy. It does -contains information about differences in development and maintenance +sections in that on various development topics to avoid redundancy. In +addition, it contains information about development and maintenance practices that are specific to the RAJA Performance Suite. .. 
toctree:: diff --git a/docs/sphinx/dev_guide/release_process.rst b/docs/sphinx/dev_guide/release_process.rst index 75b910935..8b1942758 100644 --- a/docs/sphinx/dev_guide/release_process.rst +++ b/docs/sphinx/dev_guide/release_process.rst @@ -12,12 +12,11 @@ RAJA Performance Suite Release Process ************************************************************ -The RAJA Performance Suite uses the same release process as RAJA. In addition, -RAJA Performance Suite releases are part of a coordinated process with other -**RAJA Portability Suite** projects. Specifically, when new releases of those -projects are done, a release of the RAJA Performance Suite follows soon after. - -Please see `RAJA Portability Suite Releases `_ for more information. +The RAJA Performance Suite project uses the same release process as RAJA +project and RAJA Performance Suite releases are part of the coordinated +release process for other **RAJA Portability Suite** projects. Specifically, +when new releases of those projects are done, a release of the RAJA +Performance Suite follows soon after. Please see `RAJA Portability Suite Releases `_ for more information. The version naming scheme for the RAJA Performance Suite is also the same as for RAJA Portability Suite projects. Please see `RAJA Release Version Naming `_ for details. From a5562bbfe62c5c5e2595bf020f13fe0dd5bb8b25 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 23 Feb 2023 16:41:27 -0800 Subject: [PATCH 039/137] New content for adding kernels --- docs/sphinx/dev_guide/index.rst | 1 + docs/sphinx/dev_guide/structure.rst | 158 ++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 8 deletions(-) diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index 11f572a4a..f4fa1520c 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -29,3 +29,4 @@ practices that are specific to the RAJA Performance Suite. build_configurations ci release_process + structure diff --git a/docs/sphinx/dev_guide/structure.rst b/docs/sphinx/dev_guide/structure.rst index 142fbe6b8..963f18930 100644 --- a/docs/sphinx/dev_guide/structure.rst +++ b/docs/sphinx/dev_guide/structure.rst @@ -12,14 +12,156 @@ RAJA Performance Suite Structure: Adding Kernels, Variants, and Tunings ************************************************************************ -This section describes how to add new Kernels, Variants, and Tunings to the -Suite. The discussion should make clear the organization of the code and +This section describes how to add Kernels, Variants, Groups, and Tunings to the +Suite. The discussion aims to make clear the organization of the code and how it works, which is useful to understand when making a contribution. -It is important to note that Group and Feature modifications are not required -unless a new Group or exercised RAJA Feature is added when a new Kernel is -introduced. +All kernel and RAJA Performance Suite infrastructure files reside in the +``src`` directory of the project. If you list the contents of that directory, +you will see the following:: + + $ ls -c1 -F + lcals/ + stream/ + stream-kokkos/ + rajaperf_config.hpp.in + polybench/ + lcals-kokkos/ + common/ + basic/ + basic-kokkos/ + apps/ + algorithm/ + RAJAPerfSuiteDriver.cpp + CMakeLists.txt + +Each directory contains files for kernels in the Group associated with the +directory name. 
For example, the ``lcals`` directory contains kernels from +the LCALS benchmark suite, the ``stream`` directory contains kernels from a +stream benchmark suite, and so on. The one exception is the ``common`` +directory, which contains the Suite infrastructure files and utilities used +throughout the Suite. + +The following discussion describes how to modify and add files with new content +in the Suite. + +.._structure_addkernel-label: + +================ +Adding a Kernel +================ + +Adding a kernel to the Suite involves five main steps: + +#. Add a unique kernel ID and a unique kernel name to the Suite. +#. If the kernel is part of a new kernel group or exercises a new RAJA feature, + add a unique group ID and group name. +#. If the kernel exercises a RAJA feature that is not currently used in the + Suite, add a unique feature ID and feature name. +#. Implement a kernel class that defines all operations needed to run it, + along with source files that contain kernel variant implementations, + organized as described below. +#. Add appropriate targets to the `CMakeLists.txt`` files when new kernel + files are added to the Suite so that they will be compiled. + +These steps are described in the following sections. + +.._structure_addkernel_name-label: + +Adding a kernel ID and name +---------------------------- + +Two key pieces of information are used to identify each kernel in the Suite: +the group in which it resides and the name of the kernel itself. The files +``RAJAPerfSuite.hpp`` and ``RAJAPerfSuite.cpp`` in the ``src/common`` +directory define kernel IDs as enumeration values and arrays of strings for +kernel names, respectively. + +For concreteness in the following discussion, we describe how one would add +the ``ADD`` kernel, which already exists in the Suite in the "Stream" kernel +group. + +First, add an enumeration value identifier for the kernel, that is unique +among all Suite kernels, in the enum ``KernelID`` in the +``src/common/RAJAPerfSuite.hpp`` header file:: + + enum KernelID { + + ... + + // + // Stream kernels... + // + Stream_ADD, + ... + + }; + +Several conventions are important to note because following them will ensure +that the kernel works properly within the RAJA Performance Suite machinery. + +.. note:: * The enumeration value label for a kernel is the **group name followed by the kernel name separated by an underscore**. + * Kernel ID enumeration values for kernels in the same group must + appear consecutively in the enumeration. + * Kernel ID enumeration labels must in alphabetical order, with + respect to the base kernel name in each group. + +Second, add the kernel name to the array of strings ``KernelNames`` in the +``src/common/RAJAPerfSuite.cpp`` source file:: + + static const std::string KernelNames [] = + { + + ... + + // + // Stream kernels... + // + std::string("Stream_ADD"), + ... + + }; + +Again, several conventions are important to note because following them will +ensure that the kernel works properly within the RAJA Performance Suite +machinery. + +.. note:: * The kernel string name is just a string version of the kernel ID. + * The values in the ``KernelID`` enum must match the strings in the + ``KernelNames`` array one-to-one and in the same order. + +Typically, adding a new Group or Feature when adding a Kernel is not required. +One or both of these needs to be added only if the Kernel is not part of an +existing Suite Group, or exercises a RAJA Feature that is not used in an +existing Kernel. 
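+
+To recap, the two edits described in this section must stay paired. The
+following sketch shows roughly how the entries would look for a hypothetical
+new kernel named **FOO** in the **Stream** group (the kernel name is
+illustrative only and surrounding entries are elided)::
+
+  // In src/common/RAJAPerfSuite.hpp (enum KernelID):
+  Stream_ADD,
+  Stream_FOO,   // group name + underscore + kernel name
+
+  // In src/common/RAJAPerfSuite.cpp (KernelNames array):
+  std::string("Stream_ADD"),
+  std::string("Stream_FOO"),  // string form of the ID, same order as the enum
+
+If the enumeration and the string array fall out of order, or differ in
+length, kernel IDs and names will no longer correspond, so it is worth
+double-checking this pairing whenever a kernel is added.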
+ +.._structure_addkernel_group-label: + +Add a new group +---------------------------- + +If a kernel is added as part of a new group of kernels in the Suite, a new +value must be added to the ``GroupID`` enum in the ``RAJAPerfSuite.hpp`` +header file and an associated group string name must be added to the +``GroupNames`` string array in the ``RAJAPerfSuite.cpp`` source file. The +process is similar to adding a new kernel ID and name described above. +Again, the enumeration values and string array entries must be kept consistent, +in the same order and matching one-to-one. + +.._structure_addkernel_feature-label: + +Add a new Feature +---------------------------- + +If a kernel is added that exercises a RAJA Feature that is not used in an +existing Kernel, a new value must be added to the ``FeatureID`` enum in the +``RAJAPerfSuite.hpp`` header file and an associated feature string name must +be added to the ``FeatureNames`` string array in the ``RAJAPerfSuite.cpp`` +source file. The process is similar to adding a new kernel ID and name +described above. Again, the enumeration values and string array entries must +be kept consistent, in the same order and matching one-to-one. + + + + -It is also essential that the appropriate targets are updated in the -appropriate ``CMakeLists.txt`` files when files are added to the Suite so -that they will be compiled. From 267ee1140146b231af08597c221596e197563e07 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 24 Feb 2023 14:02:39 -0800 Subject: [PATCH 040/137] Cleanup and add discussion of kernel class implementation --- docs/sphinx/dev_guide/index.rst | 1 + docs/sphinx/dev_guide/kernel_class.rst | 116 +++++++++++++++++++++++++ docs/sphinx/dev_guide/structure.rst | 97 ++++++++++----------- src/stream/ADD.hpp | 2 + 4 files changed, 167 insertions(+), 49 deletions(-) create mode 100644 docs/sphinx/dev_guide/kernel_class.rst diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index f4fa1520c..ce465ee35 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -30,3 +30,4 @@ practices that are specific to the RAJA Performance Suite. ci release_process structure + kernel_class diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst new file mode 100644 index 000000000..c6cc74f28 --- /dev/null +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -0,0 +1,116 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _kernel_class-label: + +==================== +Kernel Class Files +==================== + +Each kernel in the Suite is implemented in a class whose header and +implementation files reside in ``src`` subdirectory named for the group in +which the kernel lives. A kernel class is responsible for implementing all +operations that manage data, execute, and record execution timing, checksum, +and other information for each variant and tuning of a kernel. To properly +integrate into the RAJA Performance Suite framework, the kernel class must be +a subclass of the ``KernelBase`` base class that defines the interface for +kernels in the Suite. The ``KernelBase.hpp`` header file resides in the +``src/common`` directory. 
+ +Continuing with the example we started discussing above, we add the +``ADD.hpp`` header file for the **ADD** class to the ``stream`` directory +along with multiple implementation files. We describe the contents of these +files in the following sections: + + * ``ADD.cpp`` contains methods to set up and tear down the memory for the + **ADD** kernel, and compute and record a checksum on the result after + it executes. It also specifies **ADD** kernel information in the ``ADD`` + class constructor. + * ``ADD-Seq.cpp`` contains sequential CPU variants and tunings of the kernel. + * ``ADD-OMP.cpp`` contains OpenMP CPU multithreading variants and tunings of + the kernel. + * ``ADD-OMPTarget.cpp`` contains OpenMP target offload variants and tunings + of the kernel. + * ``ADD-Cuda.cpp`` contains CUDA GPU variants and tunings of the kernel. + * ``FOO-Hip.cpp`` contains HIP GPU variants and tunings of the kernel. + +.. note:: All kernels in the Suite follow the same file organization and + implementation pattern. Inspection of the files for any kernel + helps to understand the overall organization. + +.. important:: If a new execution back-end variant is added that is not listed + here, that variant should be placed in a file named to clearly + distinguish the back-end implementation, such as + ``ADD-.cpp``. Keeping the variants for each back-end + in a separate file helps to understand compiler optimizations + when looking at generated assembly code, for example. + +------------------------- +Kernel class header file +------------------------- + +In its entirety, the **ADD** kernel class header file is: + +.. literalinclude:: ../../../src/stream/ADD.hpp + :start-after: _add_header_start + :end-before: _add_header_end + :language: C++ + +The key ingredients of a kernel class header file are: + + * **Copyright statement** at the top of the file. + + .. note:: Each file in the RAJA Performance Suite must start with a + boilerplate comment for the project copyright information. + + * **Reference implementation**, which is a comment section that shows the + kernel as it appears in the original code in which it was taken. This is + helpful to understand the origin and intent of the original. + + * **Uniquely-named header file include guard, that contains the contents of + the file. + + * **Macro definitions** that contain source lines of code that appear in + multiple places in the kernel class implementation. While it obfuscates + the code somewhat, it is useful to reduce the amount of code to maintain + and ensure consistency. + + * **Class definition** derived from the ``KernelBase`` class. We describe + this in more detail below. + +.. note:: * All types, methods, etc. in the RAJA Performance Suite reside in + the ``rajaperf`` namespace. + * In addition, each kernel class lives in the namespace of the + kernel group of which the kernel is a member. For example, + here, the ``ADD`` class is in the ``stream`` namespace. + * Each kernel class **must** be derived from the ``KernelBase`` + class so that the kernel implementation integrates properly into + the Suite. + +The class must provide a constructor that takes a reference to a ``RunParams`` +object, which contains input parameters for running the Suite -- we'll say more +about this later. The class constructor may or may not allocate storage for +a class object. If it does, the storage should be deallocated in the class +destructor. 
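+
+Putting these pieces together, a kernel class declaration typically has the
+shape sketched below. The kernel name, group namespace, and data members are
+illustrative, and the tuning index is shown here as ``size_t``; see
+``ADD.hpp`` and ``KernelBase.hpp`` for the authoritative interface. The
+individual methods are described in the following paragraphs and sections::
+
+  // Copyright banner, include guard, reference implementation comment,
+  // and kernel-body macros omitted from this sketch.
+  #include "common/KernelBase.hpp"
+
+  namespace rajaperf {
+  namespace stream {
+
+  class FOO : public KernelBase   // hypothetical kernel in the stream group
+  {
+  public:
+    FOO(const RunParams& params);
+    ~FOO();
+
+    // Pure virtual KernelBase methods each kernel must implement; each takes
+    // the variant to run and a tuning index.
+    void setUp(VariantID vid, size_t tune_idx);
+    void updateChecksum(VariantID vid, size_t tune_idx);
+    void tearDown(VariantID vid, size_t tune_idx);
+
+    // One run method per back-end variant set (OpenMP, CUDA, HIP, etc.
+    // omitted from this sketch).
+    void runSeqVariant(VariantID vid, size_t tune_idx);
+
+  private:
+    Real_ptr m_a;   // kernel data arrays; class members use the m_ prefix
+    Real_ptr m_b;
+    Real_ptr m_c;
+  };
+
+  } // end namespace stream
+  } // end namespace rajaperf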
+ +Several methods in the ``KernelBase`` class are pure virtual and the derived +kernel class must provide implementations of those methods. These methods +take a ``VariantID`` argument and a tuning index. They include: ``setUp``, +``updateChecksum``, and ``tearDown``, and methods to run the different kernel +variants. While these method names are descriptive of what they do, we'll +provide more details about them when we describe the class implementation in +the next section. + +Other methods in the code above, such as ``setCudaTuningDefinitions`` are +virtual in the ``KernelBase`` class and so they may be provided optionally by +the kernel class for kernel specific operations. + +Lastly, any data members used in the class implementation are defined, +typically in a ``private`` member section so they don't *bleed* out of the +kernel class. + diff --git a/docs/sphinx/dev_guide/structure.rst b/docs/sphinx/dev_guide/structure.rst index 963f18930..d42e703c6 100644 --- a/docs/sphinx/dev_guide/structure.rst +++ b/docs/sphinx/dev_guide/structure.rst @@ -16,9 +16,9 @@ This section describes how to add Kernels, Variants, Groups, and Tunings to the Suite. The discussion aims to make clear the organization of the code and how it works, which is useful to understand when making a contribution. -All kernel and RAJA Performance Suite infrastructure files reside in the -``src`` directory of the project. If you list the contents of that directory, -you will see the following:: +All files containing RAJA Performance Suite infrastructure and kernels reside +in the ``src`` directory of the project. If you list the contents of that +directory, you will see the following:: $ ls -c1 -F lcals/ @@ -39,13 +39,13 @@ Each directory contains files for kernels in the Group associated with the directory name. For example, the ``lcals`` directory contains kernels from the LCALS benchmark suite, the ``stream`` directory contains kernels from a stream benchmark suite, and so on. The one exception is the ``common`` -directory, which contains the Suite infrastructure files and utilities used -throughout the Suite. +directory, which contains files that implement the Suite infrastructure and +utilities, such as data management routines, used throughout the Suite. -The following discussion describes how to modify and add files with new content -in the Suite. +The following discussion describes how to modify and add files with new +content to the Suite, such as new kernels, variants, etc. -.._structure_addkernel-label: +.. _structure_addkernel-label: ================ Adding a Kernel @@ -58,28 +58,27 @@ Adding a kernel to the Suite involves five main steps: add a unique group ID and group name. #. If the kernel exercises a RAJA feature that is not currently used in the Suite, add a unique feature ID and feature name. -#. Implement a kernel class that defines all operations needed to run it, - along with source files that contain kernel variant implementations, - organized as described below. -#. Add appropriate targets to the `CMakeLists.txt`` files when new kernel - files are added to the Suite so that they will be compiled. +#. Implement a kernel class that defines all operations needed to integrate + the kernel into the Suite. This includes adding the kernel class header + file and source files that contain kernel variant implementations. +#. Add appropriate targets to ``CMakeLists.txt`` files, where needed so + that the new kernel code will be compiled when the Suite is built. 
-These steps are described in the following sections. +We describe the steps in the following sections. -.._structure_addkernel_name-label: +.. _structure_addkernel_name-label: Adding a kernel ID and name ---------------------------- Two key pieces of information are used to identify each kernel in the Suite: -the group in which it resides and the name of the kernel itself. The files -``RAJAPerfSuite.hpp`` and ``RAJAPerfSuite.cpp`` in the ``src/common`` -directory define kernel IDs as enumeration values and arrays of strings for -kernel names, respectively. +the group in which it resides and the name of the kernel. Kernel IDs and +names are maintained in the files ``RAJAPerfSuite.hpp`` and +``RAJAPerfSuite.cpp``, respectively, which reside in the ``src/common`` +directory. -For concreteness in the following discussion, we describe how one would add -the ``ADD`` kernel, which already exists in the Suite in the "Stream" kernel -group. +For concreteness, we describe how one would add the **ADD** kernel, which +already exists in the Suite in the **Stream** kernel group. First, add an enumeration value identifier for the kernel, that is unique among all Suite kernels, in the enum ``KernelID`` in the @@ -97,16 +96,7 @@ among all Suite kernels, in the enum ``KernelID`` in the }; -Several conventions are important to note because following them will ensure -that the kernel works properly within the RAJA Performance Suite machinery. - -.. note:: * The enumeration value label for a kernel is the **group name followed by the kernel name separated by an underscore**. - * Kernel ID enumeration values for kernels in the same group must - appear consecutively in the enumeration. - * Kernel ID enumeration labels must in alphabetical order, with - respect to the base kernel name in each group. - -Second, add the kernel name to the array of strings ``KernelNames`` in the +Second, add the kernel name to the array of strings ``KernelNames`` in the ``src/common/RAJAPerfSuite.cpp`` source file:: static const std::string KernelNames [] = @@ -118,26 +108,32 @@ Second, add the kernel name to the array of strings ``KernelNames`` in the // Stream kernels... // std::string("Stream_ADD"), - ... + ... }; -Again, several conventions are important to note because following them will -ensure that the kernel works properly within the RAJA Performance Suite -machinery. +Several conventions are important to note for a kernel ID and name. Following +them will ensure that the kernel integrates properly into the RAJA Performance +Suite machinery. -.. note:: * The kernel string name is just a string version of the kernel ID. +.. note:: * The enumeration value label for a kernel is the **group name followed by the kernel name separated by an underscore**. + * Kernel ID enumeration values for kernels in the same group must + appear consecutively in the enumeration. + * Kernel ID enumeration labels must in alphabetical order, with + respect to the base kernel name in each group. + * The kernel string name is just a string version of the kernel ID. * The values in the ``KernelID`` enum must match the strings in the ``KernelNames`` array one-to-one and in the same order. -Typically, adding a new Group or Feature when adding a Kernel is not required. +Typically, adding a new Group or Feature is not needed when adding a Kernel. One or both of these needs to be added only if the Kernel is not part of an -existing Suite Group, or exercises a RAJA Feature that is not used in an -existing Kernel. 
+existing Group of kernels, or exercises a RAJA Feature that is not used in an +existing Kernel. For completeness, we describe the addition of a new group and +feature in case either is needed. -.._structure_addkernel_group-label: +.. _structure_addkernel_group-label: -Add a new group +Adding a group ---------------------------- If a kernel is added as part of a new group of kernels in the Suite, a new @@ -145,21 +141,24 @@ value must be added to the ``GroupID`` enum in the ``RAJAPerfSuite.hpp`` header file and an associated group string name must be added to the ``GroupNames`` string array in the ``RAJAPerfSuite.cpp`` source file. The process is similar to adding a new kernel ID and name described above. -Again, the enumeration values and string array entries must be kept consistent, -in the same order and matching one-to-one. -.._structure_addkernel_feature-label: +.. note:: Enumeration values and string array entries for Groups must be kept + consistent, in the same order and matching one-to-one. + +.. _structure_addkernel_feature-label: -Add a new Feature +Adding a feature ---------------------------- If a kernel is added that exercises a RAJA Feature that is not used in an -existing Kernel, a new value must be added to the ``FeatureID`` enum in the +existing kernel, a new value must be added to the ``FeatureID`` enum in the ``RAJAPerfSuite.hpp`` header file and an associated feature string name must be added to the ``FeatureNames`` string array in the ``RAJAPerfSuite.cpp`` source file. The process is similar to adding a new kernel ID and name -described above. Again, the enumeration values and string array entries must -be kept consistent, in the same order and matching one-to-one. +described above. + +.. note:: Enumeration values and string array entries for Features must be kept + consistent, in the same order and matching one-to-one. diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 49e09a602..ec7dda9ec 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -1,3 +1,4 @@ +// _add_header_start //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Copyright (c) 2017-23, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. @@ -75,3 +76,4 @@ class ADD : public KernelBase } // end namespace rajaperf #endif // closing endif for header file include guard +// _add_header_end From b5f7fc6fcdbc34b30cba1aef153c269b78df15a5 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 24 Feb 2023 15:07:37 -0800 Subject: [PATCH 041/137] Clean up and monir reorg --- docs/sphinx/dev_guide/index.rst | 1 + docs/sphinx/dev_guide/kernel_class.rst | 17 ++++++---- docs/sphinx/dev_guide/kernel_class_impl.rst | 35 +++++++++++++++++++++ src/stream/ADD.hpp | 2 -- 4 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 docs/sphinx/dev_guide/kernel_class_impl.rst diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index ce465ee35..c2c976ff3 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -31,3 +31,4 @@ practices that are specific to the RAJA Performance Suite. 
release_process structure kernel_class + kernel_class_impl diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst index c6cc74f28..0c3684158 100644 --- a/docs/sphinx/dev_guide/kernel_class.rst +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -50,15 +50,15 @@ files in the following sections: in a separate file helps to understand compiler optimizations when looking at generated assembly code, for example. +.. _kernel_class_header-label: + ------------------------- Kernel class header file ------------------------- -In its entirety, the **ADD** kernel class header file is: +In its entirety, the **ADD** kernel class header file ``ADD.hpp`` is: .. literalinclude:: ../../../src/stream/ADD.hpp - :start-after: _add_header_start - :end-before: _add_header_end :language: C++ The key ingredients of a kernel class header file are: @@ -76,8 +76,9 @@ The key ingredients of a kernel class header file are: the file. * **Macro definitions** that contain source lines of code that appear in - multiple places in the kernel class implementation. While it obfuscates - the code somewhat, it is useful to reduce the amount of code to maintain + multiple places in the kernel class implementation, such as setting + data pointers and operations in the kernel body. While macros obfuscate + the code somewhat, we use them to reduce the amount of code we maintain and ensure consistency. * **Class definition** derived from the ``KernelBase`` class. We describe @@ -112,5 +113,9 @@ the kernel class for kernel specific operations. Lastly, any data members used in the class implementation are defined, typically in a ``private`` member section so they don't *bleed* out of the -kernel class. +kernel class. For example, in the **ADD** class, we see data members for +GPU block sizes. Also, there are pointer members to hold data arrays for +the kernel. Here we have ``m_a``, m_b``, and ``m_c`` for the three arrays +used in the ADD kernel. Note that we use the convention to prefix class data +members with ``m_``. diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst new file mode 100644 index 000000000..9fff97cb9 --- /dev/null +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -0,0 +1,35 @@ +.. ## +.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## and RAJA Performance Suite project contributors. +.. ## See the RAJAPerf/LICENSE file for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _kernel_class_impl-label: + +============================= +Kernel Class Implementation +============================= + +Each kernel in the Suite follows a similar source file organization and +implementation pattern for consistency and ease of analysis and understanding. +Here, we describe important and conventions applies in each kernel class +implementation that must be followed to ensure that all kernels integrate into +the RAJA Performance Suite in the same way. + +.. _kernel_class_impl_gen-label: + +---------------------- +General class methods +---------------------- + +Class methods that do not execute kernel variants and which are not specific to +any kernel variant implementation are defined in one implementation file. For +the **ADD** kernel that we are describing this is the source file ``ADD.cpp``, +which in its entirety is: + +.. 
literalinclude:: ../../../src/stream/ADD.cpp + :start-after: _add_impl_gen_start + :end-before: _add_impl_gen_end + :language: C++ diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index ec7dda9ec..49e09a602 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -1,4 +1,3 @@ -// _add_header_start //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Copyright (c) 2017-23, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. @@ -76,4 +75,3 @@ class ADD : public KernelBase } // end namespace rajaperf #endif // closing endif for header file include guard -// _add_header_end From 9f40f2cfd99fac64e51c6d4df804610cfb7a9c3a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 24 Feb 2023 16:18:25 -0800 Subject: [PATCH 042/137] Add more content about kernel operations --- docs/sphinx/dev_guide/kernel_class_impl.rst | 64 ++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 9fff97cb9..6b6cb6339 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -26,10 +26,72 @@ General class methods Class methods that do not execute kernel variants and which are not specific to any kernel variant implementation are defined in one implementation file. For -the **ADD** kernel that we are describing this is the source file ``ADD.cpp``, +the **ADD** kernel that we are describing, this is the source file ``ADD.cpp``, which in its entirety is: .. literalinclude:: ../../../src/stream/ADD.cpp :start-after: _add_impl_gen_start :end-before: _add_impl_gen_end :language: C++ + +The methods in the source file are: + + * **Class constructor**, which calls the ``KernelBase`` class constructor + passing the ``KernelID`` and the ``RunParams`` object, which are used + to initialize the base class. The constructor calls other base class + methods to set information about the kernel, which is specific to the + kernel. Such information includes: + + * Default problem size and number of kernel repetitions to generate + execution run time. + * The actual problem size that will be run, which is a function of + the default size and command-line input. + * The number of *loop iterations* that are performed and the number of + loop kernels that run each time the kernel is executed. Note that the + **ADD** kernel is based on a simple, single for-loop. However, other + kernels in the Suite execute multiple loop kernels. + * The number of bytes read and written and the number of FLOPS performed + for each kernel execution. + * Which RAJA features the kernel exercises. + * Which Suite variants are defined, or implemented for the kernel. Each + variant requires a call to the ``setVariantDefined`` method. Note + that not every kernel implements every variant. So this is a mechanism + to account for what is being run for analysis proposes. + + * **Class destructor**, which must be provided to deallocate kernel state + that is allocated in the constructor and which persists throughout the + execution of the Suite. Note that in the case of the **ADD** kernel, the + destructor is empty since no state is dynamically allocated in the + constructor. + + * ``setUp`` method, which allocates and initializes data required for the + kernel to execute and produce results. 
+ + * ``tearDown`` method, which deallocates and resets any data that will be + re-allocated and/or initialized in subsequent kernel executions. + + * ``updateChecksum`` method, which computes a checksum from the results of + an execution of the kernel and adds it to the checksum value for the + variant and tuning index that was run. + +.. important:: There will only be one instance of each kernel class created + by the program. Thus, each kernel class constructor and + destructor must only perform operations that are not specific + to any kernel variant. + +The ``setUp``, ``tearDown``, and ``updateChecksum`` methods will be +called **each time a kernel variant is run**. We allocate and deallocate +data arrays in the ``setUp`` and ``tearDown`` methods to prevent any +performance timing bias that may be introduced by artificially reusing data +in cache for example, when doing performance experiments. + +Also, note that the ``setUp`` and ``tearDown`` methods pass a ``VariantID`` +value to data allocation and initialization, and deallocation methods so +this data management can be done in a variant-specific manner as needed. + +To simplify these operations and help ensure consistency, there exist utility +methods to allocate, initialize, deallocate, and copy data, and compute +checksums defined in the various *data utils* files in the ``common`` +directory. + + From 2a28fd1789acd570d20fa04e2e6d8a5be6798432 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 27 Feb 2023 10:22:04 -0800 Subject: [PATCH 043/137] Clarify GitLab CI machinery and remove unnecessary reference to source file comment delimiters --- docs/sphinx/dev_guide/ci.rst | 26 ++++++++++++++------- docs/sphinx/dev_guide/kernel_class_impl.rst | 2 -- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index 2db276d6e..231b00ee3 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -16,18 +16,28 @@ The RAJA Performance Suite project uses the same continuous integration tools as the RAJA project, namely Azure Pipelines and GitLab CI in the Livermore Computing Collaboration Zone (LC CZ). Please see `RAJA Continuous Integration Testing `_ for more information. -One important difference to note is that the RAJA Performance Suite inherits -most of its support for GitLab CI testing from its RAJA submodule. As a result, -submodules that support RAJA GitLab CI, such as -`Uberenv `_ and -`RADIUSS Spack Configs `_, -do not appear in the RAJA Performance Suite repository. - -The RAJA Performance Suite project does include files that support GitLab and +The RAJA Performance Suite project includes files that support GitLab and Azure Pipelines CI testing that are specific to the project. These file are similar to those in the RAJA project and play the same roles and follow the same structure as in the RAJA project. Such files are described in `RAJA Continuous Integration Testing `_. +An important difference to note between RAJA Performance Suite GitLab CI and +RAJA GitLab CI is that the RAJA Performance Suite inherits most of its support +from its RAJA submodule. As a result, submodules that support RAJA GitLab CI, +such as `Uberenv `_ and +`RADIUSS Spack Configs `_, +do not appear in the RAJA Performance Suite repository. 
Instead, the +``RAJAPerf/scripts/gitlab/build_and_test.sh`` script invokes the +``uberenv.py`` script in the RAJA submodule to drive Spack and the +``RAJAPerf/.uberenv_config.json`` file points to the RADIUSS Spack Configs +content in the RAJA submodule: + +.. literalinclude:: ../../../.uberenv_config.json + :language: json + +Like RAJA, the RAJA Performance Suite Spack package is maintained in the +RADIUSS Spack Configs project. + .. _ci_tasks-label: ****************************************************** diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 6b6cb6339..5fd0cdc5f 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -30,8 +30,6 @@ the **ADD** kernel that we are describing, this is the source file ``ADD.cpp``, which in its entirety is: .. literalinclude:: ../../../src/stream/ADD.cpp - :start-after: _add_impl_gen_start - :end-before: _add_impl_gen_end :language: C++ The methods in the source file are: From d18ea181bbe4330945b03cd8f1346f8878d71477 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 28 Feb 2023 14:56:44 -0800 Subject: [PATCH 044/137] Add content about sequential run methods --- docs/sphinx/dev_guide/kernel_class_impl.rst | 56 ++++++++++++++++++++- src/stream/ADD-Seq.cpp | 3 +- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 5fd0cdc5f..b0f628065 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -92,4 +92,58 @@ methods to allocate, initialize, deallocate, and copy data, and compute checksums defined in the various *data utils* files in the ``common`` directory. - +.. _kernel_class_impl_exec-label: + +------------------------- +Kernel execution methods +------------------------- + +In the discussion of the **ADD** :ref:`kernel_class-label`, we noted +that the class implementation involves multiple files containing variants for +each execution back-end. In particular, these files contain implementations of +the *run* methods declared in the **ADD** :ref:`kernel_class_header-label` +to execute the variants. + +Each method takes a variant ID argument that identifies the variant to run and +a tuning index that identifies the tuning of the variant to run. Note that the +tuning index can be ignored when there is only one tuning. Each method is +responsible for multiple tasks which involve a combination of kernel and +variant specific operations and calling kernel base class methods, such as: + + * Setting up and initializing data needed by a kernel variant before it is run + * Starting an execution timer before a kernel is run + * Running the proper number of kernel executions + * Stopping the time after the kernel is run + * Putting the class member data in an appropriate state to update a checksum + +For example, here is the method to run sequential CPU variants of the **ADD** +kernel: + +.. literalinclude:: ../../../src/stream/ADD.hpp + :start-after: _add_run_seq_start + :end-before: _add_run_seq_end + :language: C++ + +All kernel source files follow a similar organization and implementation +pattern for each set of back-end exeuction variants. + +.. important:: Following the established implementation patterns for kernels + in the Suite help to ensure that the code is consistent, + understandable, easily maintained, and needs minimal + documentation. 
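For readers who do not have the source tree at hand, the following condensed sketch shows the shape of the sequential run method being described. It is not the verbatim ``ADD-Seq.cpp`` source: the ``ADD_DATA_SETUP`` and ``ADD_BODY`` macros are expanded inline as plain code, and the ``Lambda_Seq`` case is omitted for brevity.

```cpp
// Condensed sketch of the sequential run method pattern (not the verbatim
// ADD-Seq.cpp source). Data pointers are normally set by the ADD_DATA_SETUP
// macro and the loop body by ADD_BODY; both are written out inline here.
void ADD::runSeqVariant(VariantID vid, size_t /*tune_idx*/)
{
  const Index_type run_reps = getRunReps();
  const Index_type ibegin = 0;
  const Index_type iend = getActualProblemSize();

  Real_ptr a = m_a;
  Real_ptr b = m_b;
  Real_ptr c = m_c;

  switch ( vid ) {

    case Base_Seq : {

      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
        for (Index_type i = ibegin; i < iend; ++i) {
          c[i] = a[i] + b[i];   // ADD_BODY
        }
      }
      stopTimer();

      break;
    }

#if defined(RUN_RAJA_SEQ)
    case RAJA_Seq : {

      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
        RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
          [=](Index_type i) {
            c[i] = a[i] + b[i];   // ADD_BODY
        });
      }
      stopTimer();

      break;
    }
#endif

    default : {
      getCout() << "\n  ADD : Unknown variant id = " << vid << std::endl;
    }

  }
}
```

The same skeleton (read run parameters, set up local data pointers, time a repetition loop, and switch on the variant ID) repeats across every kernel and back-end in the Suite.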
+ +A few items are worth noting: + + * Thee tuning index argument is ignored because there is only one tuning for + the sequential kernel variants. + * Execution parameters, such as kernel loop length and number of execution + repetitions, are set by calling base class methods which return values + based on kernel defaults and input parameters. This ensures that the + execution will be consistent across run variants and results will be + what is expected. + * Simple switch-case statement logic is used to execute the proper variant + based on the ``VariantID`` argument. + * Macros defined in the ``ADD.hpp`` header file are used to reduce the amount + of redundant code, such as for data initialization (``ADD_DATA_SETUP``) + and the kernel body (``ADD_BODY``). + diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 3b7db4445..106bf108d 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -17,7 +17,7 @@ namespace rajaperf namespace stream { - +// _add_run_seq_start void ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -85,6 +85,7 @@ void ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } } +// _add_run_seq_start } // end namespace stream } // end namespace rajaperf From 1c0abc5bce50d50447791db344397b831e5b9d6e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 1 Mar 2023 16:10:05 -0800 Subject: [PATCH 045/137] More kernel implementation content. --- docs/sphinx/dev_guide/kernel_class.rst | 2 +- docs/sphinx/dev_guide/kernel_class_impl.rst | 53 ++++++++++++++++----- src/stream/ADD-Cuda.cpp | 4 +- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst index 0c3684158..952e30cf3 100644 --- a/docs/sphinx/dev_guide/kernel_class.rst +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -47,7 +47,7 @@ files in the following sections: here, that variant should be placed in a file named to clearly distinguish the back-end implementation, such as ``ADD-.cpp``. Keeping the variants for each back-end - in a separate file helps to understand compiler optimizations + in a separate file helps to understand compiler optimization when looking at generated assembly code, for example. .. _kernel_class_header-label: diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index b0f628065..dfd9fc2c4 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -83,6 +83,12 @@ data arrays in the ``setUp`` and ``tearDown`` methods to prevent any performance timing bias that may be introduced by artificially reusing data in cache for example, when doing performance experiments. +.. important:: The checksum must be computed in the same way for each variant + of a kernel so that checksums for different variants can be + compared to help identify differences, and potential errors in + implementations, compiler optimizations, programming model + execution, etc. + Also, note that the ``setUp`` and ``tearDown`` methods pass a ``VariantID`` value to data allocation and initialization, and deallocation methods so this data management can be done in a variant-specific manner as needed. 
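As a concrete illustration of how these three methods typically fit together, here is a minimal sketch for a simple kernel such as **ADD**. The helper names (``allocAndInitData``, ``allocAndInitDataConst``, ``calcChecksum``, ``deallocData``) and the ``checksum`` member indexing are assumptions based on the data utilities described above; the exact names and signatures in the Suite may differ.

```cpp
// Minimal sketch (assumed helper names; see the data utils in src/common).
void ADD::setUp(VariantID vid, size_t /*tune_idx*/)
{
  // Allocate and initialize the arrays the kernel reads and writes.
  allocAndInitData(m_a, getActualProblemSize(), vid);
  allocAndInitData(m_b, getActualProblemSize(), vid);
  allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid);
}

void ADD::updateChecksum(VariantID vid, size_t tune_idx)
{
  // Same computation for every variant so results can be compared.
  checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize());
}

void ADD::tearDown(VariantID vid, size_t /*tune_idx*/)
{
  // Release everything allocated in setUp so the next variant starts clean.
  deallocData(m_a);
  deallocData(m_b);
  deallocData(m_c);
}
```

Because the same checksum reduction is applied to the same output array for every variant, checksum differences between variants point directly at implementation, compiler, or programming model issues.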
@@ -117,22 +123,14 @@ variant specific operations and calling kernel base class methods, such as: * Putting the class member data in an appropriate state to update a checksum For example, here is the method to run sequential CPU variants of the **ADD** -kernel: +kernel in the ``ADD-Seq.cpp`` file: -.. literalinclude:: ../../../src/stream/ADD.hpp +.. literalinclude:: ../../../src/stream/ADD-Seq.hpp :start-after: _add_run_seq_start :end-before: _add_run_seq_end :language: C++ -All kernel source files follow a similar organization and implementation -pattern for each set of back-end exeuction variants. - -.. important:: Following the established implementation patterns for kernels - in the Suite help to ensure that the code is consistent, - understandable, easily maintained, and needs minimal - documentation. - -A few items are worth noting: +A few details are worth noting: * Thee tuning index argument is ignored because there is only one tuning for the sequential kernel variants. @@ -147,3 +145,36 @@ A few items are worth noting: of redundant code, such as for data initialization (``ADD_DATA_SETUP``) and the kernel body (``ADD_BODY``). +All kernel source files follow a similar organization and implementation +pattern for each set of back-end execution variants. However, there are some +important differences to note that we describe next in the discussion of +the CUDA variant execution file. + +The key contents related to execution of CUDA GPU variants of the **ADD** +kernel in the ``ADD-Cuda.cpp`` file are: + +.. literalinclude:: ../../../src/stream/ADD-Cuda.cpp + :start-after: _add_run_cuda_start + :end-before: _add_run_cuda_end + :language: C++ + +Notable differences with the sequential variant file are: + + * In addition to using the ``ADD_DATA_SETUP`` macro, which is also used + in the sequential variant implementation file discussed above, we + define two other macros, ``ADD_DATA_SETUP_CUDA`` and + ``ADD_DATA_TEARDOWN_CUDA``. These macros allocate GPU device data and + initialize it by copying host CPU data to it, and copy data back to the + host and deallocate the device data, respectively. + * A CUDA GPU kernel ``add`` is implemented for the ``Base_CUDA`` variant. + * The ``block_size`` template parameter for the ``ADD::runCudaVariantImpl`` + method represents the *tuning parameter*. + * The ``RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE`` macro is + used to generate different kernel tunings that use the GPU thread-block + sizes specified via command-line input mentioned in :ref:`build_build-label`. + +.. important:: Following the established implementation patterns for kernels + in the Suite help to ensure that the code is consistent, + understandable, easily maintained, and needs minimal + documentation. 
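The tuning boilerplate macro mentioned above is easiest to understand through a simplified picture of what it effectively generates: a dispatch method that maps each runtime tuning index to one compiled ``block_size`` instantiation, plus a method that registers a tuning name for each size. The sketch below is illustrative only; it assumes a fixed list of block sizes and a naming helper (``addVariantTuningName``), whereas the real macro iterates over the block-size list chosen at configure time.

```cpp
// Illustrative only: roughly what the tuning boilerplate macro provides.
// The real macro derives the block-size list from the build configuration.
void ADD::runCudaVariant(VariantID vid, size_t tune_idx)
{
  size_t t = 0;

  if (tune_idx == t++) { runCudaVariantImpl<128>(vid); return; }
  if (tune_idx == t++) { runCudaVariantImpl<256>(vid); return; }
  if (tune_idx == t++) { runCudaVariantImpl<512>(vid); return; }

  getCout() << "\n  ADD : Unknown Cuda tuning index = " << tune_idx << std::endl;
}

void ADD::setCudaTuningDefinitions(VariantID vid)
{
  // One tuning name per compiled block size, in the same order as above.
  addVariantTuningName(vid, "block_128");
  addVariantTuningName(vid, "block_256");
  addVariantTuningName(vid, "block_512");
}
```

Keeping the dispatch order and the name-registration order identical is what makes a tuning index reported in the output correspond to the block size that is actually launched.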
+ diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 93ac53971..5df067a94 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -21,6 +21,7 @@ namespace rajaperf namespace stream { +// _add_run_cuda_start #define ADD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -35,7 +36,7 @@ namespace stream template < size_t block_size > __launch_bounds__(block_size) __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, - Index_type iend) + Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { @@ -112,6 +113,7 @@ void ADD::runCudaVariantImpl(VariantID vid) } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ADD, Cuda) +// _add_run_cuda_end } // end namespace stream } // end namespace rajaperf From 1128ba691aa58c656148e1c5c0d455bc21916e62 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 1 Mar 2023 16:24:35 -0800 Subject: [PATCH 046/137] Fix some formatting and file inclusion issues --- docs/sphinx/dev_guide/kernel_class.rst | 4 ++-- docs/sphinx/dev_guide/kernel_class_impl.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst index 952e30cf3..81b7d2ec5 100644 --- a/docs/sphinx/dev_guide/kernel_class.rst +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -72,8 +72,8 @@ The key ingredients of a kernel class header file are: kernel as it appears in the original code in which it was taken. This is helpful to understand the origin and intent of the original. - * **Uniquely-named header file include guard, that contains the contents of - the file. + * **Uniquely-named include guard** that guards the contents of + the header file. * **Macro definitions** that contain source lines of code that appear in multiple places in the kernel class implementation, such as setting diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index dfd9fc2c4..1e86989ee 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -125,7 +125,7 @@ variant specific operations and calling kernel base class methods, such as: For example, here is the method to run sequential CPU variants of the **ADD** kernel in the ``ADD-Seq.cpp`` file: -.. literalinclude:: ../../../src/stream/ADD-Seq.hpp +.. 
literalinclude:: ../../../src/stream/ADD-Seq.cpp :start-after: _add_run_seq_start :end-before: _add_run_seq_end :language: C++ From eeecffc676cda4dd59ca4991ae6dd3f44a6586e8 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 1 Mar 2023 16:30:00 -0800 Subject: [PATCH 047/137] Fix broken code inclusion label --- src/stream/ADD-Seq.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 106bf108d..eb47fe3f0 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -85,7 +85,7 @@ void ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } } -// _add_run_seq_start +// _add_run_seq_end } // end namespace stream } // end namespace rajaperf From 09c5418851675c51d38f88419d0a0b96671b4252 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 1 Mar 2023 16:46:40 -0800 Subject: [PATCH 048/137] Expand implementation discussion --- docs/sphinx/dev_guide/kernel_class_impl.rst | 10 ++++++++++ src/stream/ADD-Cuda.cpp | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 1e86989ee..61113e087 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -141,6 +141,11 @@ A few details are worth noting: what is expected. * Simple switch-case statement logic is used to execute the proper variant based on the ``VariantID`` argument. + * We guard sequential variants apart from the ``Base_Seq`` variant with + the ``RUN_RAJA_SEQ`` macro. This ensures that the base sequential variant + will always run to be used as a reference variant for execution timing. + By default, we turn off the other sequential variants when we build an + executable with OpenMP target offload enabled. * Macros defined in the ``ADD.hpp`` header file are used to reduce the amount of redundant code, such as for data initialization (``ADD_DATA_SETUP``) and the kernel body (``ADD_BODY``). @@ -160,6 +165,11 @@ kernel in the ``ADD-Cuda.cpp`` file are: Notable differences with the sequential variant file are: + * Most of the file is guarded using the ``RAJA_ENABLE_CUDA`` macro. + + ..note:: The contents of all non-sequential variant implementation files + are guarded using the ``RAJA_ENABLE_`` macros. + * In addition to using the ``ADD_DATA_SETUP`` macro, which is also used in the sequential variant implementation file discussed above, we define two other macros, ``ADD_DATA_SETUP_CUDA`` and diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 5df067a94..574ec32a4 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -6,6 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// _add_run_cuda_start #include "ADD.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +22,6 @@ namespace rajaperf namespace stream { -// _add_run_cuda_start #define ADD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -113,10 +113,10 @@ void ADD::runCudaVariantImpl(VariantID vid) } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ADD, Cuda) -// _add_run_cuda_end } // end namespace stream } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA +// _add_run_cuda_end From 712a919b6dded89d8db225d26ccc0a1f3552da4c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 09:55:08 -0800 Subject: [PATCH 049/137] Finish kernel implementation disucssion. 
--- docs/sphinx/dev_guide/kernel_class_impl.rst | 68 ++++++++++++++----- docs/sphinx/dev_guide/structure.rst | 75 +++++++++++++++++++-- 2 files changed, 123 insertions(+), 20 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 61113e087..6a414cbf1 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -68,20 +68,13 @@ The methods in the source file are: * ``tearDown`` method, which deallocates and resets any data that will be re-allocated and/or initialized in subsequent kernel executions. - * ``updateChecksum`` method, which computes a checksum from the results of - an execution of the kernel and adds it to the checksum value for the - variant and tuning index that was run. - -.. important:: There will only be one instance of each kernel class created - by the program. Thus, each kernel class constructor and - destructor must only perform operations that are not specific - to any kernel variant. +.. important:: The ``tearDown`` method frees and/or resets all kernel data that + is allocated and/or initialized in the ``setUp`` method. -The ``setUp``, ``tearDown``, and ``updateChecksum`` methods will be -called **each time a kernel variant is run**. We allocate and deallocate -data arrays in the ``setUp`` and ``tearDown`` methods to prevent any -performance timing bias that may be introduced by artificially reusing data -in cache for example, when doing performance experiments. + * ``updateChecksum`` method, which computes a checksum from the results of + an execution of the kernel and adds it to the checksum value, which is a + member of the ``KernelBase`` class, for the variant and tuning index that + was run. .. important:: The checksum must be computed in the same way for each variant of a kernel so that checksums for different variants can be @@ -89,15 +82,58 @@ in cache for example, when doing performance experiments. implementations, compiler optimizations, programming model execution, etc. -Also, note that the ``setUp`` and ``tearDown`` methods pass a ``VariantID`` -value to data allocation and initialization, and deallocation methods so +The ``setUp``, ``tearDown``, and ``updateChecksum`` methods are +called **each time a kernel variant is run**. We allocate and deallocate +data arrays in the ``setUp`` and ``tearDown`` methods to prevent any +performance timing bias that may be introduced by artificially reusing data +in cache, for example, when doing performance experiments. Also, note that +the ``setUp`` and ``tearDown`` methods take a ``VariantID`` argument and pass +it to data allocation, initialization, and deallocation methods so this data management can be done in a variant-specific manner as needed. To simplify these operations and help ensure consistency, there exist utility methods to allocate, initialize, deallocate, and copy data, and compute checksums defined in the various *data utils* files in the ``common`` directory. - + +--------------------------- +Kernel object construction +--------------------------- + +It is important to note that there will only be one instance of each kernel +class created by the program. Thus, each kernel class constructor and +destructor must only perform operations that are not specific to any kernel +variant. + +The ``Executor`` class in the ``common`` directory creates kernel objects, +one for each kernel that will be run based on command-line input options. 
To +ensure a new kernel object will be created properly, add a call to its class +constructor based on its ``KernelID`` in the ``getKernelObject()`` method in +the ``RAJAPerfSuite.cpp`` file. For example:: + + KernelBase* getKernelObject(KernelID kid, + const RunParams& run_params) + { + KernelBase* kernel = 0; + + switch ( kid ) { + + ... + + case Stream_ADD : { + kernel = new stream::ADD(run_params); + break; + } + + ... + + } // end switch on kernel id + + return kernel; + } + + } + .. _kernel_class_impl_exec-label: ------------------------- diff --git a/docs/sphinx/dev_guide/structure.rst b/docs/sphinx/dev_guide/structure.rst index d42e703c6..5c25ef2a2 100644 --- a/docs/sphinx/dev_guide/structure.rst +++ b/docs/sphinx/dev_guide/structure.rst @@ -23,13 +23,10 @@ directory, you will see the following:: $ ls -c1 -F lcals/ stream/ - stream-kokkos/ rajaperf_config.hpp.in polybench/ - lcals-kokkos/ common/ basic/ - basic-kokkos/ apps/ algorithm/ RAJAPerfSuiteDriver.cpp @@ -61,7 +58,7 @@ Adding a kernel to the Suite involves five main steps: #. Implement a kernel class that defines all operations needed to integrate the kernel into the Suite. This includes adding the kernel class header file and source files that contain kernel variant implementations. -#. Add appropriate targets to ``CMakeLists.txt`` files, where needed so +#. Add appropriate targets to ``CMakeLists.txt`` files, where needed, so that the new kernel code will be compiled when the Suite is built. We describe the steps in the following sections. @@ -160,7 +157,77 @@ described above. .. note:: Enumeration values and string array entries for Features must be kept consistent, in the same order and matching one-to-one. +.. _structure_addvariant-label: +================ +Adding a Variant +================ + +Similar to a Kernel, each Variant in the Suite is is identified by an +enumeration value and a string name. Adding a new variant requires adding +these two items in a similar fashion to adding a kernel name and ID as +described above. + +Adding a variant to the Suite involves four main steps: + +#. Add a unique variant ID and a unique variant name to the Suite. +#. Add the pure virtual method to execute the variant to the ``KernelBase`` + class header file. For example:: + + virtual void runVariant(VariantID vid, size_t tune_idx) = 0; + +#. For the kernel(s) to which the variant applies, provide kernel variant + implementations in associated ``-.cpp`` files. +#. Add appropriate targets to ``CMakeLists.txt`` files, where needed, so + that the new kernel variant code will be compiled when the Suite is built. + +We describe the steps in the following sections. + +.. _structure_addvariant_name-label: +Adding a variant ID and name +---------------------------- + +Variant IDs and names are maintained in the files ``RAJAPerfSuite.hpp`` and +``RAJAPerfSuite.cpp``, respectively, which reside in the ``src/common`` +directory. Adding a new variant ID and name is essentially the same as +adding a kernel ID and name, which is described in +:ref:`structure_addkernel_name-label`. + +.. note:: A variant string name is just a string version of the variant ID. enum value label. This convention must be followed so that each + variant works properly within the RAJA Performance Suite + machinery. Also, the values in the VariantID enum and the + strings in the VariantNames array must be kept consistent + (i.e., same order and matching one-to-one). + +.. 
_structure_addvariant_impl-label: + +Adding kernel variant implementations +------------------------------------- + +In the classes containing kernels to which a new variant applies, add +implementations for the variant in kernel execution methods in files named +``-.cpp``. This is described in detail in +:ref:`kernel_class_impl_exec-label`. + +.. note:: Make sure to enable the variant for those kernels in the kernel + class constructors by calling the ``KernelBase`` class method + ``setVariantDefined(VariantID vid))`` so that the variant can be + run. + +.. _structure_addtuning-label: + +================ +Adding a Tuning +================ +For kernels to which a new tuning applies, add implementations for the tuning +in the kernel execution and tuning naming methods as needed. Note that the +tuning indices are determined by the order that the tuning names are added +in the ``setTuningDefinitions(VariantID vid)`` method which is +virtual in the ``KernelBase`` class. +.. note:: ``runVariant(VariantID vid, size_t tune_idx) methods + should have similar logic to the + ``setTuningDefinitions(VariantID vid)`` method so that + the correct tuning will be run based on the index. From 4ef26f01a763f7a5ff5fe61bea4385ebbffeca4e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 12:51:13 -0800 Subject: [PATCH 050/137] Replace older documentation in README file with new RTD content. --- README.md | 865 +++--------------------------------------------------- 1 file changed, 37 insertions(+), 828 deletions(-) diff --git a/README.md b/README.md index 9049cb1ac..a0a9723d2 100644 --- a/README.md +++ b/README.md @@ -6,844 +6,55 @@ [comment]: # (# SPDX-License-Identifier: BSD-3-Clause) [comment]: # (#################################################################) +# RAJA + RAJA Performance Suite ====================== -[![Build Status](https://travis-ci.org/LLNL/RAJAPerf.svg?branch=develop)](https://travis-ci.org/LLNL/RAJAPerf) - -The RAJA Performance Suite is designed to explore performance of loop-based -computational kernels found in HPC applications. Specifically, it can be -used to assess and monitor runtime performance of kernels implemented using -[RAJA] C++ performance portability abstractions and compare those to variants -implemented using common parallel programming models, such as OpenMP and CUDA, -directly. Some important terminology used in the Suite includes: - - * `Kernel` is a distinct loop-based computation that appears in the Suite in - multiple variants (or implementations), each of which performs the same - computation. - * `Variant` is an implementation or set of implementations of a kernel in the - Suite that share the same approach/abstraction and programming model, - such as baseline OpenMP, RAJA OpenMP, etc. - * `Tuning` is a particular implementation of a variant of a kernel in the - Suite, such as gpu block size 128, gpu block size 256, etc. - * `Group` is a collection of kernels in the Suite that are grouped together - because they originate from the same source, such as a specific benchmark - suite. - -Each kernel in the Suite appears in multiple RAJA and non-RAJA (i.e., baseline) -variants using parallel programming models that RAJA supports. Some kernels have -multiple tunings of a variant to explore some of the parametrization that the -programming model supports. The kernels originate from various HPC benchmark -suites and applications. 
For example, the "Stream" group contains kernels from -the Babel Stream benchmark, the "Apps" group contains kernels extracted from -real scientific computing applications, and so forth. - -The suite can be run as a single process or with multiple processes when -configured with MPI support. Running with MPI in the same configuration used -by an hpc app allows the suite to gather performance data that is more relevant -for that hpc app than performance data gathered running single process. For -example running sequentially with one MPI rank per core vs running sequentially -with a single process yields different performance results on most multi-core -CPUs. - -* * * - -Table of Contents -================= - -1. [Building the Suite](#building-the-suite) -2. [Running the Suite](#running-the-suite) -3. [Generated output](#generated-output) -4. [Adding kernels and variants](#adding-kernels-and-variants) -4. [Continuous Integration](#continuous-integration) -5. [Contributions](#contributions) -6. [Authors](#authors) -7. [Copyright and Release](#copyright-and-release) - -* * * - -# Building the Suite - -To build the Suite, you must first obtain a copy of the source code by cloning -the GitHub repository. For example, - -``` -> mkdir RAJA-PERFSUITE -> cd RAJA-PERFSUITE -> git clone --recursive https://github.com/llnl/RAJAPerf.git -``` - -The repository will reside in a `RAJAPerf` sub-directory in the directory into -which is was cloned. - -The Performance Suite has two Git submodules, [RAJA] and the CMake-based [BLT] -build system. The `--recursive` option tells Git to clone the submodules -as well as any submodules that they use. If you switch to a different branch -in your working copy of the repository, you should update the submodules to -make sure you have the right versions of them for the branch. For example, - -``` -> cd RAJAPerf -> git checkout -> git submodule update --recursive -``` - -Note that the `--recursive` option will update submodules within submodules, -similar to usage with the `git clone` as described above. - -RAJA and the Performance Suite are built together using the same CMake -configuration. For convenience, we include scripts in the `scripts` -directory that invoke corresponding configuration files (CMake cache files) -in the RAJA submodule. For example, the `scripts/lc-builds` directory -contains scripts that show how we build code for testing on platforms in -the Lawrence Livermore Computing Center. Each build script creates a -descriptively-named build space directory in the top-level Performance Suite -directory and runs CMake with a configuration appropriate for the platform and -compilers used. After CMake completes, enter the build directory and type -`make` (or `make -j ` for a parallel build using N processor cores; if you -omit the number of cores, the code will build in parallel using all available -cores on the node you are running on) to compile the code. For example, - -``` -> ./scripts/blueos_nvcc_clang.sh 10.2.89 sm_70 10.0.1 -> cd build_blueos_nvcc10.2.89-cm_70-clang10.0.1 -> make -j -``` - -The build scripts and associated CMake `host-config` files in RAJA are -useful sources of information for building the Suite on various platforms. -For example, they show how to enable specific back-end kernel variants and -compiler options we use for testing. - -You can also create your own build directory and run CMake with your own -options from there. 
For example, : - -``` -> mkdir my-build -> cd my-build -> cmake ../ -> make -j -``` - -The provided configurations will only build the Performance Suite code by -default; i.e., it will not build any RAJA test or example codes. If you -want to build the RAJA tests, for example, to verify your build of RAJA is -working properly, just pass the `-DENABLE_TESTS=On` option to CMake, either -on the command line if you run CMake directly or edit the script you are -running to do this. Then, when the build completes, you can type `make test` -to run the RAJA tests. - -## Building with MPI - -Some of the provided configurations will build the Performance Suite with -MPI support enabled. For example, - -``` -> ./scripts/blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 sm_70 10.0.1 -> cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-sm_70-clang10.0.1 -> make -j -``` - -In general MPI support can be enabled by passing the `-DENABLE_MPI=On` option -to CMake and providing a mpi compiler wrapper via the -`-DMPI_CXX_COMPILER=/path/to/mpic++` option to CMake in addition to other CMake -options. For example, - -``` -> mkdir my-mpi-build -> cd my-mpi-build -> cmake -DENABLE_MPI=On -DMPI_CXX_COMPILER=/path/to/mpic++ ../ -> make -j -``` - -## Building with specific GPU block size tunings - -Some of the provided configurations will build the Performance Suite with -GPU support enabled. This will build with the default GPU block size tuning for -each kernel. For example, - -``` -> ./scripts/blueos_nvcc_clang.sh 10.2.89 sm_70 10.0.1 -> cd build_lc_blueos-nvcc10.2.89-sm_70-clang10.0.1 -> make -j -``` - -Using a specific set of GPU block sizes is done by by passing the -`-DRAJA_PERFSUITE_GPU_BLOCKSIZES=` option to CMake via the -`-DMPI_CXX_COMPILER=/path/to/mpic++` option to CMake in addition to other CMake -options. For example, - -``` -> mkdir my-gpu-build -> cd my-gpu-build -> cmake -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 ../ -> make -j -``` - -* * * - -# Running the Suite - -The Suite is run by invoking the executable in the `bin` sub-directory in the -build space directory. For example, if you provide no command line options, - -``` -> ./bin/raja-perf.exe -``` - -the entire Suite (all kernels and variants) will execute in their default -configurations. How the Suite will run and some details about each kernel -will appear on the screen before it is run. Kernel detail information will -also appear in a run report file generated in your run directory -after the Suite executes. You can pass the ''--dryrun'' option along with -any other runtime options to see a summary of how the Suite will execute -without actually running it. - -The Suite can be run in a variety of ways via options passed to the executable. -For example, you can run subsets of kernels and variants by specifying -variants, groups, or individual kernels explicitly. Other configuration -options to set problem sizes, number of times each kernel is run, etc. can -also be specified. You build the code once and use scripts or other mechanisms -to run the Suite in different ways for analyses you want to perform. - -All options appear in a 'long form' with a double hyphen prefix (i.e., '--'). -Some options are available in a one or two character 'short form' with a -single hyphen prefix (i.e., '-') for convenience. 
To see available options -along with a brief description of each, pass the `--help` or `-h` option: - -``` -> ./bin/raja-perf.exe --help -``` - -or - -``` -> ./bin/raja-perf.exe -h -``` - -Lastly, the program will generate a summary of provided input if it is given -input that the code does not know how to parse. Ill-formed input will be noted -in the summary output. Hopefully, this will make it easy for users to correct -erroneous usage, such as mis-spelled option names. - -## Running with MPI - -Running the Suite with MPI is as simple as running any other MPI application. -For example, - -``` -> srun -n 2 ./bin/raja-perf.exe -``` -the entire Suite (all kernels and variants) will execute in their default -configurations on each of the 2 ranks. The kernel information output shows how -each kernel is run on each rank. The total problem size across all MPI ranks -can be calculated by multiplying the number of MPI ranks by the problem -size in the kernel information. Timing is reported on rank 0 and is gathered -by doing an MPI barrier, starting the timer, running the kernel repetitions, -doing an MPI barrier, and then stopping the timer. - -## Important note - - * The OpenMP target offload variants of the kernels in the Suite are a - work-in-progress since the RAJA OpenMP target offload back-end is also - a work-in-progress. If you configure them to build, they can be run with - the executable `./bin/raja-perf-omptarget.exe` which is distinct from - the one described above. At the time the OpenMP target offload variants were - developed, it was not possible for them to co-exist in the same executable - as the CUDA variants, for example. In the future, the build system may - be reworked so that the OpenMP target variants can be run from the same - executable as the other variants. - -* * * - -# Generated output - -When the Suite is run, several output files are generated that contain -data describing the run. The file names start with the file prefix -provided via a command line option in the output directory, also specified -on the command line. If no such options are provided, files will be located -in the current run directory and be named `RAJAPerf-*`, where '*' is a string -indicating the contents of the file. - -Currently, there are five files generated: - -1. Timing -- execution time (sec.) of each loop kernel and variant run -2. Checksum -- checksum values for each loop kernel and variant run to ensure they are producing the same results (typically, checksum differences of ~1e-10 or less indicate that all kernel variants ran correctly). -3. Speedup -- runtime speedup of each loop kernel and variant with respect to a reference variant. The reference variant can be set with a command line option. If not specified, the first variant run will be used as the reference. The reference variant used will be noted in the file. -4. Figure of Merit (FOM) -- basic statistics about speedup of RAJA variant vs. baseline for each programming model run. Also, when a RAJA variant timing differs from the corresponding baseline variant timing by more than some tolerance, this will be noted in the file with `OVER_TOL`. By default the tolerance is 10%. This can be changed via a command line option. -5. Kernel -- Basic information about each kernel that is run, which is the same -for each variant of the kernel that is run. See description of output information below. - -All output files are text files. 
Other than the checksum file, all are in -'csv' format for easy processing by common tools and generating plots. - -## Kernel information definitions - -Information about kernels that are run is located in the ''RAJAPerf-kernels.csv'' file. This information is for each process individually, so when running with MPI the total problem size aggregated across all ranks is the number of ranks times the problem size shown in the kernel information. Kernel information includes the following: - -1. Kernel name -- Format is group name followed by kernel name, separated by an underscore. -2. Feature -- RAJA feature(s) exercised in RAJA variants of kernel. -3. Problem size -- Size of the problem represented by a kernel. Please see notes below for more information. -4. Reps -- Number of times a kernel runs in a single pass through the Suite. -5. Iterations/rep -- Sum of sizes of all parallel iteration spaces for all loops run in a single kernel execution (see 'problem size' above). -6. Kernels/rep -- total number of loop structures run (or GPU kernels launched) in each kernel repetition. -7. Bytes/rep -- Total number of bytes read from and written to memory for each repetition of kernel. -8. FLOPs/rep -- Total number of floating point operations executed for each repetition of kernel. Currently, we count arithmetic operations (+, -, *, /) and functions, such as exp, sin, code, etc. as on FLOP. We do not currently count operations like abs and comparisons (<, >, etc.) in the FLOP count. So these numbers are rough estimates. For actual FLOP counts, a performancce analysis tool should be used. - -### Notes about 'problem size' - - * Problem size is always output per process/MPI rank. To get the total problem - size across all ranks when running with MPI multiply the problem size by - the number of MPI ranks. - * The Suite uses three notions of problem size for each kernel: 'default', - 'target', and 'actual'. Default is the 'default' problem size defined for a - kernel and the size that will be run if no runtime options are - provided to run a different size. Target is the desired problem size to run - based on default settings and alterations to that if input is provided to - change the default. Actual is the problem size that is run based on how - each kernel calculates this. - * The concept of problem size is subjective and can be interpreted differently - depending on the kernel structure and what one is trying to measure. For - example, problem size could refer to the amount of data needed to be stored - in memory to run the problem, or it could refer to the amount of parallel - work that is possible, etc. - * We employ the following, admittedly loose definition, which depends on the - particular kernel structure. Of all the 'loop structures' (e.g., single - loop, nested loops, etc.) that are run for a kernel (note that some kernels - run multiple loops, possibly with different sizes or loop structures), - problem size refers to the size of the data set required to generate the - kernel result. The interpretation of this and the definition of problem - size for each kernel in the suite is determined by the kernel developer - and team discussion. - -Here are a few examples to give a better sense of how we determine problem -size for various kernels in the Suite. - -Vector addition. -```cpp -for (int i = 0; i < 0; i < N; ++i) { - c[i] = a[i] + b[i]; -} -``` -The problem size for this kernel is 'N', the loop length. 
Note that this -happens to match the size of the vectors a, b, c and the total amount of -parallel work in the kernel. This is common for simple, data parallel kernels. - -Matrix-vector multiplication. -```cpp -for (int r = 0; r < N_r; ++r) { - b[r] = 0; - for (int c = 0; c < N_c; ++c) { - b[r] += A[r][c] + x[c]; - } -} -``` -The problem size if N_r * N_c, the size of the matrix. Note that this matches -the total size of the problem iteration space, but the total amount of parallel -work is N_r, the number of rows in the matrix and the length of the vector b. - -Matrix-matrix multiplication. -```cpp -for (int i = 0; i < N_i; ++i) { - for (int j = 0; j < N_j; ++j) { - A[i][j] = 0; - for (int k = 0; k < N_k; ++k) { - A[i][j] += B[i][k] * C[k][j]; - } - } -} -``` -Here, we are multiplying matrix B (N_i x N_k) and matrix C (N_k x N_j) and -storing the result in matrix A (N_i X N_j). Problem size could be chosen to -be the maximum number of entries in matrix B or C. We choose the size of -matrix A (N_i * N_j), which is more closely aligned with the number of -independent operations (i.e., the amount of parallel work) in the kernels. - - -* * * - -# Adding kernels, variants, and Tunings - -This section describes how to add new kernels, variants and/or tunings to the -Suite. *Group* and *feature* modifications are not required unless a new group -or exercised RAJA feature is added when a new kernel is introduced. The -information in this section also provides insight into how the performance -Suite operates. - -It is essential that the appropriate targets are updated in the appropriate -`CMakeLists.txt` files when files are added to the Suite so that they will -be compiled. - -## Adding a kernel - -Adding a new kernel to the Suite involves three main steps: - -1. Add a unique kernel ID and a unique kernel name to the Suite. -2. If the kernel is part of a new kernel group or exercises a new RAJA feature, also add a unique group ID and name for the group. Similarly, if a new RAJA -feature is exercised by a new kernel. -3. Implement a kernel class that contains all operations needed to run it, with source files organized as described below. - -These steps are described in the following sections. - -### Add the kernel ID and name - -Two key pieces of information identify a kernel: the group in which it -resides and the name of the kernel itself. For concreteness, we describe -how to add a kernel "FOO" that lives in the kernel group "Basic". The files -`RAJAPerfSuite.hpp` and `RAJAPerfSuite.cpp` in the `src/common` directory -define enumeration values and arrays of string names for the kernels, -respectively. - -First, add an enumeration value identifier for the kernel, that is unique -among all kernels, in the enum 'KernelID' in the header file `RAJAPerfSuite.hpp`: - -```cpp -enum KernelID { -.. - Basic_FOO, -.. -}; -``` - -Note: the enumeration value for the kernel is the group name followed -by the kernel name, separated by an underscore. It is important to follow -this convention so that the kernel works properly with the Performance -Suite machinery. - -Second, add the kernel name to the array of strings `KernelNames` in the file -`RAJAPerfSuite.cpp`: - -```cpp -static const std::string KernelNames [] = -{ -.. - std::string("Basic_FOO"), -.. -}; -``` - -Note: the kernel string name is just a string version of the kernel ID. -This convention must be followed so that the kernel works properly with the -Performance Suite machinery. 
Also, the values in the KernelID enum and the -strings in the `KernelNames` array must be kept consistent (i.e., same order -and matching one-to-one). Within each kernel group, we keep the kernel names -and IDs in alphabetical order to make the organization clear. - - -### Add new group if needed - -If a kernel is added as part of a new group of kernels in the Suite, a -new value must be added to the `GroupID` enum in the header file -`RAJAPerfSuite.hpp` and an associated group string name must be added to -the `GroupNames` array of strings in the file `RAJAPerfSuite.cpp`. Again, -the enumeration values and items in the string array must be kept -consistent (same order and matching one-to-one). - -Adding a new RAJA feature is similar. - - -### Add the kernel class - -Each kernel in the Suite is implemented in a class whose header and -implementation files live in the directory named for the group -in which the kernel lives. The kernel class is responsible for implementing -all operations needed to manage data, execute, and record execution timing and -checksum information for each variant and tuning of the kernel. To properly plug -in to the Performance Suite framework, the kernel class must be a subclass of -the `KernelBase` base class that defines the interface for kernels in the Suite. - -Continuing with our example, we add a 'FOO' class header file `FOO.hpp`, -and multiple implementation files described in the following sections: - - * `FOO.cpp` contains the methods to setup and teardown the memory for the - 'FOO' kernel, and compute and record a checksum on the result after it - executes. It also specifies kernel information in the kernel class - constructor. - * `FOO-Seq.cpp` contains sequential CPU variants and tunings of the kernel. - * `FOO-OMP.cpp` contains OpenMP CPU multithreading variants and tunings of the kernel. - * `FOO-OMPTarget.cpp` contains OpenMP target offload variants and tunings of the kernel. - * `FOO-Cuda.cpp` contains CUDA GPU variants and tunings of the kernel. - * `FOO-Hip.cpp` contains HIP GPU variants and tunings of the kernel. - -All kernels in the Suite follow the same implementation pattern. Inspect the -files for any kernel to understand the overall organization. +[![Azure Piepline Build Status](https://dev.azure.com/llnl/RAJAPerf/_apis/build/status/LLNL.RAJAPerf?branchName=develop)](https://dev.azure.com/llnl/RAJAPerf/_build/latest?definitionId=1&branchName=develop) +[![Documentation Status](https://readthedocs.org/projects/rajaperf/badge/?version=develop)](https://raja.readthedocs.io/en/develop/?badge=develop) -Note: if a new execution back-end variant is added that is not listed here, -that variant should go in the file `FOO-.cpp`. Keeping the -back-end variants in separate files helps to understand compiler optimizations -when looking at generated assembly code, for example. +The RAJA Performance Suite is a companion project to the [RAJA] C++ performance +portability abstraction library. The Performance Suite designed to eplore +performance of loop-based computational kernels found in HPC applications. +Specifically, it is used to assess and monitor runtime performance of kernels +implemented using [RAJA] compare those to variants implemented using common +parallel programming models, such as OpenMP and CUDA, directly. 
-#### Kernel class header +User Documentation +------------------- -Here is what a header file for the FOO kernel object should look like: +The RAJA Performance Suite User Guide is the best place to start learning +about it -- how to build it, how to run it, etc. -```cpp -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +The RAJA Performance Suite Developer Guide contains information about +how the source code is structured, how to contribute to it, etc. -/// -/// Foo kernel reference implementation: -/// -/// Describe it here... -/// +The most recent version of these documents (develop branch) is available here: https://rajaperf.readthedocs.io -#ifndef RAJAPerf_Basic_FOO_HPP -#define RAJAPerf_Basic_FOO_HPP +To access docs for other branches or version versions: https://readthedocs.org/projects/rajaperf/ -#include "common/KernelBase.hpp" +Please see the [RAJA] project for more information about RAJA. -namespace rajaperf -{ -class RunParams; // Forward declaration for ctor arg. +Communicate with Us +------------------- -namespace basic -{ +The most effective way to communicate with the RAJA development team +is via our mailing list: **raja-dev@llnl.gov** -class FOO : public KernelBase -{ -public: +If you have questions, find a bug, or have ideas about expanding the +functionality or applicability of the RAJA Performance Suite and are +interested in contributing to its development, please do not hesitate to +contact us. We are very interested in improving the Suite and exploring new +ways to use it. - FOO(const RunParams& params); +Authors +----------- - ~FOO(); +Please see the [RAJA Performance Suite Contributors Page](https://github.com/LLNL/RAJAPerf/graphs/contributors), to see the full list of contributors to the project. - void setUp(VariantID vid, size_t tuning_idx); - void updateChecksum(VariantID vid, size_t tuning_idx); - void tearDown(VariantID vid, size_t tuning_idx); - - void runSeqVariant(VariantID vid, size_t tuning_idx); - void runOpenMPVariant(VariantID vid, size_t tuning_idx); - void runCudaVariant(VariantID vid, size_t tuning_idx); - void runHipVariant(VariantID vid, size_t tuning_idx); - void runOpenMPTargetVariant(VariantID vid, size_t tuning_idx); - -private: - // Kernel-specific data (pointers, scalars, etc.) as needed... -}; - -} // end namespace basic -} // end namespace rajaperf - -#endif // closing endif for header file include guard -``` - -The kernel object header has a uniquely-named header file include guard and -the class is nested within the `rajaperf` and `basic` namespaces. The -constructor takes a reference to a `RunParams` object, which contains the -input parameters for running the Suite -- we'll say more about this later. -The methods that take a variant ID and tuning index arguments must be provided -as they are pure virtual in the KernelBase class. Their names are descriptive of -what they do and we'll provide more details about them when we describe the -class implementation next. - -#### Kernel class implementation - -Each kernel in the Suite follows a similar implementation pattern for -consistency and ease of analysis and understanding. 
Here, we describe several -key steps and conventions that must be followed to ensure that all kernels -interact with the performance Suite machinery in the same way: - -1. Initialize the `KernelBase` class object with `KernelID` and `RunParams` object passed to the FOO class constructor. -2. In the class constructor, define kernel information. This includes: default problem size, default run repetition count, iterations per rep, kernels per rep, bytes per rep, FLOPs per rep, the RAJA features used by the kernel, and kernel variants defined (i.e., implemented) by calling the appropriate members in the `KernelBase` class. See the *.cpp file for any existing kernel in the suite for examples of how this is done. Note that tuning names are added in step 6. -3. Implement data allocation and initialization operations for each kernel variant in the `setUp` method. -4. Compute the checksum for each variant in the `updateChecksum` method. -5. Deallocate and reset any data that will be allocated and/or initialized in subsequent kernel executions in the `tearDown` method. -6. Implement kernel execution for the associated variants and tunings in the `run*Variant` methods in the proper source files. Add tuning names for the tunings of each variant by overriding the `KernelBase` methods `set*TuningDefinitions`. Note that this is not necessary if there is only one tuning. - -##### Constructor and destructor - -It is important to note that there will only be one instance of each kernel -class created by the program. Thus, each kernel class constructor and -destructor must only perform operations that are not specific to any kernel -variant. - -The constructor must pass the kernel ID and RunParams object to the base -class `KernelBase` constructor. The body of the constructor must also call -base class methods to set kernel information described above. Note that -the arguments passed to each method are specific to each kernel, in general. -This code snippets shows a typical way this looks for a simple single for-loop -data parallel kernel. - -```cpp -FOO::FOO(const RunParams& params) - : KernelBase(rajaperf::Basic_Foo, params), -{ - setDefaultProblemSize(1000000); // length of the for-loop - setDefaultReps(1000); // number of times the kernel will execute - // to generate an execution run time value - - setActualProblemSize( getTargetProblemSize() ); // actual problem size may - // be different than the - // default size based on - // user-provided run time - // options - - setItsPerRep( getActualProblemSize() ); - setKernelsPerRep(1); - setBytesPerRep( ... ); // value set based on data read and written when - // kernel executes - setFLOPsPerRep( ... ); // value set based on floating-point operations - // performed when kernel executes - - setUsesFeature(Forall); // the kernel uses the RAJA::forall construct and - // no other RAJA features. - - setVariantDefined( Base_Seq ); - setVariantDefined( Lambda_Seq ); - setVariantDefined( RAJA_Seq ); - - setVariantDefined( Base_OpenMP ); - // etc. -} -``` - -The class destructor doesn't have any requirements beyond freeing memory -owned by the class object as needed. Often, it is empty. - -##### setUp() method - -The `setUp()` method is responsible for allocating and initializing data -necessary to run the kernel for the variant specified by its variant ID -argument. 
For example, a baseline variant may have aligned data allocation -to help enable SIMD optimizations, an OpenMP variant may initialize arrays -following a pattern of "first touch" based on how memory and threads are -mapped to CPU cores, a CUDA variant may initialize data in host memory, -which will be copied to device memory when a CUDA variant executes, etc. - -It is important to use the same data allocation and initialization operations -for all kernel variants so that checksums can be compared at the end of a run. - -Note: to simplify these operations and help ensure consistency, there exist -utility methods to allocate, initialize, deallocate, and copy data, and compute -checksums defined in the `DataUtils.hpp` `CudaDataUtils.hpp`, -`OpenMPTargetDataUtils.hpp`, etc. header files in the `common` directory. - -##### run methods - -Which files contain which 'run' methods and associated variant and tuning -implementations is described above. Each method takes a variant ID argument -which identifies the variant to be run and a tuning index which identifies -the tuning of the variant to run. Note that the tuning index can be ignored -when there is only one tuning. Each method is also responsible for calling base -class methods to start and stop execution timers when a loop variant is run. -A typical kernel execution code section may look like: - -```cpp -void Foo::runSeqVariant(VariantID vid, size_t /*tuning_idx*/) -{ - const Index_type run_reps = getRunReps(); - // ... - - switch ( vid ) { - - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // Implementation of Base_Seq kernel variant... - - } - stopTimer(); - - break; - } - -#if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // Implementation of Lambda_Seq kernel variant... - - } - stopTimer(); - - break; - } - - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // Implementation of RAJA_Seq kernel variant... - - } - stopTimer(); - - break; - } -#endif // RUN_RAJA_SEQ - - default : { - getCout() << "\n : Unknown variant id = " << vid << std::endl; - } - - } -} -``` - -All kernel implementation files are organized in this way. So following this -pattern will ensure all new additions are consistent. - -Important notes: - - * As mentioned earlier, there are multiple source files for each kernel. - The reason for this is that it makes it easier to apply unique compiler - flags to different variants and to manage compilation and linking issues - that arise when some kernel variants are combined in the same translation - unit. - - * For convenience, we make heavy use of macros to define data declarations - and kernel bodies in the Suite. While seemingly cryptic, this significantly - reduces the amount of redundant code required to implement multiple variants - for each kernel and make sure things are the same as much as possible. The - kernel class implementation files in the Suite provide many examples of - the basic pattern we use. - - * We also use macros to define some methods used with GPU block size tunings. - While seemingly cryptic, this significantly reduces the amount of redundant - code required to implement calling and naming each of the multiple tunings - for each kernel and make sure things are the same as much as possible. 
- -##### updateChecksum() method - -The `updateChecksum()` method is responsible for adding the checksum -for the current kernel (based on the data the kernel computes) to the -checksum value for the variant and tuning of the kernel just executed, which is -held in the KernelBase base class object. - -It is important that the checksum be computed in the same way for -each variant of the kernel so that checksums for different variants can be -compared to help identify differences, and potential errors in -implementations, compiler optimizations, programming model execution, etc. - -Note: to simplify checksum computations and help ensure consistency, there -are methods to compute checksums, a weighted sum of array values for example, -are defined in the `DataUtils.hpp` header file in the `common` directory. - -##### tearDown() method - -The `tearDown()` method frees and/or resets all kernel data that is -allocated and/or initialized in the `setUp()` method execution to prepare for -other kernel variants run subsequently. - - -### Add object construction operation - -The `Executor` class in the `common` directory is responsible for creating -kernel objects for the kernels to be run based on the Suite input options. -To ensure a new kernel object will be created properly, add a call to its -class constructor based on its `KernelID` in the `getKernelObject()` -method in the `RAJAPerfSuite.cpp` file. - - -## Adding a variant - -Each variant in the RAJA Performance Suite is identified by an enumeration -value and a string name. Adding a new variant requires adding these two -items similarly to adding those for a kernel as described above. - -### Add the variant ID and name - -First, add an enumeration value identifier for the variant, that is unique -among all variants, in the enum 'VariantID' in the header file -`RAJAPerfSuite.hpp`: - -```cpp -enum VariantID { -.. - NewVariant, -.. -}; -``` - -Second, add the variant name to the array of strings `VariantNames` in the file -`RAJAPerfSuite.cpp`: - -```cpp -static const std::string VariantNames [] = -{ -.. - std::string("NewVariant"), -.. -}; -``` - -Note that the variant string name is just a string version of the variant ID. -This convention must be followed so that the variant works properly with the -Performance Suite machinery. Also, the values in the VariantID enum and the -strings in the `VariantNames` array must be kept consistent (i.e., same order -and matching one-to-one). - -### Add kernel variant implementations - -In the classes containing kernels to which the new variant applies, -add implementations for the variant in the setup, kernel execution, -checksum computation, and teardown methods as needed. Also, make sure to -define the variant for those kernels in the kernel class constructors by -calling `setVariantDefined(NewVariant)` so that the variant can be run. -These operations are described in earlier sections for adding a new kernel -above. - -### Add kernel tuning implementations - -In the classes containing kernels to which the new tuning applies, -add implementations for the tuning in the kernel execution and tuning naming -methods as needed. Note that the tuning indices are determined by the order that -the tuning names are added in the `set*TuningDefinitions` method. Therefore -the `run*Variant` methods should have similar logic in order to run the correct -tuning based on the index. 
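As a rough illustration of keeping tuning names and run logic in sync, the following hypothetical sketch (the `Foo` kernel, the block sizes, and the helper names are illustrative assumptions, not code copied from the Suite) registers tuning names in a fixed order and dispatches on the matching indices in the run method:

```cpp
// Hypothetical sketch: the registration order below defines the tuning indices.
void Foo::setCudaTuningDefinitions(VariantID vid)
{
  addVariantTuningName(vid, "block_128");   // tuning index 0
  addVariantTuningName(vid, "block_256");   // tuning index 1
}

void Foo::runCudaVariant(VariantID vid, size_t tune_idx)
{
  if (tune_idx == 0) {
    runCudaVariantImpl<128>(vid);   // must correspond to "block_128" above
  } else if (tune_idx == 1) {
    runCudaVariantImpl<256>(vid);   // must correspond to "block_256" above
  }
}
```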
- -* * * - -# Continuous Integration - -RAJAPerf Suite uses continuous integration to ensure that changes added to the repository are well integrated and tested for compatibility with the rest of the existing code base. Our CI tests include a variety of vetted configurations that run on different LC machines. - -RAJAPerf Suite shares its Gitlab CI workflow with other projects. The documentation is therefore [shared](https://radiuss-ci.readthedocs.io/en/latest). - -* * * - -# Contributions - -The RAJA Performance Suite is a work-in-progress, with new kernels and variants -added as new features and back-end support are developed in RAJA. We encourage -interested parties to contribute to it so that C++ compiler optimizations and -support for programming models like RAJA continue to improve. - -The Suite developers follow the [GitFlow](http://nvie.com/posts/a-successful-git-branching-model/) development model. Folks wishing to contribute to the Suite, -should include their work in a feature branch created from the Performance -Suite `develop` branch. Then, create a pull request with the `develop` branch -as the destination when it is ready to be reviewed. The `develop` branch -contains the latest work in RAJA Performance Suite. Periodically, we merge the -develop branch into the `main` branch and tag a new release. - -If you would like to contribute to the RAJA Performance Suite, or have -questions about doing so, please contact the maintainer of the Suite listed -below. - -* * * - -# Authors - -The primary developer/maintainer of the RAJA Performance Suite: - - * Rich Hornung (hornung1@llnl.gov) - -Please see the {RAJA Performance Suite Contributors Page](https://github.com/LLNL/RAJAPerf/graphs/contributors), to see the full list of contributors to the -project. - -* * * - -# LICENSE +License +-------- The RAJA Performance Suite is licensed under the BSD 3-Clause license, (BSD-3-Clause or https://opensource.org/licenses/BSD-3-Clause). @@ -860,9 +71,8 @@ following: - [LICENSE](./LICENSE) - [NOTICE](./NOTICE) -* * * - -# SPDX Usage +SPDX Usage +----------- Individual files contain SPDX tags instead of the full license text. This enables machine processing of license information based on the SPDX @@ -873,9 +83,8 @@ text in the license header: SPDX-License-Identifier: (BSD-3-Clause) -* * * - -# External Packages +External Packages +------------------ The RAJA Performance Suite has some external dependencies, which are included as Git submodules. These packages are covered by various permissive licenses. From 520d8f8a6d99d507062679e40aad1916816fb727 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 12:53:38 -0800 Subject: [PATCH 051/137] Fix indentation --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a0a9723d2..bf2eee850 100644 --- a/README.md +++ b/README.md @@ -91,12 +91,12 @@ as Git submodules. These packages are covered by various permissive licenses. A summary listing follows. See the license included with each package for full details. 
-PackageName: BLT -PackageHomePage: https://github.com/LLNL/blt/ +PackageName: RAJA +PackageHomePage: http://github.com/LLNL/RAJA/ PackageLicenseDeclared: BSD-3-Clause -PackageName: RAJA -PackageHomePage: http://github.com/LLNL/RAJA/ +PackageName: BLT +PackageHomePage: https://github.com/LLNL/blt/ PackageLicenseDeclared: BSD-3-Clause * * * From 4f80f3a61b5240d2527b411d53dd563dd4948acf Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 13:03:53 -0800 Subject: [PATCH 052/137] Fix copyright header format --- docs/sphinx/user_guide/output.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst index 02a6f64c4..858c456cb 100644 --- a/docs/sphinx/user_guide/output.rst +++ b/docs/sphinx/user_guide/output.rst @@ -1,4 +1,4 @@ -. ## +.. ## .. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. From d00c9e4cc84c474dd4d337c50970a43a6dd341bc Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 13:10:27 -0800 Subject: [PATCH 053/137] fix awkward wording --- docs/index.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index f716cdf24..945124246 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -44,9 +44,9 @@ parallel programming models supported by RAJA. Some kernels have multiple tunings of a variant to explore the performance implications of options that a programming model supports. -.. note:: Available variants for a kernel do not need to include all possible - variants in the Suite. In some cases, a kernel appears only in the - subset of variants that makes sense for the particular kernel. +.. note:: A kernel does not need to provide all possible variants in the + Suite. Typically, each kernel provides only the subset of variants + that makes sense for the kernel. The Suite can be run as a single process or with multiple processes when configured with MPI support. When running with multiple MPI processes, the same From 2b87074d4207e1914e9a47636dfc6828fe717e0c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 13:12:39 -0800 Subject: [PATCH 054/137] Address reviewers comments --- docs/sphinx/user_guide/build.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst index 8b6774f24..7adb11e1d 100644 --- a/docs/sphinx/user_guide/build.rst +++ b/docs/sphinx/user_guide/build.rst @@ -46,7 +46,7 @@ command:: $ git clone --recursive https://github.com/LLNL/RAJAPerf.git The ``--recursive`` option is required to pull all RAJA Performance Suite -Git *submodules* on which it depends into your local copy of the repository. +Git *submodules* into your local copy of the repository. After running the ``git clone`` command, a copy of the RAJA Performance Suite repository will reside in the ``RAJAPerf`` subdirectory where you ran the @@ -67,7 +67,7 @@ Either way, the result is the same and you should be good to configure the code and build it. .. note:: * If you are in your local copy of the RAJA Performance Suite repo - and you switch to a different repo branch (e.g., you run the + and you switch to a different branch (e.g., you run the command ``git checkout ``), you may need to run the command ``git submodule update`` to set the Git *submodule versions* to what is used by the new branch.
To see if this is From 61d63d113868dfc6f8ea98265405b4bf3971778c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 13:22:02 -0800 Subject: [PATCH 055/137] address review comments --- docs/sphinx/user_guide/build.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst index 7adb11e1d..be1bde2fe 100644 --- a/docs/sphinx/user_guide/build.rst +++ b/docs/sphinx/user_guide/build.rst @@ -133,7 +133,7 @@ processor cores, or all available processor cores on a node, respectively). For example:: $ ./scripts/lc-builds/blueos_nvcc_clang.sh 10.2.89 70 10.0.1 - $ cd build_blueos_nvcc10.2.89-sm_70-clang10.0.1 + $ cd build_blueos_nvcc10.2.89-70-clang10.0.1 $ make -j will build the code for CPU-GPU execution using the clang 10.0.1 compiler for @@ -154,7 +154,7 @@ RAJA tests, pass the two CMake options above plus the option ``-DRAJA_ENABLE_TESTS=Off``. In any case, after the build completes, you can type ``make test`` to run the tests you have built and see the results. -.. note:: Which kernel variants that can be run depends on which programming +.. note:: The kernel variants that can be run depends on which programming model features have been enabled in a build configuration. By default, only *sequential* CPU RAJA and baseline variants will be built. To additionally enable OpenMP variants, for example, you must @@ -162,7 +162,7 @@ type ``make test`` to run the tests you have built and see the results. enable other variants for CUDA, HIP, and other programming models. .. note:: For GPU-enabled builds, only one GPU back-end can be enabled in a - single executable. However, CPU and GPU enabled execution can be + single executable. However, CPU and GPU execution can be enabled in a single executable. For example, one can enable CPU sequential, OpenMP, and CUDA GPU variants in a build. Similarly for HIP GPU variants. @@ -175,7 +175,7 @@ described why this is useful. Some configuration scripts we provide will configure a build with MPI support enabled. For example:: $ ./scripts/lc-builds/lc-blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 70 10.0.1 - $ cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-sm_70-clang10.0.1 + $ cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-70-clang10.0.1 $ make -j This will configure a build to use the *rolling release* of the Spectrum MPI From 559cf592f0a4d3c60a68a469c1935666f19b4772 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 Mar 2023 13:57:25 -0800 Subject: [PATCH 056/137] reviewer comments --- docs/sphinx/user_guide/run.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index f8f26bd3f..c4f53c522 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -38,8 +38,8 @@ The Suite can be run in a variety of ways determined by the command-line options passed to the executable. For example, you can run or exclude subsets of kernels, variants, or groups. You can also pass options to set problem sizes, number of times each kernel is run (sampled), and many other run -parameters.The goal is to build the code once and use scripts or other means -to run the Suite in different ways for analyses you want to perform. +parameters. The goal is to build the code once and use scripts or other means +to run the Suite in different ways for the analyses you want to perform. Each option appears in a *long form* with a double hyphen prefix (i.e., '--'). 
Commonly used options are also available in a one or two character *short form* From ddc89b9f7fee88e709bcc10c7c8d45c29b256449 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" Date: Wed, 8 Mar 2023 22:15:18 +0100 Subject: [PATCH 057/137] Add caliper and adiak as optional dependencies (anticipate merge of caliper integration PRs) --- scripts/spack_packages/raja_perf/package.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 105a62f73..de457ea7f 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -43,6 +43,7 @@ class RajaPerf(CachedCMakePackage, CudaPackage, ROCmPackage): variant("libcpp", default=False, description="Uses libc++ instead of libstdc++") variant("tests", default="basic", values=("none", "basic", "benchmarks"), multi=False, description="Tests to run") + variant("caliper",default=False, description="Build with support for Caliper based profiling") depends_on("blt") depends_on("blt@0.5.2:", type="build", when="@2022.10.0:") @@ -59,9 +60,15 @@ class RajaPerf(CachedCMakePackage, CudaPackage, ROCmPackage): depends_on("rocprim", when="+rocm") + + conflicts("~openmp", when="+openmp_target", msg="OpenMP target requires OpenMP") conflicts("+cuda", when="+openmp_target", msg="Cuda may not be activated when openmp_target is ON") + depends_on("caliper@master",when="+caliper") + depends_on("caliper@master +cuda",when="+caliper +cuda") + depends_on("caliper@master +rocm",when="+caliper +rocm") + def _get_sys_type(self, spec): sys_type = str(spec.architecture) # if on llnl systems, we can use the SYS_TYPE @@ -232,6 +239,11 @@ def initconfig_package_entries(self): entries.append(cmake_cache_option("ENABLE_BENCHMARKS", "tests=benchmarks" in spec)) entries.append(cmake_cache_option("ENABLE_TESTS", not "tests=none" in spec or self.run_tests)) + entries.append(cmake_cache_option("RAJA_PERFSUITE_USE_CALIPER","+caliper" in spec)) + if "caliper" in self.spec: + entries.append(cmake_cache_path("caliper_DIR", spec["caliper"].prefix+"/share/cmake/caliper/")) + entries.append(cmake_cache_path("adiak_DIR", spec["adiak"].prefix+"/lib/cmake/adiak/")) + return entries def cmake_args(self): From bd2eb552b307e7959f4b2e6fd3472b9859e5f319 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" Date: Wed, 8 Mar 2023 22:20:40 +0100 Subject: [PATCH 058/137] Update RAJA to update radiuss-spack-configs to feature raja-perf package --- .uberenv_config.json | 2 +- scripts/spack_packages/camp/package.py | 188 --------------- scripts/spack_packages/raja_perf/package.py | 252 -------------------- tpl/RAJA | 2 +- 4 files changed, 2 insertions(+), 442 deletions(-) delete mode 100644 scripts/spack_packages/camp/package.py delete mode 100644 scripts/spack_packages/raja_perf/package.py diff --git a/.uberenv_config.json b/.uberenv_config.json index 7a9bb57c8..148959940 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -7,7 +7,7 @@ "spack_branch": "v0.19.0", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", -"spack_packages_path": "scripts/spack_packages", +"spack_packages_path": "tpl/RAJA/scripts/radiuss-spack-configs/packages", "spack_concretizer": "clingo", "spack_setup_clingo": false } diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py deleted file mode 100644 index f8d31eb42..000000000 --- a/scripts/spack_packages/camp/package.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: (Apache-2.0 OR MIT) - -import os -import glob -import re - -from spack.package import * - - -def spec_uses_toolchain(spec): - gcc_toolchain_regex = re.compile(".*gcc-toolchain.*") - using_toolchain = list(filter(gcc_toolchain_regex.match, spec.compiler_flags["cxxflags"])) - return using_toolchain - -def spec_uses_gccname(spec): - gcc_name_regex = re.compile(".*gcc-name.*") - using_gcc_name = list(filter(gcc_name_regex.match, spec.compiler_flags["cxxflags"])) - return using_gcc_name - -def hip_repair_options(options, spec): - # there is only one dir like this, but the version component is unknown - options.append( - "-DHIP_CLANG_INCLUDE_PATH=" - + glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0] - ) - -def hip_repair_cache(options, spec): - # there is only one dir like this, but the version component is unknown - options.append( - cmake_cache_path( - "HIP_CLANG_INCLUDE_PATH", - glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0], - ) - ) - -def hip_for_radiuss_projects(options, spec, spec_compiler): - # Here is what is typically needed for radiuss projects when building with rocm - hip_root = spec["hip"].prefix - rocm_root = hip_root + "/.." 
- options.append(cmake_cache_path("HIP_ROOT_DIR", hip_root)) - options.append(cmake_cache_path("ROCM_ROOT_DIR", rocm_root)) - - hip_repair_cache(options, spec) - - archs = spec.variants["amdgpu_target"].value - if archs != "none": - arch_str = ",".join(archs) - options.append( - cmake_cache_string("HIP_HIPCC_FLAGS", "--amdgpu-target={0}".format(arch_str)) - ) - options.append( - cmake_cache_string("CMAKE_HIP_ARCHITECTURES", arch_str) - ) - - # adrienbernede-22-11: - # Specific to Umpire, attempt port to RAJA and CHAI - hip_link_flags = "" - if "%gcc" in spec or spec_uses_toolchain(spec): - if "%gcc" in spec: - gcc_bin = os.path.dirname(spec_compiler.cxx) - gcc_prefix = join_path(gcc_bin, "..") - else: - gcc_prefix = spec_uses_toolchain(spec)[0] - options.append(cmake_cache_string("HIP_CLANG_FLAGS", "--gcc-toolchain={0}".format(gcc_prefix))) - options.append(cmake_cache_string("CMAKE_EXE_LINKER_FLAGS", hip_link_flags + " -Wl,-rpath {}/lib64".format(gcc_prefix))) - else: - options.append(cmake_cache_string("CMAKE_EXE_LINKER_FLAGS", "-Wl,-rpath={0}/llvm/lib/".format(rocm_root))) - -def cuda_for_radiuss_projects(options, spec): - # Here is what is typically needed for radiuss projects when building with cuda - - cuda_flags = [] - if not spec.satisfies("cuda_arch=none"): - cuda_arch = spec.variants["cuda_arch"].value - cuda_flags.append("-arch sm_{0}".format(cuda_arch[0])) - options.append( - cmake_cache_string("CUDA_ARCH", "sm_{0}".format(cuda_arch[0]))) - options.append( - cmake_cache_string("CMAKE_CUDA_ARCHITECTURES", "{0}".format(cuda_arch[0]))) - if spec_uses_toolchain(spec): - cuda_flags.append("-Xcompiler {}".format(spec_uses_toolchain(spec)[0])) - if (spec.satisfies("%gcc@8.1: target=ppc64le")): - cuda_flags.append("-Xcompiler -mno-float128") - options.append(cmake_cache_string("CMAKE_CUDA_FLAGS", " ".join(cuda_flags))) - -def blt_link_helpers(options, spec, spec_compiler): - - ### From local package: - fortran_compilers = ["gfortran", "xlf"] - if any(compiler in spec_compiler.fc for compiler in fortran_compilers) and ("clang" in spec_compiler.cxx): - # Pass fortran compiler lib as rpath to find missing libstdc++ - libdir = os.path.join(os.path.dirname( - os.path.dirname(spec_compiler.fc)), "lib") - flags = "" - for _libpath in [libdir, libdir + "64"]: - if os.path.exists(_libpath): - flags += " -Wl,-rpath,{0}".format(_libpath) - description = ("Adds a missing libstdc++ rpath") - if flags: - options.append(cmake_cache_string("BLT_EXE_LINKER_FLAGS", flags, description)) - - # Ignore conflicting default gcc toolchain - options.append(cmake_cache_string("BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE", - "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/x86_64-unknown-linux-gnu/4.9.3")) - - compilers_using_toolchain = ["pgi", "xl", "icpc"] - if any(compiler in spec_compiler.cxx for compiler in compilers_using_toolchain): - if spec_uses_toolchain(spec) or spec_uses_gccname(spec): - - # Ignore conflicting default gcc toolchain - options.append(cmake_cache_string("BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE", - "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/x86_64-unknown-linux-gnu/4.9.3")) - - -class Camp(CMakePackage, CudaPackage, ROCmPackage): - """ - Compiler agnostic metaprogramming 
library providing concepts, - type operations and tuples for C++ and cuda - """ - - homepage = "https://github.com/LLNL/camp" - git = "https://github.com/LLNL/camp.git" - url = "https://github.com/LLNL/camp/archive/v0.1.0.tar.gz" - - maintainers = ["trws"] - - version("main", branch="main", submodules="False") - version("2022.10.1", sha256="2d12f1a46f5a6d01880fc075cfbd332e2cf296816a7c1aa12d4ee5644d386f02") - version("2022.10.0", sha256="3561c3ef00bbcb61fe3183c53d49b110e54910f47e7fc689ad9ccce57e55d6b8") - version("2022.03.2", sha256="bc4aaeacfe8f2912e28f7a36fc731ab9e481bee15f2c6daf0cb208eed3f201eb") - version("2022.03.0", sha256="e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721") - version("0.3.0", sha256="129431a049ca5825443038ad5a37a86ba6d09b2618d5fe65d35f83136575afdb") - version("0.2.3", sha256="58a0f3bd5eadb588d7dc83f3d050aff8c8db639fc89e8d6553f9ce34fc2421a7") - version("0.2.2", sha256="194d38b57e50e3494482a7f94940b27f37a2bee8291f2574d64db342b981d819") - version("0.1.0", sha256="fd4f0f2a60b82a12a1d9f943f8893dc6fe770db493f8fae5ef6f7d0c439bebcc") - - # TODO: figure out gtest dependency and then set this default True. - variant("tests", default=False, description="Build tests") - variant("openmp", default=False, description="Build with OpenMP support") - - depends_on("cub", when="+cuda") - - depends_on("blt") - - def cmake_args(self): - spec = self.spec - - options = [] - - options.append("-DBLT_SOURCE_DIR={0}".format(spec["blt"].prefix)) - - if "+cuda" in spec: - options.extend([ - "-DENABLE_CUDA=ON", - "-DCUDA_TOOLKIT_ROOT_DIR=%s" % (spec["cuda"].prefix) - ]) - - if not spec.satisfies("cuda_arch=none"): - cuda_arch = spec.variants["cuda_arch"].value - options.append("-DCMAKE_CUDA_ARCHITECTURES={0}".format(cuda_arch[0])) - options.append("-DCUDA_ARCH=sm_{0}".format(cuda_arch[0])) - flag = "-arch sm_{0}".format(cuda_arch[0]) - options.append("-DCMAKE_CUDA_FLAGS:STRING={0}".format(flag)) - else: - options.append("-DENABLE_CUDA=OFF") - - if "+rocm" in spec: - options.extend([ - "-DENABLE_HIP=ON", - "-DHIP_ROOT_DIR={0}".format(spec["hip"].prefix) - ]) - - hip_repair_options(options, spec) - - archs = self.spec.variants["amdgpu_target"].value - if archs != "none": - arch_str = ",".join(archs) - options.append("-DHIP_HIPCC_FLAGS=--amdgpu-target={0}".format(arch_str)) - else: - options.append("-DENABLE_HIP=OFF") - - options.append(self.define_from_variant("ENABLE_OPENMP", "openmp")) - options.append(self.define_from_variant("ENABLE_TESTS", "tests")) - - return options diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py deleted file mode 100644 index de457ea7f..000000000 --- a/scripts/spack_packages/raja_perf/package.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: (Apache-2.0 OR MIT) - -import os -import socket -import re - -from os import environ as env -from os.path import join as pjoin - -from spack import * -from spack.pkg.builtin.camp import hip_for_radiuss_projects -from spack.pkg.builtin.camp import cuda_for_radiuss_projects -from spack.pkg.builtin.camp import blt_link_helpers - - -class RajaPerf(CachedCMakePackage, CudaPackage, ROCmPackage): - """RAJA Performance Suite.""" - - homepage = "http://software.llnl.gov/RAJAPerf/" - git = "https://github.com/LLNL/RAJAPerf.git" - - version("develop", branch="develop", submodules="True") - version("main", branch="main", submodules="True") - version("2022.10.0", tag="v2022.10.0", submodules="True") - version("0.12.0", tag="v0.12.0", submodules="True") - version("0.11.0", tag="v0.11.0", submodules="True") - version("0.10.0", tag="v0.10.0", submodules="True") - version("0.9.0", tag="v0.9.0", submodules="True") - version("0.8.0", tag="v0.8.0", submodules="True") - version("0.7.0", tag="v0.7.0", submodules="True") - version("0.6.0", tag="v0.6.0", submodules="True") - version("0.5.2", tag="v0.5.2", submodules="True") - version("0.5.1", tag="v0.5.1", submodules="True") - version("0.5.0", tag="v0.5.0", submodules="True") - version("0.4.0", tag="v0.4.0", submodules="True") - - variant("openmp", default=True, description="Build OpenMP backend") - variant("openmp_target", default=False, description="Build with OpenMP target support") - variant("shared", default=False, description="Build Shared Libs") - variant("libcpp", default=False, description="Uses libc++ instead of libstdc++") - variant("tests", default="basic", values=("none", "basic", "benchmarks"), - multi=False, description="Tests to run") - variant("caliper",default=False, description="Build with support for Caliper based profiling") - - depends_on("blt") - depends_on("blt@0.5.2:", type="build", when="@2022.10.0:") - depends_on("blt@0.5.0:", type="build", when="@0.12.0:") - depends_on("blt@0.4.1:", type="build", when="@0.11.0:") - depends_on("blt@0.4.0:", type="build", when="@0.8.0:") - depends_on("blt@0.3.0:", type="build", when="@:0.7.0") - - depends_on("cmake@3.20:", when="@0.12.0:", type="build") - depends_on("cmake@3.23:", when="@0.12.0: +rocm", type="build") - depends_on("cmake@3.14:", when="@:0.12.0", type="build") - - depends_on("llvm-openmp", when="+openmp %apple-clang") - - depends_on("rocprim", when="+rocm") - - - - conflicts("~openmp", when="+openmp_target", msg="OpenMP target requires OpenMP") - conflicts("+cuda", when="+openmp_target", msg="Cuda may not be activated when openmp_target is ON") - - depends_on("caliper@master",when="+caliper") - depends_on("caliper@master +cuda",when="+caliper +cuda") - depends_on("caliper@master +rocm",when="+caliper +rocm") - - def _get_sys_type(self, spec): - sys_type = str(spec.architecture) - # if on llnl systems, we can use the SYS_TYPE - if "SYS_TYPE" in env: - sys_type = env["SYS_TYPE"] - return sys_type - - @property - # TODO: name cache file conditionally to cuda and libcpp variants - def cache_name(self): - hostname = socket.gethostname() - if "SYS_TYPE" in env: - hostname = hostname.rstrip("1234567890") - var="" - if "+cuda" in self.spec: - var= "-".join([var,"cuda"]) - if "+libcpp" in self.spec: - var="-".join([var,"libcpp"]) - - return "{0}-{1}{2}-{3}@{4}-{5}.cmake".format( - hostname, - self._get_sys_type(self.spec), - var, - self.spec.compiler.name, - self.spec.compiler.version, - self.spec.dag_hash(8) - ) - - def initconfig_compiler_entries(self): - spec = 
self.spec - compiler = self.compiler - # Default entries are already defined in CachedCMakePackage, inherit them: - entries = super(RajaPerf, self).initconfig_compiler_entries() - - # Switch to hip as a CPP compiler. - # adrienbernede-22-11: - # This was only done in upstream Spack raja package. - # I could not find the equivalent logic in Spack source, so keeping it. - #if "+rocm" in spec: - # entries.insert(0, cmake_cache_path("CMAKE_CXX_COMPILER", spec["hip"].hipcc)) - - # Override CachedCMakePackage CMAKE_C_FLAGS and CMAKE_CXX_FLAGS add - # +libcpp specific flags - flags = spec.compiler_flags - - # use global spack compiler flags - cppflags = " ".join(flags["cppflags"]) - if cppflags: - # avoid always ending up with " " with no flags defined - cppflags += " " - - cflags = cppflags + " ".join(flags["cflags"]) - if "+libcpp" in spec: - cflags += " ".join([cflags,"-DGTEST_HAS_CXXABI_H_=0"]) - if cflags: - entries.append(cmake_cache_string("CMAKE_C_FLAGS", cflags)) - - cxxflags = cppflags + " ".join(flags["cxxflags"]) - if "+libcpp" in spec: - cxxflags += " ".join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"]) - if cxxflags: - entries.append(cmake_cache_string("CMAKE_CXX_FLAGS", cxxflags)) - - blt_link_helpers(entries, spec, compiler) - - # adrienbernede-23-01 - # Maybe we want to share this in the above blt_link_helpers function. - compilers_using_cxx14 = ["intel-17", "intel-18", "xl"] - if any(compiler in self.compiler.cxx for compiler in compilers_using_cxx14): - entries.append(cmake_cache_string("BLT_CXX_STD", "c++14")) - - return entries - - def initconfig_hardware_entries(self): - spec = self.spec - compiler = self.compiler - entries = super(RajaPerf, self).initconfig_hardware_entries() - - entries.append(cmake_cache_option("ENABLE_OPENMP", "+openmp" in spec)) - - # T benefit from the shared function "cuda_for_radiuss_projects", - # we do not modify CMAKE_CUDA_FLAGS: it is already appended by the - # shared function. - if "+cuda" in spec: - entries.append(cmake_cache_option("ENABLE_CUDA", True)) - # Shared handling of cuda. - cuda_for_radiuss_projects(entries, spec) - - # Custom options. 
We place everything in CMAKE_CUDA_FLAGS_(RELEASE|RELWITHDEBINFO|DEBUG) which are not set by cuda_for_radiuss_projects - if ("xl" in self.compiler.cxx): - all_targets_flags = "-Xcompiler -qstrict -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi" \ - + "-Xcompiler -qsmp=omp -Xcompiler -qhot -Xcompiler -qnoeh" \ - + "-Xcompiler -qsuppress=1500-029 -Xcompiler -qsuppress=1500-036" \ - + "-Xcompiler -qsuppress=1500-030" \ - - cuda_release_flags = "-O3 -Xcompiler -O2 " + all_targets_flags - cuda_reldebinf_flags = "-O3 -g -Xcompiler -O2 " + all_targets_flags - cuda_debug_flags = "-O0 -g -Xcompiler -O2 " + all_targets_flags - - elif ("gcc" in self.compiler.cxx): - all_targets_flags = "-Xcompiler -finline-functions -Xcompiler -finline-limit=20000" - - cuda_release_flags = "-O3 -Xcompiler -Ofast " + all_targets_flags - cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast " + all_targets_flags - cuda_debug_flags = "-O0 -g -Xcompiler -O0 " + all_targets_flags - - else: - all_targets_flags = "-Xcompiler -finline-functions" - - cuda_release_flags = "-O3 -Xcompiler -Ofast " + all_targets_flags - cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast " + all_targets_flags - cuda_debug_flags = "-O0 -g -Xcompiler -O0 " + all_targets_flags - - entries.append(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) - entries.append(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) - entries.append(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", cuda_debug_flags)) - - else: - entries.append(cmake_cache_option("ENABLE_CUDA", False)) - - if "+rocm" in spec: - entries.append(cmake_cache_option("ENABLE_HIP", True)) - hip_for_radiuss_projects(entries, spec, compiler) - else: - entries.append(cmake_cache_option("ENABLE_HIP", False)) - - entries.append(cmake_cache_option("ENABLE_OPENMP_TARGET", "+openmp_target" in spec)) - if "+openmp_target" in spec: - if ("%xl" in spec): - entries.append(cmake_cache_string("BLT_OPENMP_COMPILE_FLAGS", "-qoffload;-qsmp=omp;-qnoeh;-qalias=noansi")) - entries.append(cmake_cache_string("BLT_OPENMP_LINK_FLAGS", "-qoffload;-qsmp=omp;-qnoeh;-qalias=noansi")) - if ("%clang" in spec): - entries.append(cmake_cache_string("BLT_OPENMP_COMPILE_FLAGS", "-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda")) - entries.append(cmake_cache_string("BLT_OPENMP_LINK_FLAGS", "-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda")) - - return entries - - def initconfig_package_entries(self): - spec = self.spec - entries = [] - - option_prefix = "RAJA_" if spec.satisfies("@0.14.0:") else "" - - # TPL locations - entries.append("#------------------{0}".format("-" * 60)) - entries.append("# TPLs") - entries.append("#------------------{0}\n".format("-" * 60)) - - entries.append(cmake_cache_path("BLT_SOURCE_DIR", spec["blt"].prefix)) - - # Build options - entries.append("#------------------{0}".format("-" * 60)) - entries.append("# Build Options") - entries.append("#------------------{0}\n".format("-" * 60)) - - entries.append(cmake_cache_string( - "CMAKE_BUILD_TYPE", spec.variants["build_type"].value)) - - entries.append(cmake_cache_string("RAJA_RANGE_ALIGN", "4")) - entries.append(cmake_cache_string("RAJA_RANGE_MIN_LENGTH", "32")) - entries.append(cmake_cache_string("RAJA_DATA_ALIGN", "64")) - - entries.append(cmake_cache_option("RAJA_HOST_CONFIG_LOADED", True)) - - entries.append(cmake_cache_option("BUILD_SHARED_LIBS","+shared" in spec)) - entries.append(cmake_cache_option("ENABLE_OPENMP","+openmp" in spec)) - - entries.append(cmake_cache_option("ENABLE_BENCHMARKS", "tests=benchmarks" in 
spec)) - entries.append(cmake_cache_option("ENABLE_TESTS", not "tests=none" in spec or self.run_tests)) - - entries.append(cmake_cache_option("RAJA_PERFSUITE_USE_CALIPER","+caliper" in spec)) - if "caliper" in self.spec: - entries.append(cmake_cache_path("caliper_DIR", spec["caliper"].prefix+"/share/cmake/caliper/")) - entries.append(cmake_cache_path("adiak_DIR", spec["adiak"].prefix+"/lib/cmake/adiak/")) - - return entries - - def cmake_args(self): - options = [] - return options - diff --git a/tpl/RAJA b/tpl/RAJA index 28fbae0c9..c51c2c488 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 28fbae0c957d223ea88685669f5eb2108aca84d1 +Subproject commit c51c2c488bc8e03eacd5f6a77c5137d86d38b25e From 12ef9fd2ffbc61a7774536f6104cd2acd382bde2 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" Date: Wed, 8 Mar 2023 22:59:07 +0100 Subject: [PATCH 059/137] Update Radiuss-Shared-CI --- .gitlab-ci.yml | 17 ++++++++++++++--- .gitlab/corona-build-and-test-extra.yml | 4 ++-- .gitlab/custom-jobs-and-variables.yml | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 34c636131..ec4d08f13 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,6 +24,8 @@ # We define the following GitLab pipeline variables: variables: + GITHUB_PROJECT_NAME: "RAJAPerf" + GITHUB_PROJECT_ORG: "LLNL" # Use the umdev service user to run CI. This prevents from running pipelines as # an actual user. LLNL_SERVICE_USER: umdev @@ -35,6 +37,11 @@ variables: # We build the projects in the CI clone directory. # TODO: add a clean-up mechanism BUILD_ROOT: ${CI_PROJECT_DIR} +# Set the build-and-test command. + BUILD_AND_TEST_CMD: "./scripts/gitlab/build_and_test.sh" +# Override the list of branch that will skip the "draft PR test". +# Add protected branches here. Defaults to "develop main master". +# ALWAYS_RUN_LIST: "develop main" # We organize the CI on Gitlab in sub-pipelines. Each sub-pipeline corresponds # to a test phase on a given machine. 
@@ -50,13 +57,17 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2022.09.0 + ref: v2023.03.0rc file: '${CI_MACHINE}-build-and-test.yml' - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' strategy: depend forward: pipeline_variables: true -# pipelines subscribed by the project include: - - local: .gitlab/subscribed-pipelines.yml + # checks preliminary to running the actual CI test (optional) + #- project: 'radiuss/radiuss-shared-ci' + # ref: v2023.03.0rc + # file: 'preliminary-ignore-draft-pr.yml' + # pipelines subscribed by the project + - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/corona-build-and-test-extra.yml index 1dd6b9bed..2f48f46c9 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/corona-build-and-test-extra.yml @@ -27,7 +27,7 @@ variables: INCLUDED_FILE_CANNOT_BE_EMPTY: "True" # INFO: This job is activated in RAJA CI, but we don't use desul atomics here -#rocm_5_1_1_clang_13_0_0_desul_atomics: +#rocmcc_5_1_1_hip_desul_atomics: # variables: -# SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 ^blt@develop ^hip@5.1.1" +# SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %rocmcc@5.1.1 ^blt@develop ^hip@5.1.1" # extends: .build_and_test_on_corona diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index a7c72805a..5078b027a 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -26,7 +26,7 @@ variables: # Corona # Arguments for top level allocation - CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--time-limit=60m --nodes=1" + CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" # Arguments for job level allocation CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=30m --nodes=1" # Project specific variants for corona From d2a94587dc724e4e55be5634549863d127e2c21e Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" Date: Wed, 8 Mar 2023 23:00:02 +0100 Subject: [PATCH 060/137] Update RAJA (with update of Radiuss Spack Configs --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index c51c2c488..ce1f40198 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit c51c2c488bc8e03eacd5f6a77c5137d86d38b25e +Subproject commit ce1f40198c4835ea809b7d778da7e3d997b0df56 From 142d4e3d6c04bf27c76227b47065ca8fe54cc534 Mon Sep 17 00:00:00 2001 From: Doru Bercea Date: Wed, 8 Mar 2023 18:23:26 -0500 Subject: [PATCH 061/137] Add --disable_warmup flag. 
--- src/common/Executor.cpp | 28 +++++++++++++++------------- src/common/RunParams.cpp | 17 +++++++++++++---- src/common/RunParams.hpp | 4 ++++ 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 85af7dbcf..d40bcae44 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -849,21 +849,23 @@ void Executor::runSuite() return; } - getCout() << "\n\nRun warmup kernels...\n"; - vector warmup_kernels; - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - - for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { - KernelBase* warmup_kernel = warmup_kernels[ik]; - runKernel(warmup_kernel, true); - delete warmup_kernel; - warmup_kernels[ik] = nullptr; + if (!run_params.getDisableWarmup()) { + getCout() << "\n\nRun warmup kernels...\n"; + + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + + for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { + KernelBase* warmup_kernel = warmup_kernels[ik]; + runKernel(warmup_kernel, true); + delete warmup_kernel; + warmup_kernels[ik] = nullptr; + } } diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 527e962a2..93fb7703c 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -54,7 +54,8 @@ RunParams::RunParams(int argc, char** argv) npasses_combiner_input(), invalid_npasses_combiner_input(), outdir(), - outfile_prefix("RAJAPerf") + outfile_prefix("RAJAPerf"), + disable_warmup(0) { parseCommandLineOptions(argc, argv); } @@ -110,6 +111,8 @@ void RunParams::print(std::ostream& str) const str << "\n outdir = " << outdir; str << "\n outfile_prefix = " << outfile_prefix; + str << "\n disable_warmup = " << disable_warmup; + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; @@ -526,9 +529,13 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( std::string(argv[i]) == std::string("--dryrun") ) { - if (input_state != BadInput) { - input_state = DryRun; - } + if (input_state != BadInput) { + input_state = DryRun; + } + + } else if ( std::string(argv[i]) == std::string("--disable_warmup") ) { + + disable_warmup = 1; } else if ( std::string(argv[i]) == std::string("--checkrun") ) { @@ -694,6 +701,8 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --dryrun (print summary of how Suite will run without running it)\n\n"; + str << "\t --disable_warmup (disable warmup tests)\n\n"; + str << "\t --checkrun [default is 1]\n" << "\t (run each kernel a given number of times; usually to check things are working properly or to reduce aggregate execution time)\n"; str << "\t\t Example...\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 806faea85..ffbf49655 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -196,6 +196,8 @@ class RunParams { const std::string& getOutputDirName() const { return outdir; } const std::string& getOutputFilePrefix() const { return outfile_prefix; } + const bool getDisableWarmup() const { return disable_warmup; } + //@} /*! @@ -268,6 +270,8 @@ class RunParams { std::string outdir; /*!< Output directory name. */ std::string outfile_prefix; /*!< Prefix for output data file names. 
*/ + bool disable_warmup; + }; From 182b642babf0fa2d566fa5a4554575ddbf3b9186 Mon Sep 17 00:00:00 2001 From: Doru Bercea Date: Wed, 8 Mar 2023 18:23:26 -0500 Subject: [PATCH 062/137] Add --disable_warmup flag. --- src/common/Executor.cpp | 32 +++++++++++++++++--------------- src/common/RunParams.cpp | 17 +++++++++++++---- src/common/RunParams.hpp | 4 ++++ 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 85af7dbcf..a41e3fd00 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -849,21 +849,23 @@ void Executor::runSuite() return; } - getCout() << "\n\nRun warmup kernels...\n"; - - vector warmup_kernels; - - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); - - for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { - KernelBase* warmup_kernel = warmup_kernels[ik]; - runKernel(warmup_kernel, true); - delete warmup_kernel; - warmup_kernels[ik] = nullptr; + if (!run_params.getDisableWarmup()) { + getCout() << "\n\nRun warmup kernels...\n"; + + vector warmup_kernels; + + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + + for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { + KernelBase* warmup_kernel = warmup_kernels[ik]; + runKernel(warmup_kernel, true); + delete warmup_kernel; + warmup_kernels[ik] = nullptr; + } } diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 527e962a2..93fb7703c 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -54,7 +54,8 @@ RunParams::RunParams(int argc, char** argv) npasses_combiner_input(), invalid_npasses_combiner_input(), outdir(), - outfile_prefix("RAJAPerf") + outfile_prefix("RAJAPerf"), + disable_warmup(0) { parseCommandLineOptions(argc, argv); } @@ -110,6 +111,8 @@ void RunParams::print(std::ostream& str) const str << "\n outdir = " << outdir; str << "\n outfile_prefix = " << outfile_prefix; + str << "\n disable_warmup = " << disable_warmup; + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; @@ -526,9 +529,13 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( std::string(argv[i]) == std::string("--dryrun") ) { - if (input_state != BadInput) { - input_state = DryRun; - } + if (input_state != BadInput) { + input_state = DryRun; + } + + } else if ( std::string(argv[i]) == std::string("--disable_warmup") ) { + + disable_warmup = 1; } else if ( std::string(argv[i]) == std::string("--checkrun") ) { @@ -694,6 +701,8 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --dryrun (print summary of how Suite will run without running it)\n\n"; + str << "\t --disable_warmup (disable warmup tests)\n\n"; + str << "\t --checkrun [default is 1]\n" << "\t (run each kernel a given number of times; usually to check things are working properly or to reduce aggregate execution time)\n"; str << "\t\t Example...\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 806faea85..ffbf49655 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -196,6 +196,8 @@ class RunParams { const std::string& getOutputDirName() const { return outdir; } const std::string& getOutputFilePrefix() const { return 
outfile_prefix; } + const bool getDisableWarmup() const { return disable_warmup; } + //@} /*! @@ -268,6 +270,8 @@ class RunParams { std::string outdir; /*!< Output directory name. */ std::string outfile_prefix; /*!< Prefix for output data file names. */ + bool disable_warmup; + }; From 4f1498aee9f7a27f72ec571e8bfad88608a19000 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" Date: Thu, 9 Mar 2023 11:28:40 +0100 Subject: [PATCH 063/137] Skip failing job, do not allow 2 passing jobs to fail --- .gitlab/lassen-build-and-test-extra.yml | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml index aad9c6c2e..f76cbe14d 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/lassen-build-and-test-extra.yml @@ -27,6 +27,14 @@ ibm_clang_9_0_0_gcc_8_3_1: extends: .build_and_test_on_lassen allow_failure: true +# Overriding shared spec: Longer allocation + Allow failures +ibm_clang_9_0_0_gcc_8_3_1_cuda_10_1_168: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %clang@ibm.9.0.0 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@10.1.168 ${PROJECT_LASSEN_DEPS}" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60" + extends: .build_and_test_on_lassen + allow_failure: true + # Overriding shared spec: Extra flags gcc_8_3_1: variables: @@ -55,12 +63,18 @@ xl_16_1_1_12_gcc_8_3_1: LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 50" extends: .build_and_test_on_lassen -# Overriding shared spec: Longer allocation + Allow failures -ibm_clang_9_0_0_gcc_8_3_1_cuda_10_1_168: +# Overriding shared spec: Longer allocation + Extra flags +xl_16_1_1_12_gcc_7_3_1_cuda_10_1_168: variables: - SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %clang@ibm.9.0.0 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@10.1.168 ${PROJECT_LASSEN_DEPS}" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags==\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-7.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qsmp=omp -qnoeh -qsuppress=1500-029 -qsuppress=1500-036\" cflags==\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-7.3.1\" cuda_arch=70 ^cuda@10.1.168 ${PROJECT_LASSEN_DEPS}" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60" extends: .build_and_test_on_lassen + script: + - | + echo -e "\e[31mDeactivated spec !\e[0m" + echo -e "\e[31m${SPEC}\e[0m" + echo -e "\e[31mRAJA won’t build with Cuda < 11.1.0 due to a known issue.\e[0m" + - exit 1 allow_failure: true # Overriding shared spec: Extra flags + Longer allocation + Allow failure @@ -68,7 +82,6 @@ xl_16_1_1_12_cuda_11_1_0: variables: SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cuda_arch=70 ^cuda@11.1.0 ${PROJECT_LASSEN_DEPS}" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60" - allow_failure: true extends: .build_and_test_on_lassen # Overriding shared spec: Extra flags + Longer allocation + Allow failure @@ -76,7 +89,6 @@ xl_16_1_1_12_gcc_8_3_1_cuda_11_1_0: variables: SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags\"=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 
-qsuppress=1500-036\" cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@11.1.0 ${PROJECT_LASSEN_DEPS}" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60" - allow_failure: true extends: .build_and_test_on_lassen From a7636b9dc43f0d22e80ec9b44a2a9de96ae0ff14 Mon Sep 17 00:00:00 2001 From: Gheorghe-Teodor Bercea Date: Mon, 13 Mar 2023 12:19:06 -0400 Subject: [PATCH 064/137] Update src/common/RunParams.cpp Co-authored-by: Jason Burmark --- src/common/RunParams.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 93fb7703c..be524cd60 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -535,7 +535,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( std::string(argv[i]) == std::string("--disable_warmup") ) { - disable_warmup = 1; + disable_warmup = true; } else if ( std::string(argv[i]) == std::string("--checkrun") ) { From a0f3d1e3c20e35ef92c7861beaa33b895de20070 Mon Sep 17 00:00:00 2001 From: Gheorghe-Teodor Bercea Date: Mon, 13 Mar 2023 12:19:16 -0400 Subject: [PATCH 065/137] Update src/common/RunParams.cpp Co-authored-by: Jason Burmark --- src/common/RunParams.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index be524cd60..56e3cf1f6 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -55,7 +55,7 @@ RunParams::RunParams(int argc, char** argv) invalid_npasses_combiner_input(), outdir(), outfile_prefix("RAJAPerf"), - disable_warmup(0) + disable_warmup(false) { parseCommandLineOptions(argc, argv); } From ac7fd16524bcf3c2327b3949902fc46378f2a42a Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" Date: Mon, 13 Mar 2023 20:56:48 +0100 Subject: [PATCH 066/137] Update RAJA submodule --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index ce1f40198..6e1b9afbe 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit ce1f40198c4835ea809b7d778da7e3d997b0df56 +Subproject commit 6e1b9afbebd3d536de447d36733b473cc8dea790 From be48b93d772d17c65628ed1494300ee051a77015 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" Date: Mon, 13 Mar 2023 21:12:41 +0100 Subject: [PATCH 067/137] Activate Tioga CI --- .gitlab/subscribed-pipelines.yml | 5 ++++ .gitlab/tioga-build-and-test-extra.yml | 39 ++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 .gitlab/tioga-build-and-test-extra.yml diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index ede2aebfe..10258c3ec 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -19,6 +19,11 @@ corona-build-and-test: CI_MACHINE: "corona" extends: [.build-and-test] +tioga-build-and-test: + variables: + CI_MACHINE: "tioga" + extends: [.build-and-test] + lassen-build-and-test: variables: CI_MACHINE: "lassen" diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/tioga-build-and-test-extra.yml new file mode 100644 index 000000000..12ec55b82 --- /dev/null +++ b/.gitlab/tioga-build-and-test-extra.yml @@ -0,0 +1,39 @@ +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################# + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. + +# No overridden jobs so far. + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +# With GitLab CI, included files cannot be empty. +variables: + INCLUDED_FILE_CANNOT_BE_EMPTY: "True" + +# INFO: This job is activated in RAJA CI, but we don't use desul atomics here +#rocmcc_5_2_3_hip_desul_atomics: +# variables: +# SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@5.2.3 ^hip@5.2.3 ^blt@develop" +# extends: .build_and_test_on_tioga +# +# INFO: This job is activated in RAJA CI, but we may not want to run openmp here +#rocmcc_5_2_3_hip_openmp: +# variables: +# SPEC: "~shared +rocm +openmp +tests amdgpu_target=gfx90a %rocmcc@5.2.3 ^hip@5.2.3 ^blt@develop" +# extends: .build_and_test_on_tioga From fdfe899cf0be95cb96c22f9b2e7cffd832b7a67f Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" Date: Mon, 13 Mar 2023 21:38:25 +0100 Subject: [PATCH 068/137] Set Tioga CI variables --- .gitlab/custom-jobs-and-variables.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 5078b027a..f299e16ff 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -34,6 +34,16 @@ variables: # Project specific deps for corona PROJECT_CORONA_DEPS: "^blt@develop " +# Tioga +# Arguments for top level allocation + TIOGA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" +# Arguments for job level allocation + TIOGA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1" +# Project specific variants for corona + PROJECT_TIOGA_VARIANTS: "~openmp" +# Project specific deps for corona + PROJECT_TIOGA_DEPS: "^blt@develop " + # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation From a0c52aec61f299589ddd8e98ff122ee9f5ee75f0 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" Date: Mon, 13 Mar 2023 22:02:42 +0100 Subject: [PATCH 069/137] Add missing tioga logic to script --- scripts/gitlab/build_and_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index f05571e8d..f7501b7b6 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -147,7 +147,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" # Map CPU core allocations - declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32) + declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32) # If using Multi-project, set up the submodule if [[ -n ${raja_version} ]] @@ -173,7 +173,7 @@ then date - if [[ "${truehostname}" == "corona" ]] + if [[ "${truehostname}" == "corona" || "${truehostname}" == "tioga" ]] then module unload rocm fi From 7dd49f95bdd689173427f742a0bc18054564ea2e Mon Sep 17 00:00:00 2001 From: Doru Bercea Date: Tue, 14 Mar 2023 14:50:21 -0400 Subject: [PATCH 070/137] Disable warmup for unit testing. --- src/common/RunParams.cpp | 4 ++-- test/test-raja-perf-suite.cpp | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 56e3cf1f6..b4af80a20 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -533,7 +533,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = DryRun; } - } else if ( std::string(argv[i]) == std::string("--disable_warmup") ) { + } else if ( std::string(argv[i]) == std::string("--disable-warmup") ) { disable_warmup = true; @@ -701,7 +701,7 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --dryrun (print summary of how Suite will run without running it)\n\n"; - str << "\t --disable_warmup (disable warmup tests)\n\n"; + str << "\t --disable-warmup (disable warmup tests)\n\n"; str << "\t --checkrun [default is 1]\n" << "\t (run each kernel a given number of times; usually to check things are working properly or to reduce aggregate execution time)\n"; diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index ddb44a01e..9e59954f6 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -20,16 +20,16 @@ TEST(ShortSuiteTest, Basic) { // Assemble command line args for basic test - int argc = 4; + int argc = 5; #if defined(RAJA_ENABLE_HIP) && \ (HIP_VERSION_MAJOR < 5 || \ (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) - argc = 6; + argc = 7; #endif #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) - argc = 6; + argc = 7; #endif std::vector< std::string > sargv(argc); @@ -37,17 +37,18 @@ TEST(ShortSuiteTest, Basic) sargv[1] = std::string("--checkrun"); sargv[2] = std::string("3"); sargv[3] = std::string("--show-progress"); + sargv[4] = std::string("--disable-warmup"); #if defined(RAJA_ENABLE_HIP) && \ (HIP_VERSION_MAJOR < 5 || \ (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) - sargv[4] = std::string("--exclude-kernels"); - sargv[5] = std::string("HALOEXCHANGE_FUSED"); + sargv[5] = std::string("--exclude-kernels"); + sargv[6] = std::string("HALOEXCHANGE_FUSED"); #endif #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) - sargv[4] = std::string("--exclude-kernels"); - sargv[5] = std::string("FIRST_MIN"); + sargv[5] = std::string("--exclude-kernels"); + sargv[6] = std::string("FIRST_MIN"); #endif char** argv = new char* [argc]; From d2084e5cf052e0de116a79a7dd594ac047401ba0 Mon Sep 17 
00:00:00 2001 From: Rich Hornung Date: Wed, 15 Mar 2023 10:09:01 -0700 Subject: [PATCH 071/137] Remove unnecessary const qualifier that causes many compiler warnings. --- src/common/RunParams.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index ffbf49655..5a0b79274 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -196,7 +196,7 @@ class RunParams { const std::string& getOutputDirName() const { return outdir; } const std::string& getOutputFilePrefix() const { return outfile_prefix; } - const bool getDisableWarmup() const { return disable_warmup; } + bool getDisableWarmup() const { return disable_warmup; } //@} From 17ca360a50898ecd910bb3b314473b201515b136 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Mar 2023 11:21:53 -0700 Subject: [PATCH 072/137] Clarification and fix formatting --- docs/sphinx/dev_guide/kernel_class_impl.rst | 33 ++++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 6a414cbf1..017d06f8e 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -68,19 +68,20 @@ The methods in the source file are: * ``tearDown`` method, which deallocates and resets any data that will be re-allocated and/or initialized in subsequent kernel executions. -.. important:: The ``tearDown`` method frees and/or resets all kernel data that - is allocated and/or initialized in the ``setUp`` method. + .. important:: The ``tearDown`` method frees and/or resets all kernel + data that is allocated and/or initialized in the ``setUp`` + method. * ``updateChecksum`` method, which computes a checksum from the results of an execution of the kernel and adds it to the checksum value, which is a member of the ``KernelBase`` class, for the variant and tuning index that was run. -.. important:: The checksum must be computed in the same way for each variant - of a kernel so that checksums for different variants can be - compared to help identify differences, and potential errors in - implementations, compiler optimizations, programming model - execution, etc. + .. important:: The checksum must be computed in the same way for each + variant of a kernel so that checksums for different + variants can be compared to help identify differences, and + potential errors in implementations, compiler optimizations, + programming model execution, etc. The ``setUp``, ``tearDown``, and ``updateChecksum`` methods are called **each time a kernel variant is run**. We allocate and deallocate @@ -209,15 +210,19 @@ Notable differences with the sequential variant file are: * In addition to using the ``ADD_DATA_SETUP`` macro, which is also used in the sequential variant implementation file discussed above, we define two other macros, ``ADD_DATA_SETUP_CUDA`` and - ``ADD_DATA_TEARDOWN_CUDA``. These macros allocate GPU device data and - initialize it by copying host CPU data to it, and copy data back to the - host and deallocate the device data, respectively. + ``ADD_DATA_TEARDOWN_CUDA``. The first macro allocates GPU device data needed + to run a kernel and initialize the data by copying host CPU data to it. + After a kernel executes, the second macro copies data needed to compute a + checksum to the host and then deallocates the device data. * A CUDA GPU kernel ``add`` is implemented for the ``Base_CUDA`` variant. 
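As a rough illustration of the setup/teardown pattern described in the bullets above (a minimal sketch, not the verbatim ``ADD-Cuda.cpp`` contents; it assumes the Suite's CUDA data routines ``allocAndInitCudaData``, ``getCudaData``, and ``deallocCudaData`` from earlier in this patch series, and the member names ``m_a``, ``m_b``, ``m_c``, ``iend`` are illustrative):

    // Allocate device copies of the host arrays and initialize them by copying
    // the host data to the device.
    #define ADD_DATA_SETUP_CUDA \
      allocAndInitCudaData(a, m_a, iend); \
      allocAndInitCudaData(b, m_b, iend); \
      allocAndInitCudaData(c, m_c, iend);

    // Copy the result array needed for the checksum back to the host,
    // then free the device arrays.
    #define ADD_DATA_TEARDOWN_CUDA \
      getCudaData(m_c, c, iend); \
      deallocCudaData(a); \
      deallocCudaData(b); \
      deallocCudaData(c);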
- * The ``block_size`` template parameter for the ``ADD::runCudaVariantImpl`` - method represents the *tuning parameter*. + * The method to exjcute the CUDA kernel variants ``ADD::runCudaVariantImpl`` + is templated on a ``block_size`` parameter, which represents the + *tuning parameter*, and is passes to the kernel lauch methods. * The ``RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE`` macro is - used to generate different kernel tunings that use the GPU thread-block - sizes specified via command-line input mentioned in :ref:`build_build-label`. + used (outside the method implementation, to generate different kernel + tuning implementations at compile-time to run the GPU ``block_size`` + versions specified via command-line input mentioned in + :ref:`build_build-label`. .. important:: Following the established implementation patterns for kernels in the Suite help to ensure that the code is consistent, From 444f5d7cd76d17427ba88d495da8fe6d5a17eec6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Mar 2023 11:30:04 -0700 Subject: [PATCH 073/137] Another attempt to fix note formatting --- docs/sphinx/dev_guide/kernel_class_impl.rst | 24 ++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 017d06f8e..13da325df 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -65,23 +65,23 @@ The methods in the source file are: * ``setUp`` method, which allocates and initializes data required for the kernel to execute and produce results. - * ``tearDown`` method, which deallocates and resets any data that will be + * ``tearDown`` method, which deallocates and resets any data that will be re-allocated and/or initialized in subsequent kernel executions. - .. important:: The ``tearDown`` method frees and/or resets all kernel - data that is allocated and/or initialized in the ``setUp`` - method. + .. note:: The ``tearDown`` method frees and/or resets all kernel + data that is allocated and/or initialized in the ``setUp`` + method. * ``updateChecksum`` method, which computes a checksum from the results of - an execution of the kernel and adds it to the checksum value, which is a - member of the ``KernelBase`` class, for the variant and tuning index that + an execution of the kernel and adds it to the checksum value, which is a + member of the ``KernelBase`` class, for the variant and tuning index that was run. - .. important:: The checksum must be computed in the same way for each - variant of a kernel so that checksums for different - variants can be compared to help identify differences, and - potential errors in implementations, compiler optimizations, - programming model execution, etc. + .. note:: The checksum must be computed in the same way for each + variant of a kernel so that checksums for different + variants can be compared to help identify differences, and + potential errors in implementations, compiler optimizations, + programming model execution, etc. The ``setUp``, ``tearDown``, and ``updateChecksum`` methods are called **each time a kernel variant is run**. We allocate and deallocate @@ -202,7 +202,7 @@ kernel in the ``ADD-Cuda.cpp`` file are: Notable differences with the sequential variant file are: - * Most of the file is guarded using the ``RAJA_ENABLE_CUDA`` macro. + * Most of the file is guarded using the ``RAJA_ENABLE_CUDA`` macro. 
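A minimal sketch of the ``block_size`` tuning pattern described above (illustrative only, not the verbatim Suite source; it assumes the Suite's ``Real_ptr`` and ``Index_type`` types, the ``cudaErrchk`` error-checking macro, and the data setup/teardown macros shown earlier, and it omits the repetition/timing loop of the full implementation):

    // CUDA kernel templated on the thread-block size tuning parameter.
    template < size_t block_size >
    __launch_bounds__(block_size)
    __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, Index_type iend)
    {
      Index_type i = blockIdx.x * block_size + threadIdx.x;
      if (i < iend) {
        c[i] = a[i] + b[i];
      }
    }

    // Each instantiation of block_size is one tuning of the CUDA variant;
    // a boilerplate macro instantiates this method for every block size
    // requested via the build/run-time input.
    template < size_t block_size >
    void ADD::runCudaVariantImpl(VariantID vid)
    {
      (void) vid;  // the full implementation uses the variant id

      ADD_DATA_SETUP_CUDA;

      const size_t grid_size = (iend + block_size - 1) / block_size;
      add<block_size><<<grid_size, block_size>>>(c, a, b, iend);
      cudaErrchk( cudaGetLastError() );

      ADD_DATA_TEARDOWN_CUDA;
    }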
..note:: The contents of all non-sequential variant implementation files are guarded using the ``RAJA_ENABLE_`` macros. From b23d67b0a2e934f1fd22b43c054e5c0b28c818c1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Mar 2023 11:32:43 -0700 Subject: [PATCH 074/137] Fix more formatting. --- docs/sphinx/dev_guide/kernel_class.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst index 81b7d2ec5..5d544dd68 100644 --- a/docs/sphinx/dev_guide/kernel_class.rst +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -63,7 +63,7 @@ In its entirety, the **ADD** kernel class header file ``ADD.hpp`` is: The key ingredients of a kernel class header file are: - * **Copyright statement** at the top of the file. + * **Copyright statement** at the top of the file. .. note:: Each file in the RAJA Performance Suite must start with a boilerplate comment for the project copyright information. From ae3f1b7671e8caa109e932001a410b12d295271c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Mar 2023 11:50:45 -0700 Subject: [PATCH 075/137] Fix formatting. --- docs/sphinx/dev_guide/kernel_class_impl.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 13da325df..0229f791c 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -204,8 +204,8 @@ Notable differences with the sequential variant file are: * Most of the file is guarded using the ``RAJA_ENABLE_CUDA`` macro. - ..note:: The contents of all non-sequential variant implementation files - are guarded using the ``RAJA_ENABLE_`` macros. + .. note:: The contents of all non-sequential variant implementation files + are guarded using the ``RAJA_ENABLE_`` macros. * In addition to using the ``ADD_DATA_SETUP`` macro, which is also used in the sequential variant implementation file discussed above, we From 0132403131842fb83369c96504750ad5993c80cc Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Mar 2023 15:53:41 -0700 Subject: [PATCH 076/137] Minor doc cleanup; make method formatting more consistent in kernel base header --- docs/index.rst | 7 +-- docs/sphinx/user_guide/output.rst | 33 ++++++----- src/common/KernelBase.hpp | 98 ++++++++++++++++++++----------- 3 files changed, 85 insertions(+), 53 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 945124246..12ec445a5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -31,16 +31,15 @@ terminology used in the Suite implementation and discussion includes: * **Variant** refers to implementations of Suite kernels that share the same implementation approach and programming model, such as *baseline OpenMP*, *RAJA OpenMP*, etc. - * **Tuning** refers to an implementation of kernels with a particular - execution parameterization, such as GPU thread-block size 128, GPU - thread-block size 256, etc. + * **Tuning** refers to execution of kernels with a particular execution + parameterization, such as GPU thread-block size 128, 256, etc. The kernels in the Suite originate from various HPC benchmark suites and applications. For example, the "Stream" group contains kernels from the Babel Stream benchmark, the "Apps" group contains kernels extracted from real scientific computing applications, and so forth. 
Each kernel in the Suite appears in multiple RAJA and non-RAJA (i.e., *baseline*) variants that use -parallel programming models supported by RAJA. Some kernels have multiple +parallel programming models supported by RAJA. Some kernels support multiple tunings of a variant to explore the performance implications of options that a programming model supports. diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst index 858c456cb..edcbc9a57 100644 --- a/docs/sphinx/user_guide/output.rst +++ b/docs/sphinx/user_guide/output.rst @@ -91,9 +91,11 @@ Notes about *problem size* This section describes how the Suite calculates problem sizes and the rationale behind it. - * Problem size is always reported per process/MPI rank. To get the total - problem size across all ranks when running with MPI, multiply the problem - size by the number of MPI ranks. + * **The concept of problem size is subjective and can be interpreted + differently depending on the kernel structure and what one is trying to + measure.** For example, problem size could refer to the amount of data + needed to be stored in memory to run the problem, or it could refer to + the amount of parallel work that is possible, etc. * The Suite uses three notions of problem size for each kernel: *default*, *target*, and *actual*. Default is the problem size defined for a kernel and the size that is run if no run time options are provided to run a @@ -101,19 +103,18 @@ rationale behind it. settings and alterations to those if input is provided to change the default. Actual is the problem size that is run based on how each kernel calculates it based on defaults and run time input. - * **The concept of problem size is subjective and can be interpreted - differently depending on the kernel structure and what one is trying to - measure.** For example, problem size could refer to the amount of data - needed to be stored in memory to run the problem, or it could refer to - the amount of parallel work that is possible, etc. - * We employ the following, admittedly loose definition, which depends on the - particular kernel structure. Of all *loop structures* (e.g., single loop, - nested loops, etc.) that are run for a kernel (note that some kernels run - multiple loops, possibly with different sizes or loop structures), problem - size refers to the size of the data set required to generate the kernel - result. The interpretation of this and the definition of problem size for - each kernel in the suite is determined by the kernel developer and team - discussion. + * We employ an admittedly loose definition of problem size for each kernel, + which depends on the kernel structure. Of all *loop structures* + (e.g., single loop, nested loops, etc.) that are run for a kernel (note + that some kernels run multiple loops, possibly with different sizes or + loop structures), problem size refers to the size of the data set required + to generate the kernel result. The interpretation of this and the + definition of problem size for each kernel in the suite is determined by + the kernel developer and team discussion. + +.. note: Problem size is always reported per process/MPI rank. To get the total + problem size over all ranks when running with MPI, multiply the + problem size by the number of MPI ranks. Here are a few examples to give a better sense of how we determine problem size for various kernels in the Suite. 
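As a concrete instance of the per-rank reporting note above (numbers purely illustrative): a kernel whose actual problem size is reported as 1,000,000 and which is run on 4 MPI ranks represents an aggregate problem size of 4 * 1,000,000 = 4,000,000 across the whole run.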
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index d01a3f788..cec79e2eb 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -68,28 +68,34 @@ class KernelBase void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; } void setUsesFeature(FeatureID fid) { uses_feature[fid] = true; } + void setVariantDefined(VariantID vid); void addVariantTuningName(VariantID vid, std::string name) { variant_tuning_names[vid].emplace_back(std::move(name)); } virtual void setSeqTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } + #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) virtual void setOpenMPTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } #endif + #if defined(RAJA_ENABLE_CUDA) virtual void setCudaTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } #endif + #if defined(RAJA_ENABLE_HIP) virtual void setHipTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } #endif + #if defined(RAJA_ENABLE_TARGET_OPENMP) virtual void setOpenMPTargetTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } #endif + #if defined(RUN_KOKKOS) virtual void setKokkosTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } @@ -114,59 +120,69 @@ class KernelBase bool usesFeature(FeatureID fid) const { return uses_feature[fid]; }; bool hasVariantDefined(VariantID vid) const - { return !variant_tuning_names[vid].empty(); } + { return !variant_tuning_names[vid].empty(); } + bool hasVariantTuningDefined(VariantID vid, size_t tune_idx) const - { - if (hasVariantDefined(vid) && tune_idx < getNumVariantTunings(vid)) { - return true; - } - return false; + { + if (hasVariantDefined(vid) && tune_idx < getNumVariantTunings(vid)) { + return true; } - bool hasVariantTuningDefined(VariantID vid, std::string const& tuning_name) const - { - if (hasVariantDefined(vid)) { - for (std::string const& a_tuning_name : getVariantTuningNames(vid)) { - if (tuning_name == a_tuning_name) { return true; } - } + return false; + } + bool hasVariantTuningDefined(VariantID vid, + std::string const& tuning_name) const + { + if (hasVariantDefined(vid)) { + for (std::string const& a_tuning_name : getVariantTuningNames(vid)) { + if (tuning_name == a_tuning_name) { return true; } } - return false; } - size_t getVariantTuningIndex(VariantID vid, std::string const& tuning_name) const - { - std::vector const& tuning_names = getVariantTuningNames(vid); - for (size_t t = 0; t < tuning_names.size(); ++t) { - std::string const& a_tuning_name = tuning_names[t]; - if (tuning_name == a_tuning_name) { return t; } - } - return getUnknownTuningIdx(); + return false; + } + + size_t getVariantTuningIndex(VariantID vid, + std::string const& tuning_name) const + { + std::vector const& tuning_names = getVariantTuningNames(vid); + for (size_t t = 0; t < tuning_names.size(); ++t) { + std::string const& a_tuning_name = tuning_names[t]; + if (tuning_name == a_tuning_name) { return t; } } + return getUnknownTuningIdx(); + } + size_t getNumVariantTunings(VariantID vid) const - { return getVariantTuningNames(vid).size(); } + { return getVariantTuningNames(vid).size(); } std::string const& getVariantTuningName(VariantID vid, size_t tune_idx) const - { return getVariantTuningNames(vid).at(tune_idx); } + { return getVariantTuningNames(vid).at(tune_idx); } std::vector const& getVariantTuningNames(VariantID vid) const - { return 
variant_tuning_names[vid]; } + { return variant_tuning_names[vid]; } // // Methods to get information about kernel execution for reports // containing kernel execution information // bool wasVariantTuningRun(VariantID vid, size_t tune_idx) const - { - if (tune_idx != getUnknownTuningIdx()) { - return num_exec[vid].at(tune_idx) > 0; - } - return false; + { + if (tune_idx != getUnknownTuningIdx()) { + return num_exec[vid].at(tune_idx) > 0; } + return false; + } // get runtime of executed variant/tuning double getLastTime() const { return timer.elapsed(); } // get timers accumulated over npasses - double getMinTime(VariantID vid, size_t tune_idx) const { return min_time[vid].at(tune_idx); } - double getMaxTime(VariantID vid, size_t tune_idx) const { return max_time[vid].at(tune_idx); } - double getTotTime(VariantID vid, size_t tune_idx) { return tot_time[vid].at(tune_idx); } - Checksum_type getChecksum(VariantID vid, size_t tune_idx) const { return checksum[vid].at(tune_idx); } + double getMinTime(VariantID vid, size_t tune_idx) const + { return min_time[vid].at(tune_idx); } + double getMaxTime(VariantID vid, size_t tune_idx) const + { return max_time[vid].at(tune_idx); } + double getTotTime(VariantID vid, size_t tune_idx) const + { return tot_time[vid].at(tune_idx); } + + Checksum_type getChecksum(VariantID vid, size_t tune_idx) const + { return checksum[vid].at(tune_idx); } void execute(VariantID vid, size_t tune_idx); @@ -189,66 +205,77 @@ class KernelBase } int getDataAlignment() const { return run_params.getDataAlignment(); } + template void allocData(T*& ptr, int len, VariantID vid) { rajaperf::detail::allocData(ptr, len, getDataAlignment(), vid); } + template void allocAndInitData(T*& ptr, int len, VariantID vid) { rajaperf::detail::allocAndInitData(ptr, len, getDataAlignment(), vid); } + template void allocAndInitDataConst(T*& ptr, int len, T val, VariantID vid) { rajaperf::detail::allocAndInitDataConst(ptr, len, getDataAlignment(), val, vid); } + template void allocAndInitDataRandSign(T*& ptr, int len, VariantID vid) { rajaperf::detail::allocAndInitDataRandSign(ptr, len, getDataAlignment(), vid); } + template void allocAndInitDataRandValue(T*& ptr, int len, VariantID vid) { rajaperf::detail::allocAndInitDataRandValue(ptr, len, getDataAlignment(), vid); } + template void deallocData(T*& ptr, VariantID vid) { rajaperf::detail::deallocData(ptr, vid); } + template void initData(T*& ptr, int len, VariantID vid) { rajaperf::detail::initData(ptr, len, vid); } + template void initDataConst(T*& ptr, int len, T val, VariantID vid) { rajaperf::detail::initDataConst(ptr, len, val, vid); } + template void initDataRandSign(T*& ptr, int len, VariantID vid) { rajaperf::detail::initDataRandSign(ptr, len, vid); } + template void initDataRandValue(T*& ptr, int len, VariantID vid) { rajaperf::detail::initDataRandValue(ptr, len, vid); } + template void initData(T& d, VariantID vid) @@ -290,18 +317,23 @@ class KernelBase virtual void tearDown(VariantID vid, size_t tune_idx) = 0; virtual void runSeqVariant(VariantID vid, size_t tune_idx) = 0; + #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) virtual void runOpenMPVariant(VariantID vid, size_t tune_idx) = 0; #endif + #if defined(RAJA_ENABLE_CUDA) virtual void runCudaVariant(VariantID vid, size_t tune_idx) = 0; #endif + #if defined(RAJA_ENABLE_HIP) virtual void runHipVariant(VariantID vid, size_t tune_idx) = 0; #endif + #if defined(RAJA_ENABLE_TARGET_OPENMP) virtual void runOpenMPTargetVariant(VariantID vid, size_t tune_idx) = 0; #endif + #if 
defined(RUN_KOKKOS) virtual void runKokkosVariant(VariantID vid, size_t tune_idx) { From b77e28dd5458e209fe82f1f62310c35bd3b1803f Mon Sep 17 00:00:00 2001 From: Doru Bercea Date: Wed, 15 Mar 2023 20:33:19 -0400 Subject: [PATCH 077/137] Add support for single unit test selection. --- test/test-raja-perf-suite.cpp | 43 ++++++++++++++++------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 9e59954f6..8c29a42ff 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -20,44 +20,39 @@ TEST(ShortSuiteTest, Basic) { // Assemble command line args for basic test - int argc = 5; -#if defined(RAJA_ENABLE_HIP) && \ - (HIP_VERSION_MAJOR < 5 || \ - (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) - argc = 7; -#endif - -#if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) - argc = 7; -#endif - - std::vector< std::string > sargv(argc); - sargv[0] = std::string("dummy "); // for executable name - sargv[1] = std::string("--checkrun"); - sargv[2] = std::string("3"); - sargv[3] = std::string("--show-progress"); - sargv[4] = std::string("--disable-warmup"); + std::vector< std::string > sargv = {}; + sargv.emplace_back(std::string("dummy ")); // for executable name + sargv.emplace_back(std::string("--checkrun")); + sargv.emplace_back(std::string("3")); + sargv.emplace_back(std::string("--show-progress")); + sargv.emplace_back(std::string("--disable-warmup")); #if defined(RAJA_ENABLE_HIP) && \ (HIP_VERSION_MAJOR < 5 || \ (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) - sargv[5] = std::string("--exclude-kernels"); - sargv[6] = std::string("HALOEXCHANGE_FUSED"); + sargv.emplace_back(std::string("--exclude-kernels")); + sargv.emplace_back(std::string("HALOEXCHANGE_FUSED")); #endif #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) - sargv[5] = std::string("--exclude-kernels"); - sargv[6] = std::string("FIRST_MIN"); + sargv.emplace_back(std::string("--exclude-kernels")); + sargv.emplace_back(std::string("FIRST_MIN")); #endif - char** argv = new char* [argc]; - for (int is = 0; is < argc; ++is) { + char *unit_test = getenv("RAJA_PERF_UNIT_TEST"); + if (unit_test != NULL) { + sargv.emplace_back(std::string("-k")); + sargv.emplace_back(std::string(unit_test)); + } + + char** argv = new char* [sargv.size()]; + for (int is = 0; is < sargv.size(); ++is) { argv[is] = const_cast(sargv[is].c_str()); } // STEP 1: Create suite executor object with input args defined above - rajaperf::Executor executor(argc, argv); + rajaperf::Executor executor(sargv.size(), argv); // STEP 2: Assemble kernels and variants to run executor.setupSuite(); From b77953f6e56eef06fcd93ffa37134f54d7bd278e Mon Sep 17 00:00:00 2001 From: Gheorghe-Teodor Bercea Date: Thu, 16 Mar 2023 12:24:31 -0400 Subject: [PATCH 078/137] Update test/test-raja-perf-suite.cpp Co-authored-by: Jason Burmark --- test/test-raja-perf-suite.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 8c29a42ff..8b4552cde 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -21,7 +21,7 @@ TEST(ShortSuiteTest, Basic) // Assemble command line args for basic test - std::vector< std::string > sargv = {}; + std::vector< std::string > sargv{}; sargv.emplace_back(std::string("dummy ")); // for executable name sargv.emplace_back(std::string("--checkrun")); sargv.emplace_back(std::string("3")); From 02d21537f1a89f22a2bf3118674f19aaa6174c71 Mon 
Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 20 Mar 2023 13:03:56 -0700 Subject: [PATCH 079/137] rename teams to launch --- src/apps/CONVECTION3DPA.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 2 +- src/apps/MASS3DPA.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 2 +- src/common/RAJAPerfSuite.cpp | 2 +- src/common/RAJAPerfSuite.hpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index 46eb341af..dc4823482 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -50,7 +50,7 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params) (1 + 2*CPA_Q1D) * CPA_D1D * CPA_D1D * CPA_D1D // 8 )); - setUsesFeature(Teams); + setUsesFeature(Launch); setVariantDefined( Base_Seq ); setVariantDefined( RAJA_Seq ); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 3b3a2cb31..1f78cafe3 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -51,7 +51,7 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) 7 * DPA_D1D * DPA_Q1D * DPA_D1D * DPA_D1D + 3 * DPA_D1D * DPA_D1D * DPA_D1D)); - setUsesFeature(Teams); + setUsesFeature(Launch); setVariantDefined( Base_Seq ); setVariantDefined( RAJA_Seq ); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 107f74408..a70e98847 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -47,7 +47,7 @@ MASS3DPA::MASS3DPA(const RunParams& params) 2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D + 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D + 2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D)); - setUsesFeature(Teams); + setUsesFeature(Launch); setVariantDefined( Base_Seq ); setVariantDefined( RAJA_Seq ); diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index afbaaf0c2..9f78394cf 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -43,7 +43,7 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) getActualProblemSize() ); - setUsesFeature(Teams); + setUsesFeature(Launch); setVariantDefined(Base_Seq); setVariantDefined(Lambda_Seq); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 27650cf56..da6683907 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -302,7 +302,7 @@ static const std::string FeatureNames [] = std::string("Forall"), std::string("Kernel"), - std::string("Teams"), + std::string("Launch"), std::string("Sort"), std::string("Scan"), diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index e73bd9888..1f4da8c9b 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -217,7 +217,7 @@ enum FeatureID { Forall = 0, Kernel, - Teams, + Launch, Sort, Scan, From 5041afaf41a6f8576c6639ba5cc2d4dc36be5bd4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 15:46:03 -0700 Subject: [PATCH 080/137] Add DataSpace per backend --- src/common/KernelBase.hpp | 12 +-- src/common/RAJAPerfSuite.cpp | 2 +- src/common/RAJAPerfSuite.hpp | 11 +-- src/common/RunParams.cpp | 150 +++++++++++++++++++---------------- src/common/RunParams.hpp | 20 +++-- 5 files changed, 103 insertions(+), 92 deletions(-) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 61eadd495..80c247057 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -208,12 +208,12 @@ class KernelBase int getDataAlignment() const { return run_params.getDataAlignment(); } - DataSpace getSeqDataSpace() const { return DataSpace::Host; } // 
run_params.getSeqDataSpace(); - DataSpace getOmpDataSpace() const { return DataSpace::Omp; } // run_params.getOmpDataSpace(); - DataSpace getOmpTargetDataSpace() const { return DataSpace::OmpTarget; } // run_params.getOmpTargetDataSpace(); - DataSpace getCudaDataSpace() const { return DataSpace::CudaDevice; } // run_params.getCudaDataSpace(); - DataSpace getHipDataSpace() const { return DataSpace::HipDevice; } // run_params.getHipDataSpace(); - DataSpace getKokkosDataSpace() const { return DataSpace::Host; } // run_params.getKokkosDataSpace(); + DataSpace getSeqDataSpace() const { return run_params.getSeqDataSpace(); } + DataSpace getOmpDataSpace() const { return run_params.getOmpDataSpace(); } + DataSpace getOmpTargetDataSpace() const { return run_params.getOmpTargetDataSpace(); } + DataSpace getCudaDataSpace() const { return run_params.getCudaDataSpace(); } + DataSpace getHipDataSpace() const { return run_params.getHipDataSpace(); } + DataSpace getKokkosDataSpace() const { return run_params.getKokkosDataSpace(); } DataSpace getDataSpace(VariantID vid) const { diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index ec0aedb8e..59c1e11cf 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -550,7 +550,7 @@ const std::string& getFeatureName(FeatureID fid) /* ******************************************************************************* * - * Return memory space name associated with CudaDataSpace enum value. + * Return memory space name associated with DataSpace enum value. * ******************************************************************************* */ diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 1ff430f66..3df084351 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -350,16 +350,7 @@ const std::string& getFeatureName(FeatureID vid); * ******************************************************************************* */ -const std::string& getCudaDataSpaceName(CudaDataSpace cd); - -/*! - ******************************************************************************* - * - * \brief Return memory space name associated with HipDataSpace enum value. - * - ******************************************************************************* - */ -const std::string& getHipDataSpaceName(HipDataSpace hd); +const std::string& getDataSpaceName(DataSpace cd); /*! 
******************************************************************************* diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 71c7dc7f6..7ee741e0d 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -113,8 +113,12 @@ void RunParams::print(std::ostream& str) const str << "\n disable_warmup = " << disable_warmup; - str << "\n cuda memory space = " << getCudaDataSpaceName(cudaDataSpace); - str << "\n hip memory space = " << getHipDataSpaceName(hipDataSpace); + str << "\n seq data space = " << getDataSpaceName(seqDataSpace); + str << "\n omp data space = " << getDataSpaceName(ompDataSpace); + str << "\n omp target data space = " << getDataSpaceName(ompTargetDataSpace); + str << "\n cuda data space = " << getDataSpaceName(cudaDataSpace); + str << "\n hip data space = " << getDataSpaceName(hipDataSpace); + str << "\n kokkos data space = " << getDataSpaceName(kokkosDataSpace); str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { @@ -213,16 +217,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) printVariantNames(getCout()); input_state = InfoRequest; - } else if ( opt == std::string("--print-cuda_memory_spaces") || - opt == std::string("-pcms") ) { + } else if ( opt == std::string("--print-data-spaces") || + opt == std::string("-pds") ) { - printCudaDataSpaceNames(getCout()); - input_state = InfoRequest; - - } else if ( opt == std::string("--print-hip_memory_spaces") || - opt == std::string("-pcms") ) { - - printHipDataSpaceNames(getCout()); + printDataSpaceNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-features") || @@ -471,8 +469,18 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } - } else if ( opt == std::string("--cuda_memory_space") || - opt == std::string("-cms") ) { + } else if ( opt == std::string("--seq-data-space") || + opt == std::string("-sds") || + opt == std::string("--omp-data-space") || + opt == std::string("-ods") || + opt == std::string("--omptarget-data-space") || + opt == std::string("-otds") || + opt == std::string("--cuda-data-space") || + opt == std::string("-cds") || + opt == std::string("--hip-data-space") || + opt == std::string("-hds") || + opt == std::string("--kokkos-data-space") || + opt == std::string("-kds") ) { bool got_someting = false; i++; @@ -481,42 +489,37 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt.at(0) == '-' ) { i--; } else { - for (int cms = 0; cms < static_cast(CudaDataSpace::NumSpaces); ++cms) { - if (getCudaDataSpaceName(static_cast(cms)) == opt) { - cudaDataSpace = static_cast(cms); + for (int ds = 0; ds < static_cast(DataSpace::NumSpaces); ++ds) { + if (getDataSpaceName(static_cast(ds)) == opt) { got_someting = true; - break; - } - } - if (!got_someting) { - getCout() << "\nBad input:" - << " must give --cuda_memory_space a valid cuda memory space" - << std::endl; - input_state = BadInput; - } - } - } + if ( opt == std::string("--seq-data-space") || + opt == std::string("-sds") ) { + seqDataSpace = static_cast(ds); + } else if ( opt == std::string("--omp-data-space") || + opt == std::string("-ods") ) { + ompDataSpace = static_cast(ds); + } else if ( opt == std::string("--omptarget-data-space") || + opt == std::string("-otds") ) { + ompTargetDataSpace = static_cast(ds); + } else if ( opt == std::string("--cuda-data-space") || + opt == std::string("-cds") ) { + cudaDataSpace = static_cast(ds); + } else if ( opt == std::string("--hip-data-space") || + opt == 
std::string("-hds") ) { + hipDataSpace = static_cast(ds); + } else if ( opt == std::string("--kokkos-data-space") || + opt == std::string("-kds") ) { + kokkosDataSpace = static_cast(ds); + } else { + got_someting = false; + } - } else if ( opt == std::string("--hip_memory_space") || - opt == std::string("-hms") ) { - - bool got_someting = false; - i++; - if ( i < argc ) { - opt = std::string(argv[i]); - if ( opt.at(0) == '-' ) { - i--; - } else { - for (int hms = 0; hms < static_cast(HipDataSpace::NumSpaces); ++hms) { - if (getHipDataSpaceName(static_cast(hms)) == opt) { - hipDataSpace = static_cast(hms); - got_someting = true; break; } } if (!got_someting) { getCout() << "\nBad input:" - << " must give --hip_memory_space a valid hip memory space" + << " must give " << opt << " a valid data space" << std::endl; input_state = BadInput; } @@ -657,9 +660,7 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --print-variants, -pv (print names of available variants to run)\n\n"; - str << "\t --print-cuda_memory_spaces, -pcms (print names of cuda memory spaces)\n\n"; - - str << "\t --print-hip_memory_spaces, -pcms (print names of hip memory spaces)\n\n"; + str << "\t --print-data-spaces, -pds (print names of data spaces)\n\n"; str << "\t --print-features, -pf (print names of RAJA features exercised in Suite)\n\n"; @@ -741,17 +742,41 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t --exclude-variants RAJA_CUDA (exclude all RAJA_CUDA kernel variants)\n" << "\t\t -ev Base_Seq RAJA_CUDA (exclude Base_Seq and RAJA_CUDA variants)\n\n"; - str << "\t --cuda_memory_space, -cms [Default is Device]\n" - << "\t (names of memory space to use)\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" + << "\t (names of data space to use)\n"; + str << "\t\t Examples...\n" + << "\t\t --seq-data-space Host (run sequential variants with Host memory)\n" + << "\t\t -sds CudaPinned (run sequential variants with Cuda Pinned memory)\n\n"; + + str << "\t --omp-data-space, -ods [Default is Omp]\n" + << "\t (names of data space to use)\n"; + str << "\t\t Examples...\n" + << "\t\t --omp-data-space Omp (run Omp variants with Omp memory)\n" + << "\t\t -ods Host (run Omp variants with Host memory)\n\n"; + + str << "\t --omptarget-data-space, -otds [Default is OmpTarget]\n" + << "\t (names of data space to use)\n"; + str << "\t\t Examples...\n" + << "\t\t --omptarget-data-space OmpTarget (run Omp Target variants with Omp Target memory)\n" + << "\t\t -otds CudaPinned (run Omp Target variants with Cuda Pinned memory)\n\n"; + + str << "\t --cuda-data-space, -cds [Default is CudaDevice]\n" + << "\t (names of data space to use)\n"; str << "\t\t Examples...\n" - << "\t\t --cuda_memory_space Managed (run CUDA kernels with Managed memory)\n" - << "\t\t -cms Pinned (run CUDA kernels with Pinned memory)\n\n"; + << "\t\t --cuda-data-space CudaManaged (run CUDA variants with Cuda Managed memory)\n" + << "\t\t -cds CudaPinned (run CUDA variants with Cuda Pinned memory)\n\n"; - str << "\t --hip_memory_space, -hms [Default is Device]\n" - << "\t (names of memory space to use)\n"; + str << "\t --hip-data-space, -hds [Default is HipDevice]\n" + << "\t (names of data space to use)\n"; str << "\t\t Examples...\n" - << "\t\t --hip_memory_space Managed (run HIP kernels with Managed memory)\n" - << "\t\t -hms Pinned (run HIP kernels with Pinned memory)\n\n"; + << "\t\t --hip-data-space HipManaged (run HIP variants with Hip Managed memory)\n" + << "\t\t -hds HipPinned (run HIP variants with Hip Pinned 
memory)\n\n"; + + str << "\t --kokkos-data-space, -kds [Default is Host]\n" + << "\t (names of data space to use)\n"; + str << "\t\t Examples...\n" + << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" + << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; str << "\t --features, -f [Default is run all]\n" << "\t (names of features to run)\n"; @@ -835,23 +860,12 @@ void RunParams::printVariantNames(std::ostream& str) const } -void RunParams::printCudaDataSpaceNames(std::ostream& str) const -{ - str << "\nAvailable cuda memory spaces:"; - str << "\n-------------------\n"; - for (int cms = 0; cms < static_cast(CudaDataSpace::NumSpaces); ++cms) { - str << getCudaDataSpaceName(static_cast(cms)) << std::endl; - } - str.flush(); -} - - -void RunParams::printHipDataSpaceNames(std::ostream& str) const +void RunParams::printDataSpaceNames(std::ostream& str) const { - str << "\nAvailable hip memory spaces:"; + str << "\nAvailable data spaces:"; str << "\n-------------------\n"; - for (int hms = 0; hms < static_cast(HipDataSpace::NumSpaces); ++hms) { - str << getHipDataSpaceName(static_cast(hms)) << std::endl; + for (int ds = 0; ds < static_cast(DataSpace::NumSpaces); ++ds) { + str << getDataSpaceName(static_cast(ds)) << std::endl; } str.flush(); } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 6da9263d6..2c81a4f70 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -138,9 +138,12 @@ class RunParams { return false; } - CudaDataSpace getCudaDataSpace() const { return cudaDataSpace; } - - HipDataSpace getHipDataSpace() const { return hipDataSpace; } + DataSpace getSeqDataSpace() const { return seqDataSpace; } + DataSpace getOmpDataSpace() const { return ompDataSpace; } + DataSpace getOmpTargetDataSpace() const { return ompTargetDataSpace; } + DataSpace getCudaDataSpace() const { return cudaDataSpace; } + DataSpace getHipDataSpace() const { return hipDataSpace; } + DataSpace getKokkosDataSpace() const { return kokkosDataSpace; } double getPFTolerance() const { return pf_tol; } @@ -220,8 +223,7 @@ class RunParams { void printFullKernelNames(std::ostream& str) const; void printKernelNames(std::ostream& str) const; void printVariantNames(std::ostream& str) const; - void printCudaDataSpaceNames(std::ostream& str) const; - void printHipDataSpaceNames(std::ostream& str) const; + void printDataSpaceNames(std::ostream& str) const; void printGroupNames(std::ostream& str) const; void printFeatureNames(std::ostream& str) const; void printFeatureKernels(std::ostream& str) const; @@ -253,8 +255,12 @@ class RunParams { std::string reference_variant; /*!< Name of reference variant for speedup calculations */ - CudaDataSpace cudaDataSpace = CudaDataSpace::Device; - HipDataSpace hipDataSpace = HipDataSpace::Device; + DataSpace seqDataSpace = DataSpace::Host; + DataSpace ompDataSpace = DataSpace::Omp; + DataSpace ompTargetDataSpace = DataSpace::OmpTarget; + DataSpace cudaDataSpace = DataSpace::CudaDevice; + DataSpace hipDataSpace = DataSpace::HipDevice; + DataSpace kokkosDataSpace = DataSpace::Host; // // Arrays to hold input strings for valid/invalid input. 
Helpful for From 9fd7d2c7af6b0e2adc510558d4fcea2ddb734924 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 15:46:52 -0700 Subject: [PATCH 081/137] Fixup memory routines in KernelBase --- src/common/CudaDataUtils.hpp | 107 ++----- src/common/DataUtils.cpp | 15 +- src/common/DataUtils.hpp | 407 ++++++++++++++++----------- src/common/HipDataUtils.hpp | 162 ++--------- src/common/KernelBase.hpp | 91 ++---- src/common/OpenMPTargetDataUtils.hpp | 36 ++- 6 files changed, 348 insertions(+), 470 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 261da7a1c..b45d1ec79 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -28,6 +28,9 @@ namespace rajaperf { +namespace detail +{ + /*! * \brief Device timer, returns a time in ns from an arbitrary starting point. * Note that this time is consistent across the whole device. @@ -111,6 +114,17 @@ __device__ inline Index_type lambda_cuda_get_index() return blockIdx.z; } + +/* + * Copy memory len bytes from src to dst. + */ +inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid) +{ + (void)vid; + cudaErrchk( cudaMemcpy( dst_ptr, src_ptr, len, + cudaMemcpyDefault ) ); +} + /*! * \brief Copy given hptr (host) data to CUDA device (dptr). * @@ -214,94 +228,6 @@ void deallocCudaPinnedData(T& pptr) pptr = nullptr; } - -/*! - * \brief Copy given hptr (host) data to CUDA (cptr). - * - * Method assumes both host and device data arrays are allocated - * and of proper size for copy operation to succeed. - */ -template -void initCudaData(CudaDataSpace, T& cptr, const T hptr, int len) -{ - cudaErrchk( cudaMemcpy( cptr, hptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyDefault ) ); - - incDataInitCount(); -} - -/*! - * \brief Allocate CUDA data array (cptr). - */ -template -void allocCudaData(CudaDataSpace cudaDataSpace, T& cptr, int len) -{ - switch (cudaDataSpace) { - case CudaDataSpace::Host: - { - allocData(cptr, len); - } break; - case CudaDataSpace::Pinned: - { - allocCudaPinnedData(cptr, len); - } break; - case CudaDataSpace::Managed: - { - allocCudaManagedData(cptr, len); - } break; - case CudaDataSpace::Device: - { - allocCudaDeviceData(cptr, len); - } break; - default: - { - throw std::invalid_argument("allocCudaData : Unknown memory type"); - } break; - } -} - -/*! - * \brief Allocate CUDA data array (cptr) and copy given hptr (host) - * data to CUDA array. - */ -template -void allocAndInitCudaData(CudaDataSpace cudaDataSpace, T& cptr, const T hptr, int len) -{ - allocCudaData(cudaDataSpace, cptr, len); - initCudaData(cudaDataSpace, cptr, hptr, len); -} - -/*! - * \brief Free Cuda data array. - */ -template -void deallocCudaData(CudaDataSpace cudaDataSpace, T& cptr) -{ - switch (cudaDataSpace) { - case CudaDataSpace::Host: - { - deallocData(cptr); - } break; - case CudaDataSpace::Pinned: - { - deallocCudaPinnedData(cptr); - } break; - case CudaDataSpace::Managed: - { - deallocCudaManagedData(cptr); - } break; - case CudaDataSpace::Device: - { - deallocCudaDeviceData(cptr); - } break; - default: - { - throw std::invalid_argument("deallocCudaData : Unknown memory type"); - } break; - } -} - /*! * \brief Copy given cptr (CUDA) data to host (hptr). * @@ -309,13 +235,16 @@ void deallocCudaData(CudaDataSpace cudaDataSpace, T& cptr) * and of propoer size for copy operation to succeed. 
*/ template -void getCudaData(CudaDataSpace, T& hptr, const T cptr, int len) +void getCudaData(T& hptr, const T cptr, int len) { cudaErrchk( cudaMemcpy( hptr, cptr, len * sizeof(typename std::remove_pointer::type), cudaMemcpyDefault ) ); } + +} // closing brace for detail namespace + } // closing brace for rajaperf namespace #endif // RAJA_ENABLE_CUDA diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index dd051d2ce..cdc67a7e5 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -14,6 +14,7 @@ #include "RAJA/internal/MemUtils_CPU.hpp" #include +#include #include namespace rajaperf @@ -40,6 +41,14 @@ void incDataInitCount() data_init_count++; } +/* + * Copy memory len bytes from src to dst. + */ +void copyHostData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid) +{ + (void)vid; + std::memcpy(dst_ptr, src_ptr, len); +} /* * Allocate and initialize aligned data arrays. @@ -364,7 +373,7 @@ void initData(Real_type& d, VariantID vid) /* * Calculate and return checksum for data arrays. */ -long double calcChecksum(const Int_ptr ptr, int len, +long double calcChecksum(Int_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; @@ -386,7 +395,7 @@ long double calcChecksum(const Int_ptr ptr, int len, return tchk; } -long double calcChecksum(const Real_ptr ptr, int len, +long double calcChecksum(Real_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; @@ -408,7 +417,7 @@ long double calcChecksum(const Real_ptr ptr, int len, return tchk; } -long double calcChecksum(const Complex_ptr ptr, int len, +long double calcChecksum(Complex_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index b34cb3e07..0d81d12d1 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -16,6 +16,9 @@ #include "RAJAPerfSuite.hpp" #include "RPTypes.hpp" +#include "common/OpenMPTargetDataUtils.hpp" +#include "common/CudaDataUtils.hpp" +#include "common/HipDataUtils.hpp" #include #include @@ -44,6 +47,7 @@ void resetDataInitCount(); */ void incDataInitCount(); +void copyHostData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid); /*! * \brief Allocate data arrays. @@ -141,13 +145,13 @@ void initData(Real_type& d, * Checksumn is multiplied by given scale factor. */ long double calcChecksum(Int_ptr d, int len, - Real_type scale_factor = 1.0); + Real_type scale_factor); /// long double calcChecksum(Real_ptr d, int len, - Real_type scale_factor = 1.0); + Real_type scale_factor); /// long double calcChecksum(Complex_ptr d, int len, - Real_type scale_factor = 1.0); + Real_type scale_factor); /*! * \brief Get an host accessible data space for this dataSpace. @@ -155,125 +159,204 @@ long double calcChecksum(Complex_ptr d, int len, * Intended to be a space that is quick to copy to from the given space if * the given space is not accessible on the Host. 
*/ -DataSpace hostAccessibleDataSpace(DataSpace dataSpace) +inline DataSpace hostAccessibleDataSpace(DataSpace dataSpace) { switch (dataSpace) { - case dataSpace::Host: - case dataSpace::Omp: - case dataSpace::CudaPinned: - case dataSpace::HipHostAdviseFine: - case dataSpace::HipHostAdviseCoarse: - case dataSpace::HipPinned: - case dataSpace::HipPinnedFine: - case dataSpace::HipPinnedCoarse: + case DataSpace::Host: + case DataSpace::Omp: + case DataSpace::CudaPinned: + case DataSpace::HipHostAdviseFine: + case DataSpace::HipHostAdviseCoarse: + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: return dataSpace; - case dataSpace::OmpTarget: - return dataSpace::Host; + case DataSpace::OmpTarget: + return DataSpace::Host; - case dataSpace::CudaManaged: - case dataSpace::CudaDevice: - return dataSpace::CudaPinned; + case DataSpace::CudaManaged: + case DataSpace::CudaDevice: + return DataSpace::CudaPinned; - case dataSpace::HipManaged: - case dataSpace::HipManagedAdviseFine: - case dataSpace::HipManagedAdviseCoarse: + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseCoarse: return dataSpace; - case dataSpace::HipDevice: - case dataSpace::HipDeviceFine: - return dataSpace::HipPinned; + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + return DataSpace::HipPinned; default: { - throw std::invalid_argument("accessibleDataSpace : Unknown memory type"); + throw std::invalid_argument("hostAccessibleDataSpace : Unknown data space"); } break; } } -} // closing brace for detail namespace +/*! + * \brief Get if the data space is a host DataSpace. + */ +inline bool isHostDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::Host: + return true; + default: + return false; + } +} + +/*! + * \brief Get if the data space is a omp DataSpace. + */ +inline bool isOpenMPDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::Omp: + return true; + default: + return false; + } +} + +/*! + * \brief Get if the data space is a omp target DataSpace. + */ +inline bool isOpenMPTargetDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::OmpTarget: + return true; + default: + return false; + } +} + +/*! + * \brief Get if the data space is a cuda DataSpace. + */ +inline bool isCudaDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::CudaPinned: + case DataSpace::CudaManaged: + case DataSpace::CudaDevice: + return true; + default: + return false; + } +} +/*! + * \brief Get if the data space is a hip DataSpace. + */ +inline bool isHipDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::HipHostAdviseFine: + case DataSpace::HipHostAdviseCoarse: + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseCoarse: + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + return true; + default: + return false; + } +} /*! * \brief Allocate data array (ptr). 
*/ template -void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID vid) +inline void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID vid) { switch (dataSpace) { - case dataSpace::Host: - case dataSpace::Omp: + case DataSpace::Host: + case DataSpace::Omp: { allocHostData(ptr, len, align, vid); } break; - case dataSpace::OmpTarget: +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case DataSpace::OmpTarget: { - allocOpenMPDeviceData(ptr, len, did); + allocOpenMPDeviceData(ptr, len); } break; +#endif - case dataSpace::CudaPinned: +#if defined(RAJA_ENABLE_CUDA) + case DataSpace::CudaPinned: { allocCudaPinnedData(ptr, len, vid); } break; - case dataSpace::CudaManaged: + case DataSpace::CudaManaged: { allocCudaManagedData(ptr, len, vid); } break; - case dataSpace::CudaDevice: + case DataSpace::CudaDevice: { allocCudaDeviceData(ptr, len, vid); } break; +#endif - case dataSpace::HipHostAdviseFine: +#if defined(RAJA_ENABLE_HIP) + case DataSpace::HipHostAdviseFine: { allocHostData(ptr, len, align, vid); adviseHipFineData(ptr, len, vid); } break; - case dataSpace::HipHostAdviseCoarse: + case DataSpace::HipHostAdviseCoarse: { allocHostData(ptr, len, align, vid); adviseHipCoarseData(ptr, len, vid); } break; - case dataSpace::HipPinned: + case DataSpace::HipPinned: { allocHipPinnedData(ptr, len, vid); } break; - case dataSpace::HipPinnedFine: + case DataSpace::HipPinnedFine: { allocHipPinnedFineData(ptr, len, vid); } break; - case dataSpace::HipPinnedCoarse: + case DataSpace::HipPinnedCoarse: { allocHipPinnedCoarseData(ptr, len, vid); } break; - case dataSpace::HipManaged: + case DataSpace::HipManaged: { allocHipManagedData(ptr, len, vid); } break; - case dataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseFine: { allocHipManagedData(ptr, len, vid); adviseHipFineData(ptr, len, vid); } break; - case dataSpace::HipManagedAdviseCoarse: + case DataSpace::HipManagedAdviseCoarse: { allocHipManagedData(ptr, len, vid); adviseHipCoarseData(ptr, len, vid); } break; - case dataSpace::HipDevice: + case DataSpace::HipDevice: { allocHipDeviceData(ptr, len, vid); } break; - case dataSpace::HipDeviceFine: + case DataSpace::HipDeviceFine: { allocHipDeviceFineData(ptr, len, vid); } break; +#endif default: { - throw std::invalid_argument("allocData : Unknown memory type"); + throw std::invalid_argument("allocData : Unknown data space"); } break; } } @@ -282,65 +365,71 @@ void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID vid) * \brief Deallocate data array (ptr). 
*/ template -void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) +inline void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) { switch (dataSpace) { - case dataSpace::Host: - case dataSpace::Omp: - case dataSpace::HipHostAdviseFine: - case dataSpace::HipHostAdviseCoarse: + case DataSpace::Host: + case DataSpace::Omp: + case DataSpace::HipHostAdviseFine: + case DataSpace::HipHostAdviseCoarse: { deallocHostData(ptr, vid); } break; - case dataSpace::OmpTarget: +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case DataSpace::OmpTarget: { - deallocOpenMPDeviceData(ptr, did); + deallocOpenMPDeviceData(ptr); } break; +#endif - case dataSpace::CudaPinned: +#if defined(RAJA_ENABLE_CUDA) + case DataSpace::CudaPinned: { deallocCudaPinnedData(ptr, vid); } break; - case dataSpace::CudaManaged: + case DataSpace::CudaManaged: { deallocCudaManagedData(ptr, vid); } break; - case dataSpace::CudaDevice: + case DataSpace::CudaDevice: { deallocCudaDeviceData(ptr, vid); } break; +#endif - case dataSpace::HipPinned: +#if defined(RAJA_ENABLE_HIP) + case DataSpace::HipPinned: { deallocHipPinnedData(ptr, vid); } break; - case dataSpace::HipPinnedFine: + case DataSpace::HipPinnedFine: { deallocHipPinnedFineData(ptr, vid); } break; - case dataSpace::HipPinnedCoarse: + case DataSpace::HipPinnedCoarse: { deallocHipPinnedCoarseData(ptr, vid); } break; - case dataSpace::HipManaged: - case dataSpace::HipManagedAdviseFine: - case dataSpace::HipManagedAdviseCoarse: + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseCoarse: { deallocHipManagedData(ptr, vid); } break; - case dataSpace::HipDevice: + case DataSpace::HipDevice: { deallocHipDeviceData(ptr, vid); } break; - case dataSpace::HipDeviceFine: + case DataSpace::HipDeviceFine: { deallocHipDeviceFineData(ptr, vid); } break; +#endif default: { - throw std::invalid_argument("deallocData : Unknown memory type"); + throw std::invalid_argument("deallocData : Unknown data space"); } break; } } @@ -349,48 +438,43 @@ void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) * \brief Copy data from one array to another. */ template -void copyData(DataSpace dst_dataSpace, T* dst_ptr, - DataSpace src_dataSpace, const T* src_ptr, - int len, VariantID vid) +inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, + DataSpace src_dataSpace, const T* src_ptr, + int len, VariantID vid) { + if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && + hostAccessibleDataSpace(src_dataSpace) == src_dataSpace) { + copyHostData(dst_ptr, src_ptr, sizeof(T)*len, vid); + } - switch (dst_dataSpace) { - case dataSpace::Host: - case dataSpace::Omp: - case dataSpace::HipHostAdviseFine: - case dataSpace::HipHostAdviseCoarse: - { - copyHostData(dst_ptr, src_ptr, sizeof(T)*len, vid); - } break; - - case dataSpace::OmpTarget: - { - copyOpenMPDeviceData(dst_ptr, src_ptr, sizeof(T)*len, did); - } break; +#if defined(RAJA_ENABLE_TARGET_OPENMP) + else if (isOpenMPTargetDataSpace(dst_dataSpace) || + isOpenMPTargetDataSpace(src_dataSpace)) { + auto dst_did = isOpenMPTargetDataSpace(dst_dataSpace) ? omp_get_default_device() + : omp_get_initial_device(); + auto src_did = isOpenMPTargetDataSpace(src_dataSpace) ? 
omp_get_default_device() + : omp_get_initial_device(); + copyOpenMPTargetData(dst_ptr, src_ptr, sizeof(T)*len, vid, + dst_did, src_did); + } +#endif - case dataSpace::CudaPinned: - case dataSpace::CudaManaged: - case dataSpace::CudaDevice: - { - copyCudaData(dst_ptr, src_ptr, sizeof(T)*len, vid); - } break; +#if defined(RAJA_ENABLE_CUDA) + else if (isCudaDataSpace(dst_dataSpace) || + isCudaDataSpace(src_dataSpace)) { + copyCudaData(dst_ptr, src_ptr, sizeof(T)*len, vid); + } +#endif - case dataSpace::HipPinned: - case dataSpace::HipPinnedFine: - case dataSpace::HipPinnedCoarse: - case dataSpace::HipManaged: - case dataSpace::HipManagedAdviseFine: - case dataSpace::HipManagedAdviseCoarse: - case dataSpace::HipDevice: - case dataSpace::HipDeviceFine: - { - copyHipData(dst_ptr, src_ptr, sizeof(T)*len, vid); - } break; +#if defined(RAJA_ENABLE_HIP) + else if (isHipDataSpace(dst_dataSpace) || + isHipDataSpace(src_dataSpace)) { + copyHipData(dst_ptr, src_ptr, sizeof(T)*len, vid); + } +#endif - default: - { - throw std::invalid_argument("copyData : Unknown memory type"); - } break; + else { + throw std::invalid_argument("copyData : Unknown data space"); } } @@ -398,18 +482,18 @@ void copyData(DataSpace dst_dataSpace, T* dst_ptr, * \brief Move data array into new dataSpace. */ template -void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, int len, int align, VariantID vid) +inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, + T*& ptr, int len, int align, VariantID vid) { if (new_dataSpace != old_dataSpace) { T* new_ptr = nullptr; - detail::allocData(new_dataSpace, new_ptr, len, align, vid); + allocData(new_dataSpace, new_ptr, len, align, vid); - detail::copyData(new_dataSpace, new_ptr, old_dataSpace, ptr, len, vid); + copyData(new_dataSpace, new_ptr, old_dataSpace, ptr, len, vid); - detail::deallocData(old_dataSpace, ptr, len, vid); + deallocData(old_dataSpace, ptr, vid); ptr = new_ptr; } @@ -419,17 +503,17 @@ void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, * \brief Allocate and initialize data array. */ template -void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align, - VariantID vid) +inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align, + VariantID vid) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); - detail::allocData(init_dataSpace, ptr, len, align, vid); + allocData(init_dataSpace, ptr, len, align, vid); - detail::initData(ptr, len, vid); + initData(ptr, len, vid); if (init_dataSpace != dataSpace) { - detail::moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + moveData(dataSpace, init_dataSpace, ptr, len, align, vid); } } @@ -440,16 +524,40 @@ void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align, * Array entries are initialized using the method * initDataConst(Real_ptr& ptr...) below. */ -void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, - VariantID vid); +template +inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int align, + Real_type val, VariantID vid) +{ + DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); + + allocData(init_dataSpace, ptr, len, align, vid); + + initDataConst(ptr, len, val, vid); + + if (init_dataSpace != dataSpace) { + moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + } +} /*! * \brief Allocate and initialize aligned Real_type data array with random sign. * * Array is initialized using method initDataRandSign(Real_ptr& ptr...) below. 
*/ -void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, - VariantID vid); +template +inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int align, + VariantID vid) +{ + DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); + + allocData(init_dataSpace, ptr, len, align, vid); + + initDataRandSign(ptr, len, vid); + + if (init_dataSpace != dataSpace) { + moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + } +} /*! * \brief Allocate and initialize aligned Real_type data array with random @@ -457,76 +565,53 @@ void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, * * Array is initialized using method initDataRandValue(Real_ptr& ptr...) below. */ -void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, - VariantID vid); - - -/* - * Allocate and initialize aligned data arrays. - */ -void allocAndInitData(Int_ptr& ptr, int len, int align, VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initData(ptr, len, vid); -} - -void allocAndInitData(Real_ptr& ptr, int len, int align, VariantID vid ) -{ - allocHostData(ptr, len, align, vid); - initData(ptr, len, vid); -} - -void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) +template +inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int align, + VariantID vid) { - allocHostData(ptr, len, align, vid); - initData(ptr, len, vid); -} + DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); -void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, - VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initDataConst(ptr, len, val, vid); -} + allocData(init_dataSpace, ptr, len, align, vid); -void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initDataRandSign(ptr, len, vid); -} - -void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, VariantID vid) -{ - allocHostData(ptr, len, align, vid); initDataRandValue(ptr, len, vid); -} - - + if (init_dataSpace != dataSpace) { + moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + } +} /* * Calculate and return checksum for arrays. */ template -long double calcChecksum(DataSpace dataSpace, const T* ptr, int len, - Real_type scale_factor, VariantID vid) +inline long double calcChecksum(DataSpace dataSpace, T* ptr, int len, int align, + Real_type scale_factor, VariantID vid) { - const T* check_ptr = ptr; + T* check_ptr = ptr; + T* copied_ptr = nullptr; DataSpace check_dataSpace = hostAccessibleDataSpace(dataSpace); if (check_dataSpace != dataSpace) { - allocData(check_dataSpace, check_ptr, len, vid); + allocData(check_dataSpace, copied_ptr, len, align, vid); + + copyData(check_dataSpace, copied_ptr, dataSpace, ptr, len, vid); + + check_ptr = copied_ptr; } - auto val = detail::calcChecksum(check_ptr, len, scale_factor); + auto val = calcChecksum(check_ptr, len, scale_factor); if (check_dataSpace != dataSpace) { - deallocData(check_dataSpace, check_ptr, vid); + deallocData(check_dataSpace, copied_ptr, vid); } return val; } + +} // closing brace for detail namespace + + /*! * \brief Holds a RajaPool object and provides access to it via a * std allocator compliant type. diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index b9b75f385..bb35ff88b 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -28,6 +28,9 @@ namespace rajaperf { +namespace detail +{ + /*! * \brief Simple forall hip kernel that runs a lambda. 
*/ @@ -108,6 +111,16 @@ inline int getHipDevice() return device; } +/* + * Copy memory len bytes from src to dst. + */ +inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid) +{ + (void)vid; + hipErrchk( hipMemcpy( dst_ptr, src_ptr, len, + hipMemcpyDefault ) ); +} + /*! * \brief Copy given hptr (host) data to HIP device (dptr). * @@ -255,150 +268,6 @@ void deallocHipPinnedData(T& pptr) pptr = nullptr; } - -/*! - * \brief Copy given hptr (host) data to HIP (cptr). - * - * Method assumes both host and device data arrays are allocated - * and of proper size for copy operation to succeed. - */ -template -void initHipData(HipDataSpace, T& cptr, const T hptr, int len) -{ - hipErrchk( hipMemcpy( cptr, hptr, - len * sizeof(typename std::remove_pointer::type), - hipMemcpyDefault ) ); - - incDataInitCount(); -} - -/*! - * \brief Allocate HIP data array (cptr). - */ -template -void allocHipData(HipDataSpace hipDataSpace, T& cptr, int len) -{ - switch (hipDataSpace) { - - case HipDataSpace::Host: - { - allocData(cptr, len); - } break; - case HipDataSpace::HostAdviseFine: - { -#ifdef RAJAPERF_USE_MEMADVISE_COARSE - allocData(cptr, len); - adviseHipData(cptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); -#else - throw std::invalid_argument("allocHipData : HostAdviseFine not available with this hip version"); -#endif - } break; - case HipDataSpace::HostAdviseCoarse: - { -#ifdef RAJAPERF_USE_MEMADVISE_COARSE - allocData(cptr, len); - adviseHipData(cptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); -#else - throw std::invalid_argument("allocHipData : HostAdviseCoarse not available with this hip version"); -#endif - } break; - case HipDataSpace::Pinned: - { - allocHipPinnedData(cptr, len); - } break; - case HipDataSpace::PinnedFine: - { - allocHipPinnedFineData(cptr, len); - } break; - case HipDataSpace::PinnedCoarse: - { - allocHipPinnedCoarseData(cptr, len); - } break; - case HipDataSpace::Managed: - { - allocHipManagedData(cptr, len); - } break; - case HipDataSpace::ManagedAdviseFine: - { -#ifdef RAJAPERF_USE_MEMADVISE_COARSE - allocHipManagedData(cptr, len); - adviseHipData(cptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); -#else - throw std::invalid_argument("allocHipData : ManagedAdviseFine not available with this hip version"); -#endif - } break; - case HipDataSpace::ManagedAdviseCoarse: - { -#ifdef RAJAPERF_USE_MEMADVISE_COARSE - allocHipManagedData(cptr, len); - adviseHipData(cptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); -#else - throw std::invalid_argument("allocHipData : ManagedAdviseCoarse not available with this hip version"); -#endif - } break; - case HipDataSpace::Device: - { - allocHipDeviceData(cptr, len); - } break; - case HipDataSpace::DeviceFine: - { - allocHipDeviceFineData(cptr, len); - } break; - default: - { - throw std::invalid_argument("allocHipData : Unknown memory type"); - } break; - } -} - -/*! - * \brief Allocate HIP data array (cptr) and copy given hptr (host) - * data to HIP array. - */ -template -void allocAndInitHipData(HipDataSpace hipDataSpace, T& cptr, const T hptr, int len) -{ - allocHipData(hipDataSpace, cptr, len); - initHipData(hipDataSpace, cptr, hptr, len); -} - -/*! - * \brief Free Hip data array. 
- */ -template -void deallocHipData(HipDataSpace hipDataSpace, T& cptr) -{ - switch (hipDataSpace) { - case HipDataSpace::Host: - case HipDataSpace::HostAdviseFine: - case HipDataSpace::HostAdviseCoarse: - { - deallocData(cptr); - } break; - case HipDataSpace::Pinned: - case HipDataSpace::PinnedFine: - case HipDataSpace::PinnedCoarse: - { - deallocHipPinnedData(cptr); - } break; - case HipDataSpace::Managed: - case HipDataSpace::ManagedAdviseFine: - case HipDataSpace::ManagedAdviseCoarse: - { - deallocHipManagedData(cptr); - } break; - case HipDataSpace::Device: - case HipDataSpace::DeviceFine: - { - deallocHipDeviceData(cptr); - } break; - default: - { - throw std::invalid_argument("deallocHipData : Unknown memory type"); - } break; - } -} - /*! * \brief Copy given cptr (HIP) data to host (hptr). * @@ -406,13 +275,16 @@ void deallocHipData(HipDataSpace hipDataSpace, T& cptr) * and of propoer size for copy operation to succeed. */ template -void getHipData(HipDataSpace, T& hptr, const T cptr, int len) +void getHipData(T& hptr, const T cptr, int len) { hipErrchk( hipMemcpy( hptr, cptr, len * sizeof(typename std::remove_pointer::type), hipMemcpyDefault ) ); } + +} // closing brace for detail namespace + } // closing brace for rajaperf namespace #endif // RAJA_ENABLE_HIP diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 80c247057..6fc731540 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -255,141 +255,100 @@ class KernelBase void allocData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocData( - ptr, len, getDataAlignment(), getDataSpace(vid)); + rajaperf::detail::allocData(getDataSpace(vid), + ptr, len, getDataAlignment(), vid); } template void allocAndInitData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitData( - ptr, len, getDataAlignment(), getDataSpace(vid)); + rajaperf::detail::allocAndInitData(getDataSpace(vid), + ptr, len, getDataAlignment(), vid); } template void allocAndInitDataConst(T*& ptr, int len, T val, VariantID vid) { - rajaperf::detail::allocAndInitDataConst( - ptr, len, getDataAlignment(), val, getDataSpace(vid)); + rajaperf::detail::allocAndInitDataConst(getDataSpace(vid), + ptr, len, getDataAlignment(), val, vid); } template void allocAndInitDataRandSign(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitDataRandSign( - ptr, len, getDataAlignment(), getDataSpace(vid)); + rajaperf::detail::allocAndInitDataRandSign(getDataSpace(vid), + ptr, len, getDataAlignment(), vid); } template void allocAndInitDataRandValue(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitDataRandValue( - ptr, len, getDataAlignment(), getDataSpace(vid)); + rajaperf::detail::allocAndInitDataRandValue(getDataSpace(vid), + ptr, len, getDataAlignment(), vid); } template void deallocData(T*& ptr, VariantID vid) { - rajaperf::detail::deallocData(ptr, getDataSpace(vid)); + rajaperf::detail::deallocData(getDataSpace(vid), ptr, vid); } template void initData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::initData(ptr, len, getDataSpace(vid)); + rajaperf::detail::initData(getDataSpace(vid), ptr, len, vid); } template void initDataConst(T*& ptr, int len, T val, VariantID vid) { - rajaperf::detail::initDataConst(ptr, len, val, getDataSpace(vid)); + rajaperf::detail::initDataConst(getDataSpace(vid), ptr, len, val, vid); } template void initDataRandSign(T*& ptr, int len, VariantID vid) { - rajaperf::detail::initDataRandSign(ptr, len, getDataSpace(vid)); + 
rajaperf::detail::initDataRandSign(getDataSpace(vid), ptr, len, vid); } template void initDataRandValue(T*& ptr, int len, VariantID vid) { - rajaperf::detail::initDataRandValue(ptr, len, getDataSpace(vid)); + rajaperf::detail::initDataRandValue(getDataSpace(vid), ptr, len, vid); } template void initData(T& d, VariantID vid) { - rajaperf::detail::initData(d, getDataSpace(vid)); + rajaperf::detail::initData(d, vid); } -#if defined(RAJA_ENABLE_CUDA) - CudaDataSpace getCudaDataSpace() const { return run_params.getCudaDataSpace(); } - template - void initCudaData(T& cptr, const T hptr, int len) - { - rajaperf::initCudaData(getCudaDataSpace(), cptr, hptr, len); - } - template - void allocCudaData(T& cptr, int len) - { - rajaperf::allocCudaData(getCudaDataSpace(), cptr, len); - } template - void allocAndInitCudaData(T& cptr, const T hptr, int len) + long double calcChecksum(T* ptr, int len, + VariantID vid) { - rajaperf::allocAndInitCudaData(getCudaDataSpace(), cptr, hptr, len); + return rajaperf::detail::calcChecksum(getDataSpace(vid), + ptr, len, getDataAlignment(), 1.0, vid); } - template - void deallocCudaData(T& cptr) - { - rajaperf::deallocCudaData(getCudaDataSpace(), cptr); - } - template - void getCudaData(T& hptr, const T cptr, int len) - { - rajaperf::getCudaData(getCudaDataSpace(), hptr, cptr, len); - } -#endif -#if defined(RAJA_ENABLE_HIP) - HipDataSpace getHipDataSpace() const { return run_params.getHipDataSpace(); } - template - void initHipData(T& cptr, const T hptr, int len) - { - rajaperf::initHipData(getHipDataSpace(), cptr, hptr, len); - } - template - void allocHipData(T& cptr, int len) - { - rajaperf::allocHipData(getHipDataSpace(), cptr, len); - } template - void allocAndInitHipData(T& cptr, const T hptr, int len) + long double calcChecksum(T* ptr, int len, + Real_type scale_factor, + VariantID vid) { - rajaperf::allocAndInitHipData(getHipDataSpace(), cptr, hptr, len); + return rajaperf::detail::calcChecksum(getDataSpace(vid), + ptr, len, getDataAlignment(), scale_factor, vid); } - template - void deallocHipData(T& cptr) - { - rajaperf::deallocHipData(getHipDataSpace(), cptr); - } - template - void getHipData(T& hptr, const T cptr, int len) - { - rajaperf::getHipData(getHipDataSpace(), hptr, cptr, len); - } -#endif void startTimer() { diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index d122e12a5..1a5c0eb2f 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -23,6 +23,21 @@ namespace rajaperf { +namespace detail +{ + + +/* + * Copy memory len bytes from src to dst. + */ +inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid, + int dst_did, int src_did) +{ + (void)vid; + omp_target_memcpy( dst_ptr, src_ptr, len, + 0, 0, dst_did, src_did ); +} + /*! * \brief Copy given hptr (host) data to device (dptr). 
* @@ -31,7 +46,8 @@ namespace rajaperf */ template void copyOpenMPDeviceData(T& dptr, const T hptr, int len, - int did, int hid) + int did = omp_get_default_device(), + int hid = omp_get_initial_device()) { omp_target_memcpy( dptr, hptr, len * sizeof(typename std::remove_pointer::type), @@ -46,7 +62,8 @@ void copyOpenMPDeviceData(T& dptr, const T hptr, int len, */ template void initOpenMPDeviceData(T& dptr, const T hptr, int len, - int did, int hid) + int did = omp_get_default_device(), + int hid = omp_get_initial_device()) { copyOpenMPDeviceData(dptr, hptr, len, did, hid); detail::incDataInitCount(); @@ -57,7 +74,8 @@ void initOpenMPDeviceData(T& dptr, const T hptr, int len, * data to device array. */ template -void allocOpenMPDeviceData(T& dptr, int len, int did) +void allocOpenMPDeviceData(T& dptr, int len, + int did = omp_get_default_device()) { dptr = static_cast( omp_target_alloc( len * sizeof(typename std::remove_pointer::type), @@ -70,7 +88,8 @@ void allocOpenMPDeviceData(T& dptr, int len, int did) */ template void allocAndInitOpenMPDeviceData(T& dptr, const T hptr, int len, - int did, int hid) + int did = omp_get_default_device(), + int hid = omp_get_initial_device()) { allocOpenMPDeviceData(dptr, len, did); initOpenMPDeviceData(dptr, hptr, len, did, hid); @@ -83,7 +102,9 @@ void allocAndInitOpenMPDeviceData(T& dptr, const T hptr, int len, * and of propoer size for copy operation to succeed. */ template -void getOpenMPDeviceData(T& hptr, const T dptr, int len, int hid, int did) +void getOpenMPDeviceData(T& hptr, const T dptr, int len, + int hid = omp_get_initial_device(), + int did = omp_get_default_device()) { omp_target_memcpy( hptr, dptr, len * sizeof(typename std::remove_pointer::type), @@ -94,13 +115,16 @@ void getOpenMPDeviceData(T& hptr, const T dptr, int len, int hid, int did) * \brief Free device data array. 
*/ template -void deallocOpenMPDeviceData(T& dptr, int did) +void deallocOpenMPDeviceData(T& dptr, + int did = omp_get_default_device()) { omp_target_free( dptr, did ); dptr = 0; } +} // closing brace for detail namespace + } // closing brace for rajaperf namespace #endif // RAJA_ENABLE_TARGET_OPENMP From b8e1aef6c811cabb885bdaa11c803ba2058a3f5f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 15:47:43 -0700 Subject: [PATCH 082/137] Pass vid into calcChecksum --- src/algorithm/MEMCPY.cpp | 2 +- src/algorithm/MEMSET.cpp | 2 +- src/algorithm/REDUCE_SUM.cpp | 2 +- src/algorithm/SCAN.cpp | 2 +- src/algorithm/SORT.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 4 ++-- src/apps/CONVECTION3DPA.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 2 +- src/apps/ENERGY.cpp | 4 ++-- src/apps/FIR.cpp | 2 +- src/apps/HALOEXCHANGE.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 2 +- src/apps/LTIMES.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 2 +- src/apps/MASS3DPA.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 2 +- src/apps/PRESSURE.cpp | 2 +- src/apps/VOL3D.cpp | 2 +- src/apps/WIP-COUPLE.cpp | 6 +++--- src/basic/DAXPY.cpp | 2 +- src/basic/DAXPY_ATOMIC.cpp | 2 +- src/basic/IF_QUAD.cpp | 4 ++-- src/basic/INDEXLIST.cpp | 2 +- src/basic/INDEXLIST_3LOOP.cpp | 2 +- src/basic/INIT3.cpp | 6 +++--- src/basic/INIT_VIEW1D.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 2 +- src/basic/MULADDSUB.cpp | 6 +++--- src/basic/NESTED_INIT.cpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 2 +- src/lcals/EOS.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 4 ++-- src/lcals/INT_PREDICT.cpp | 2 +- src/lcals/PLANCKIAN.cpp | 2 +- src/lcals/TRIDIAG_ELIM.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 2 +- src/polybench/POLYBENCH_GEMM.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_1D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_2D.cpp | 4 ++-- src/polybench/POLYBENCH_MVT.cpp | 4 ++-- src/stream/ADD.cpp | 2 +- src/stream/COPY.cpp | 2 +- src/stream/MUL.cpp | 2 +- src/stream/TRIAD.cpp | 2 +- 58 files changed, 72 insertions(+), 72 deletions(-) diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index 583a19dea..49446a265 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -65,7 +65,7 @@ void MEMCPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void MEMCPY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize(), vid); } void MEMCPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index fdc98b3fe..95d3d5321 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -66,7 +66,7 @@ void MEMSET::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void MEMSET::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_x, getActualProblemSize()); + checksum[vid].at(tune_idx) += calcChecksum(m_x, getActualProblemSize(), vid); } void 
MEMSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index f2f2b25d2..3712f5ffa 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -66,7 +66,7 @@ void REDUCE_SUM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void REDUCE_SUM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(&m_sum, 1); + checksum[vid].at(tune_idx) += calcChecksum(&m_sum, 1, vid); } void REDUCE_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 7b2933084..30cb534df 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -69,7 +69,7 @@ void SCAN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void SCAN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor); + checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor, vid); } void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index 049c03304..b7738f264 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -54,7 +54,7 @@ void SORT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void SORT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps(), vid); } void SORT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 96d79a7df..a07f1e79b 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -55,8 +55,8 @@ void SORTPAIRS::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void SORTPAIRS::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); - checksum[vid][tune_idx] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps(), vid); + checksum[vid][tune_idx] += calcChecksum(m_i, getActualProblemSize()*getRunReps(), vid); } void SORTPAIRS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index 46eb341af..c45851d44 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -83,7 +83,7 @@ void CONVECTION3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void CONVECTION3DPA::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_Y, CPA_D1D*CPA_D1D*CPA_D1D*m_NE); + checksum[vid][tune_idx] += calcChecksum(m_Y, CPA_D1D*CPA_D1D*CPA_D1D*m_NE, vid); } void CONVECTION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 9fe3c3e85..3c1298670 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -89,7 +89,7 @@ void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void DEL_DOT_VEC_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_div, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_div, m_array_length, vid); 
} void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 3b3a2cb31..addfc0e02 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -83,7 +83,7 @@ void DIFFUSION3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void DIFFUSION3DPA::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); + checksum[vid][tune_idx] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE, vid); } void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 8e77961b6..fd1988300 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -94,8 +94,8 @@ void ENERGY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void ENERGY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_e_new, getActualProblemSize()); - checksum[vid][tune_idx] += calcChecksum(m_q_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_e_new, getActualProblemSize(), vid); + checksum[vid][tune_idx] += calcChecksum(m_q_new, getActualProblemSize(), vid); } void ENERGY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 792f015d0..7b51aaebc 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -70,7 +70,7 @@ void FIR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void FIR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor , vid); } void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 53ec0ecc1..6b5304d6c 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -113,7 +113,7 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size); + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 5486c3645..25385ace8 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -113,7 +113,7 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size); + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index b920631dd..0abb82d35 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -92,7 +92,7 @@ void LTIMES::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void LTIMES::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor , vid); } void LTIMES::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 
2f8dd4b40..a106d5418 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -91,7 +91,7 @@ void LTIMES_NOVIEW::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor , vid); } void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 107f74408..e3ab62d2d 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -79,7 +79,7 @@ void MASS3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void MASS3DPA::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); + checksum[vid][tune_idx] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE, vid); } void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 7ed2f0399..389848ebe 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -82,7 +82,7 @@ void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor ); + checksum[vid].at(tune_idx) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor , vid); } void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index 18979f3bd..c772a0989 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -74,7 +74,7 @@ void PRESSURE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void PRESSURE::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_p_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_p_new, getActualProblemSize(), vid); } void PRESSURE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index a1097163a..94957f135 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -89,7 +89,7 @@ void VOL3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void VOL3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor , vid); } void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index 73c8fecc9..2e7c70197 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -184,9 +184,9 @@ void COUPLE::updateChecksum(VariantID vid, size_t tune_idx) { Index_type max_loop_index = m_domain->lrn; - checksum[vid][tune_idx] += calcChecksum(m_t0, max_loop_index); - checksum[vid][tune_idx] += calcChecksum(m_t1, max_loop_index); - checksum[vid][tune_idx] += calcChecksum(m_t2, max_loop_index); + checksum[vid][tune_idx] += calcChecksum(m_t0, max_loop_index, vid); + checksum[vid][tune_idx] += calcChecksum(m_t1, max_loop_index, vid); + checksum[vid][tune_idx] += calcChecksum(m_t2, max_loop_index, vid); } void COUPLE::tearDown(VariantID vid, 
size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 67f4b0eb0..8aa05e66a 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -68,7 +68,7 @@ void DAXPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void DAXPY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize(), vid); } void DAXPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index e58516d2f..a9f709276 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -68,7 +68,7 @@ void DAXPY_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize(), vid); } void DAXPY_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index b0fe3469b..c31dc79d4 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -74,8 +74,8 @@ void IF_QUAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void IF_QUAD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); - checksum[vid][tune_idx] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor , vid); } void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index f5614a7f2..cb559c8b2 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -64,7 +64,7 @@ void INDEXLIST::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void INDEXLIST::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_list, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_list, getActualProblemSize(), vid); checksum[vid][tune_idx] += Checksum_type(m_len); } diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 3ddb3fc0c..49117dc66 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -73,7 +73,7 @@ void INDEXLIST_3LOOP::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void INDEXLIST_3LOOP::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_list, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_list, getActualProblemSize(), vid); checksum[vid][tune_idx] += Checksum_type(m_len); } diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index b2be64f84..bbf90da80 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -70,9 +70,9 @@ void INIT3::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void INIT3::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out1, 
getActualProblemSize(), vid); + checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize(), vid); + checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize(), vid); } void INIT3::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index de34c5a28..018811f34 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -68,7 +68,7 @@ void INIT_VIEW1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), vid); } void INIT_VIEW1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index fe1867698..4daa109a6 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -68,7 +68,7 @@ void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), vid); } void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index afbaaf0c2..36df38762 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -73,7 +73,7 @@ void MAT_MAT_SHARED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor , vid); } void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 4ae8d6868..a5deb6049 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -70,9 +70,9 @@ void MULADDSUB::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void MULADDSUB::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out1, getActualProblemSize(), vid); + checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize(), vid); + checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize(), vid); } void MULADDSUB::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 4b9183245..f9fe82e44 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -77,7 +77,7 @@ void NESTED_INIT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void NESTED_INIT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_array, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_array, m_array_length, vid); } void NESTED_INIT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 57bb9fb39..b5ddc90e4 100644 --- 
a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -68,7 +68,7 @@ void DIFF_PREDICT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_px, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_px, m_array_length, vid); } void DIFF_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 69ffa4bc3..517d144f8 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -79,7 +79,7 @@ void EOS::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void EOS::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor , vid); } void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index df13c2f39..3e8e42ec6 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -69,7 +69,7 @@ void FIRST_DIFF::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void FIRST_DIFF::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), vid); } void FIRST_DIFF::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index d5559cd6c..046528e2b 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -68,7 +68,7 @@ void FIRST_SUM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void FIRST_SUM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), vid); } void FIRST_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 340219129..9c132a3db 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -77,7 +77,7 @@ void GEN_LIN_RECUR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor , vid); } void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 16a251e2d..d92267fc9 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -77,7 +77,7 @@ void HYDRO_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void HYDRO_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor , vid); } void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 0920de8c1..fd1dd9406 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -96,8 +96,8 @@ void HYDRO_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void 
HYDRO_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); - checksum[vid][tune_idx] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor , vid); } void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 1e7375752..24e357f50 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -81,7 +81,7 @@ void INT_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) m_px[i] -= m_px_initval; } - checksum[vid][tune_idx] += calcChecksum(m_px, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_px, getActualProblemSize(), vid); } void INT_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index da178a407..2bb8d3f7b 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -68,7 +68,7 @@ void PLANCKIAN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void PLANCKIAN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_w, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_w, getActualProblemSize(), vid); } void PLANCKIAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 98278cbcf..710927c3e 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -69,7 +69,7 @@ void TRIDIAG_ELIM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_xout, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_xout, getActualProblemSize(), vid); } void TRIDIAG_ELIM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 80136cd49..5c0ebe484 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -96,7 +96,7 @@ void POLYBENCH_2MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor , vid); } void POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index e1dad595c..a649e2e89 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -106,7 +106,7 @@ void POLYBENCH_3MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor , vid); } void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index b513bdebc..7a31468a6 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ 
b/src/polybench/POLYBENCH_ADI.cpp @@ -79,7 +79,7 @@ void POLYBENCH_ADI::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor , vid); } void POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 4c159d3a7..5a9d15e89 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -82,7 +82,7 @@ void POLYBENCH_ATAX::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_y, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_y, m_N, checksum_scale_factor , vid); } void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 37dd1f9f5..ed2432d87 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -100,7 +100,7 @@ void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); + checksum[vid][tune_idx] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor, vid); } void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 9770821b0..03c1e65ba 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -75,7 +75,7 @@ void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor , vid); } void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index e080c6df5..6094ce908 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -86,7 +86,7 @@ void POLYBENCH_GEMM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor , vid); } void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 99e16324f..7223f85fd 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -102,7 +102,7 @@ void POLYBENCH_GEMVER::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_w, m_n, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_w, m_n, checksum_scale_factor , vid); } void 
POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index fdf07a58f..ea8e2224f 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -76,7 +76,7 @@ void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_y, m_N); + checksum[vid][tune_idx] += calcChecksum(m_y, m_N, vid); } void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index be6c0e218..e3f0371e6 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -87,8 +87,8 @@ void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); - checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor , vid); } void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 925cd2682..9648571f1 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -84,8 +84,8 @@ void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_A, m_N, checksum_scale_factor ); - checksum[vid][tune_idx] += calcChecksum(m_B, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N, checksum_scale_factor , vid); } void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index e1b62d364..1ea1620ea 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -86,8 +86,8 @@ void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); - checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor , vid); } void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index e58065f28..c0a5b8bb9 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -80,8 +80,8 @@ void POLYBENCH_MVT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x1, m_N, checksum_scale_factor ); - checksum[vid][tune_idx] += 
calcChecksum(m_x2, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x1, m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_x2, m_N, checksum_scale_factor , vid); } void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 3ca91bed5..02cf25107 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -69,7 +69,7 @@ void ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void ADD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize(), vid); } void ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 9cbfcbff2..c92018c63 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -68,7 +68,7 @@ void COPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void COPY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize(), vid); } void COPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 74ce32cb0..fba825bf6 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -69,7 +69,7 @@ void MUL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void MUL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_b, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_b, getActualProblemSize(), vid); } void MUL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 4790707bb..d9897618c 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -74,7 +74,7 @@ void TRIAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void TRIAD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor , vid); } void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) From 92efe48d73c26e8699e370385625da9937d11943 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 16:16:47 -0700 Subject: [PATCH 083/137] Remove incDataInitCount from backend DataUtils --- src/common/CudaDataUtils.hpp | 2 -- src/common/HipDataUtils.hpp | 2 -- src/common/OpenMPTargetDataUtils.hpp | 1 - 3 files changed, 5 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index b45d1ec79..8324fea5f 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -137,8 +137,6 @@ void initCudaDeviceData(T& dptr, const T hptr, int len) cudaErrchk( cudaMemcpy( dptr, hptr, len * sizeof(typename std::remove_pointer::type), cudaMemcpyHostToDevice ) ); - - detail::incDataInitCount(); } /*! diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index bb35ff88b..71d534b5f 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -133,8 +133,6 @@ void initHipDeviceData(T& dptr, const T hptr, int len) hipErrchk( hipMemcpy( dptr, hptr, len * sizeof(typename std::remove_pointer::type), hipMemcpyHostToDevice ) ); - - detail::incDataInitCount(); } /*! 
diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 1a5c0eb2f..8c4b4b0a1 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -66,7 +66,6 @@ void initOpenMPDeviceData(T& dptr, const T hptr, int len, int hid = omp_get_initial_device()) { copyOpenMPDeviceData(dptr, hptr, len, did, hid); - detail::incDataInitCount(); } /*! From 6f0e1640768f6732a958ab3960e8e786671a5616 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 16:17:12 -0700 Subject: [PATCH 084/137] Include policies in cuda and hip utils --- src/common/CudaDataUtils.hpp | 1 + src/common/HipDataUtils.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 8324fea5f..0ea0f0428 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -22,6 +22,7 @@ #include "common/RAJAPerfSuite.hpp" #include "common/GPUUtils.hpp" +#include "RAJA/policy/cuda/policy.hpp" #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 71d534b5f..efab10cce 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -22,6 +22,7 @@ #include "common/RAJAPerfSuite.hpp" #include "common/GPUUtils.hpp" +#include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" From a2e75bf4b800c8578431c02f547ca442cb9fda1d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 16:17:38 -0700 Subject: [PATCH 085/137] fixup non-seq memory utils in DataUtils --- src/common/DataUtils.hpp | 51 +++++++++++++++---------------------- src/common/HipDataUtils.hpp | 12 +++++++++ 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 0d81d12d1..42bfe448d 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -295,15 +295,15 @@ inline void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID #if defined(RAJA_ENABLE_CUDA) case DataSpace::CudaPinned: { - allocCudaPinnedData(ptr, len, vid); + allocCudaPinnedData(ptr, len); } break; case DataSpace::CudaManaged: { - allocCudaManagedData(ptr, len, vid); + allocCudaManagedData(ptr, len); } break; case DataSpace::CudaDevice: { - allocCudaDeviceData(ptr, len, vid); + allocCudaDeviceData(ptr, len); } break; #endif @@ -311,46 +311,46 @@ inline void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID case DataSpace::HipHostAdviseFine: { allocHostData(ptr, len, align, vid); - adviseHipFineData(ptr, len, vid); + adviseHipFineData(ptr, len); } break; case DataSpace::HipHostAdviseCoarse: { allocHostData(ptr, len, align, vid); - adviseHipCoarseData(ptr, len, vid); + adviseHipCoarseData(ptr, len); } break; case DataSpace::HipPinned: { - allocHipPinnedData(ptr, len, vid); + allocHipPinnedData(ptr, len); } break; case DataSpace::HipPinnedFine: { - allocHipPinnedFineData(ptr, len, vid); + allocHipPinnedFineData(ptr, len); } break; case DataSpace::HipPinnedCoarse: { - allocHipPinnedCoarseData(ptr, len, vid); + allocHipPinnedCoarseData(ptr, len); } break; case DataSpace::HipManaged: { - allocHipManagedData(ptr, len, vid); + allocHipManagedData(ptr, len); } break; case DataSpace::HipManagedAdviseFine: { - allocHipManagedData(ptr, len, vid); - adviseHipFineData(ptr, len, vid); + allocHipManagedData(ptr, len); + adviseHipFineData(ptr, len); } break; case DataSpace::HipManagedAdviseCoarse: { - allocHipManagedData(ptr, len, vid); - 
adviseHipCoarseData(ptr, len, vid); + allocHipManagedData(ptr, len); + adviseHipCoarseData(ptr, len); } break; case DataSpace::HipDevice: { - allocHipDeviceData(ptr, len, vid); + allocHipDeviceData(ptr, len); } break; case DataSpace::HipDeviceFine: { - allocHipDeviceFineData(ptr, len, vid); + allocHipDeviceFineData(ptr, len); } break; #endif @@ -386,44 +386,35 @@ inline void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) #if defined(RAJA_ENABLE_CUDA) case DataSpace::CudaPinned: { - deallocCudaPinnedData(ptr, vid); + deallocCudaPinnedData(ptr); } break; case DataSpace::CudaManaged: { - deallocCudaManagedData(ptr, vid); + deallocCudaManagedData(ptr); } break; case DataSpace::CudaDevice: { - deallocCudaDeviceData(ptr, vid); + deallocCudaDeviceData(ptr); } break; #endif #if defined(RAJA_ENABLE_HIP) case DataSpace::HipPinned: - { - deallocHipPinnedData(ptr, vid); - } break; case DataSpace::HipPinnedFine: - { - deallocHipPinnedFineData(ptr, vid); - } break; case DataSpace::HipPinnedCoarse: { - deallocHipPinnedCoarseData(ptr, vid); + deallocHipPinnedData(ptr); } break; case DataSpace::HipManaged: case DataSpace::HipManagedAdviseFine: case DataSpace::HipManagedAdviseCoarse: { - deallocHipManagedData(ptr, vid); + deallocHipManagedData(ptr); } break; case DataSpace::HipDevice: - { - deallocHipDeviceData(ptr, vid); - } break; case DataSpace::HipDeviceFine: { - deallocHipDeviceFineData(ptr, vid); + deallocHipDeviceData(ptr); } break; #endif diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index efab10cce..38abfb2fb 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -212,6 +212,18 @@ void adviseHipData(T& ptr, int len, hipMemoryAdvise advice, int device) advice, device ) ); } +template +void adviseHipCoarseData(T& ptr, int len) +{ + adviseHipData(ptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); +} + +template +void adviseHipFineData(T& ptr, int len) +{ + adviseHipData(ptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); +} + /*! * \brief Allocate HIP device data array (dptr) and copy given hptr (host) * data to device array. 
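For reference only, and not part of any patch in this series: a minimal sketch of how a kernel variant could drive the consolidated DataSpace-based helpers introduced in the patch above. The function names exampleSetup/exampleTeardown, the alignment value, and the include path are assumptions made for illustration; the allocData/deallocData overloads that take a DataSpace are the ones shown in src/common/DataUtils.hpp above.

#include "common/DataUtils.hpp"  // assumed include path for the DataSpace-based helpers

namespace rajaperf {

// Hypothetical setup: a single call dispatches on dataSpace to the host, CUDA, or
// HIP backend (pinned, managed, or device), and for the HipHostAdvise*/HipManagedAdvise*
// cases also applies coarse- or fine-grain memory advice via the new adviseHip*Data helpers.
inline void exampleSetup(DataSpace dataSpace, Real_ptr& ptr, int len, VariantID vid)
{
  const int align = 64;  // assumed alignment; the suite supplies its own value
  allocData(dataSpace, ptr, len, align, vid);
}

// Hypothetical teardown: the matching DataSpace-based deallocation, which folds the
// pinned/fine/coarse and device/device-fine cases into one backend call each.
inline void exampleTeardown(DataSpace dataSpace, Real_ptr& ptr, VariantID vid)
{
  deallocData(dataSpace, ptr, vid);
}

}  // namespace rajaperf
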
From 562783aa717ca5fe504b3a9738d42abaf5f24bdc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 21:36:33 -0700 Subject: [PATCH 086/137] Remove algorithm cuda/hip/omptarget data setup --- src/algorithm/MEMCPY-Cuda.cpp | 29 -------------------------- src/algorithm/MEMCPY-Hip.cpp | 29 -------------------------- src/algorithm/MEMCPY-OMPTarget.cpp | 20 ------------------ src/algorithm/MEMSET-Cuda.cpp | 27 ------------------------ src/algorithm/MEMSET-Hip.cpp | 27 ------------------------ src/algorithm/MEMSET-OMPTarget.cpp | 18 ---------------- src/algorithm/REDUCE_SUM-Cuda.cpp | 18 ---------------- src/algorithm/REDUCE_SUM-Hip.cpp | 18 ---------------- src/algorithm/REDUCE_SUM-OMPTarget.cpp | 17 --------------- src/algorithm/SCAN-Cuda.cpp | 17 --------------- src/algorithm/SCAN-Hip.cpp | 17 --------------- src/algorithm/SCAN-OMPTarget.cpp | 16 -------------- src/algorithm/SORT-Cuda.cpp | 11 ---------- src/algorithm/SORT-Hip.cpp | 11 ---------- src/algorithm/SORTPAIRS-Cuda.cpp | 14 ------------- src/algorithm/SORTPAIRS-Hip.cpp | 14 ------------- 16 files changed, 303 deletions(-) diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp index 932d62b88..74d7ca34f 100644 --- a/src/algorithm/MEMCPY-Cuda.cpp +++ b/src/algorithm/MEMCPY-Cuda.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace algorithm { -#define MEMCPY_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend); \ - allocAndInitCudaData(y, m_y, iend); - -#define MEMCPY_DATA_TEARDOWN_CUDA \ - getCudaData(m_y, y, iend); \ - deallocCudaData(x); \ - deallocCudaData(y); - template < size_t block_size > __launch_bounds__(block_size) __global__ void memcpy(Real_ptr x, Real_ptr y, @@ -52,8 +43,6 @@ void MEMCPY::runCudaVariantLibrary(VariantID vid) if ( vid == Base_CUDA ) { - MEMCPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -62,12 +51,8 @@ void MEMCPY::runCudaVariantLibrary(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - MEMCPY_DATA_SETUP_CUDA; - camp::resources::Cuda res = camp::resources::Cuda::get_default(); startTimer(); @@ -78,8 +63,6 @@ void MEMCPY::runCudaVariantLibrary(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n MEMCPY : Unknown Cuda variant id = " << vid << std::endl; @@ -99,8 +82,6 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - MEMCPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -112,12 +93,8 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - MEMCPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -133,12 +110,8 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - MEMCPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -150,8 +123,6 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n MEMCPY : Unknown Cuda variant id = " << vid << std::endl; diff --git a/src/algorithm/MEMCPY-Hip.cpp b/src/algorithm/MEMCPY-Hip.cpp index 7a970f6a6..a1577e012 100644 --- a/src/algorithm/MEMCPY-Hip.cpp +++ b/src/algorithm/MEMCPY-Hip.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace algorithm { -#define MEMCPY_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); \ - allocAndInitHipData(y, m_y, 
iend); - -#define MEMCPY_DATA_TEARDOWN_HIP \ - getHipData(m_y, y, iend); \ - deallocHipData(x); \ - deallocHipData(y); - template < size_t block_size > __launch_bounds__(block_size) __global__ void memcpy(Real_ptr x, Real_ptr y, @@ -52,8 +43,6 @@ void MEMCPY::runHipVariantLibrary(VariantID vid) if ( vid == Base_HIP ) { - MEMCPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -62,12 +51,8 @@ void MEMCPY::runHipVariantLibrary(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - MEMCPY_DATA_SETUP_HIP; - camp::resources::Hip res = camp::resources::Hip::get_default(); startTimer(); @@ -78,8 +63,6 @@ void MEMCPY::runHipVariantLibrary(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_HIP; - } else { getCout() << "\n MEMCPY : Unknown Hip variant id = " << vid << std::endl; @@ -99,8 +82,6 @@ void MEMCPY::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - MEMCPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -113,12 +94,8 @@ void MEMCPY::runHipVariantBlock(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - MEMCPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -135,12 +112,8 @@ void MEMCPY::runHipVariantBlock(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - MEMCPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -152,8 +125,6 @@ void MEMCPY::runHipVariantBlock(VariantID vid) } stopTimer(); - MEMCPY_DATA_TEARDOWN_HIP; - } else { getCout() << "\n MEMCPY : Unknown Hip variant id = " << vid << std::endl; diff --git a/src/algorithm/MEMCPY-OMPTarget.cpp b/src/algorithm/MEMCPY-OMPTarget.cpp index 1e8ce3554..4f4932793 100644 --- a/src/algorithm/MEMCPY-OMPTarget.cpp +++ b/src/algorithm/MEMCPY-OMPTarget.cpp @@ -26,18 +26,6 @@ namespace algorithm // const size_t threads_per_team = 256; -#define MEMCPY_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); - -#define MEMCPY_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_y, y, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); - void MEMCPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -49,8 +37,6 @@ void MEMCPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu if ( vid == Base_OpenMPTarget ) { - MEMCPY_DATA_SETUP_OMP_TARGET - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -64,12 +50,8 @@ void MEMCPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } stopTimer(); - MEMCPY_DATA_TEARDOWN_OMP_TARGET - } else if ( vid == RAJA_OpenMPTarget ) { - MEMCPY_DATA_SETUP_OMP_TARGET - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -82,8 +64,6 @@ void MEMCPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } stopTimer(); - MEMCPY_DATA_TEARDOWN_OMP_TARGET - } else { getCout() << "\n MEMCPY : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp index 9e03de595..7dfb5abac 100644 --- a/src/algorithm/MEMSET-Cuda.cpp +++ b/src/algorithm/MEMSET-Cuda.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace algorithm { -#define MEMSET_DATA_SETUP_CUDA \ 
- allocAndInitCudaData(x, m_x, iend); - -#define MEMSET_DATA_TEARDOWN_CUDA \ - getCudaData(m_x, x, iend); \ - deallocCudaData(x); - template < size_t block_size > __launch_bounds__(block_size) __global__ void memset(Real_ptr x, Real_type val, @@ -50,8 +43,6 @@ void MEMSET::runCudaVariantLibrary(VariantID vid) if ( vid == Base_CUDA ) { - MEMSET_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -60,12 +51,8 @@ void MEMSET::runCudaVariantLibrary(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - MEMSET_DATA_SETUP_CUDA; - camp::resources::Cuda res = camp::resources::Cuda::get_default(); startTimer(); @@ -76,8 +63,6 @@ void MEMSET::runCudaVariantLibrary(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n MEMSET : Unknown Cuda variant id = " << vid << std::endl; @@ -97,8 +82,6 @@ void MEMSET::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - MEMSET_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -112,12 +95,8 @@ void MEMSET::runCudaVariantBlock(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - MEMSET_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -133,12 +112,8 @@ void MEMSET::runCudaVariantBlock(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - MEMSET_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -150,8 +125,6 @@ void MEMSET::runCudaVariantBlock(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n MEMSET : Unknown Cuda variant id = " << vid << std::endl; diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp index ed6a53b6a..8706d358c 100644 --- a/src/algorithm/MEMSET-Hip.cpp +++ b/src/algorithm/MEMSET-Hip.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace algorithm { -#define MEMSET_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); - -#define MEMSET_DATA_TEARDOWN_HIP \ - getHipData(m_x, x, iend); \ - deallocHipData(x); - template < size_t block_size > __launch_bounds__(block_size) __global__ void memset(Real_ptr x, Real_type val, @@ -50,8 +43,6 @@ void MEMSET::runHipVariantLibrary(VariantID vid) if ( vid == Base_HIP ) { - MEMSET_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -60,12 +51,8 @@ void MEMSET::runHipVariantLibrary(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - MEMSET_DATA_SETUP_HIP; - camp::resources::Hip res = camp::resources::Hip::get_default(); startTimer(); @@ -76,8 +63,6 @@ void MEMSET::runHipVariantLibrary(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_HIP; - } else { getCout() << "\n MEMSET : Unknown Hip variant id = " << vid << std::endl; @@ -97,8 +82,6 @@ void MEMSET::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - MEMSET_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -111,12 +94,8 @@ void MEMSET::runHipVariantBlock(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - MEMSET_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -133,12 +112,8 @@ void MEMSET::runHipVariantBlock(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - MEMSET_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 
0; irep < run_reps; ++irep) { @@ -150,8 +125,6 @@ void MEMSET::runHipVariantBlock(VariantID vid) } stopTimer(); - MEMSET_DATA_TEARDOWN_HIP; - } else { getCout() << "\n MEMSET : Unknown Hip variant id = " << vid << std::endl; diff --git a/src/algorithm/MEMSET-OMPTarget.cpp b/src/algorithm/MEMSET-OMPTarget.cpp index b60454c8c..ec6d9c716 100644 --- a/src/algorithm/MEMSET-OMPTarget.cpp +++ b/src/algorithm/MEMSET-OMPTarget.cpp @@ -26,16 +26,6 @@ namespace algorithm // const size_t threads_per_team = 256; -#define MEMSET_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); - -#define MEMSET_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_x, x, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - void MEMSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -47,8 +37,6 @@ void MEMSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu if ( vid == Base_OpenMPTarget ) { - MEMSET_DATA_SETUP_OMP_TARGET - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -62,12 +50,8 @@ void MEMSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } stopTimer(); - MEMSET_DATA_TEARDOWN_OMP_TARGET - } else if ( vid == RAJA_OpenMPTarget ) { - MEMSET_DATA_SETUP_OMP_TARGET - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,8 +64,6 @@ void MEMSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } stopTimer(); - MEMSET_DATA_TEARDOWN_OMP_TARGET - } else { getCout() << "\n MEMSET : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 86a805432..7192129ef 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -24,12 +24,6 @@ namespace rajaperf namespace algorithm { -#define REDUCE_SUM_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend); - -#define REDUCE_SUM_DATA_TEARDOWN_CUDA \ - deallocCudaData(x); - template < size_t block_size > __launch_bounds__(block_size) __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, @@ -74,8 +68,6 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) if ( vid == Base_CUDA ) { - REDUCE_SUM_DATA_SETUP_CUDA; - cudaStream_t stream = 0; int len = iend - ibegin; @@ -124,8 +116,6 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) deallocCudaDeviceData(temp_storage); deallocCudaPinnedData(sum_storage); - REDUCE_SUM_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; @@ -145,8 +135,6 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - REDUCE_SUM_DATA_SETUP_CUDA; - Real_ptr dsum; allocCudaDeviceData(dsum, 1); @@ -173,12 +161,8 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) deallocCudaDeviceData(dsum); - REDUCE_SUM_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - REDUCE_SUM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -194,8 +178,6 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } stopTimer(); - REDUCE_SUM_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 97d67879a..3549f8dd3 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -29,12 +29,6 @@ namespace rajaperf 
namespace algorithm { -#define REDUCE_SUM_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); - -#define REDUCE_SUM_DATA_TEARDOWN_HIP \ - deallocHipData(x); - template < size_t block_size > __launch_bounds__(block_size) __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, @@ -79,8 +73,6 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) if ( vid == Base_HIP ) { - REDUCE_SUM_DATA_SETUP_HIP; - hipStream_t stream = 0; int len = iend - ibegin; @@ -151,8 +143,6 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) deallocHipDeviceData(temp_storage); deallocHipPinnedData(sum_storage); - REDUCE_SUM_DATA_TEARDOWN_HIP; - } else { getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; @@ -172,8 +162,6 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - REDUCE_SUM_DATA_SETUP_HIP; - Real_ptr dsum; allocHipDeviceData(dsum, 1); @@ -199,12 +187,8 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) deallocHipDeviceData(dsum); - REDUCE_SUM_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - REDUCE_SUM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -220,8 +204,6 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } stopTimer(); - REDUCE_SUM_DATA_TEARDOWN_HIP; - } else { getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp index eff2bee26..a8652099e 100644 --- a/src/algorithm/REDUCE_SUM-OMPTarget.cpp +++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp @@ -26,15 +26,6 @@ namespace algorithm // const size_t threads_per_team = 256; -#define REDUCE_SUM_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); - -#define REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(x, did); \ - void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -46,8 +37,6 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR if ( vid == Base_OpenMPTarget ) { - REDUCE_SUM_DATA_SETUP_OMP_TARGET - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -65,12 +54,8 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET - } else if ( vid == RAJA_OpenMPTarget ) { - REDUCE_SUM_DATA_SETUP_OMP_TARGET - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,8 +72,6 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET - } else { getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 8448dd70f..61b6ee551 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -24,15 +24,6 @@ namespace rajaperf namespace algorithm { -#define SCAN_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend); \ - allocAndInitCudaData(y, m_y, iend); - -#define SCAN_DATA_TEARDOWN_CUDA \ - getCudaData(m_y, y, iend); \ - deallocCudaData(x); \ - deallocCudaData(y); - void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -44,8 +35,6 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) if ( vid == Base_CUDA ) { - SCAN_DATA_SETUP_CUDA; - cudaStream_t stream = 0; 
RAJA::operators::plus binary_op; @@ -89,12 +78,8 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) // Free temporary storage deallocCudaDeviceData(temp_storage); - SCAN_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - SCAN_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -103,8 +88,6 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } stopTimer(); - SCAN_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n SCAN : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index fcf644971..c1d994210 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -29,15 +29,6 @@ namespace rajaperf namespace algorithm { -#define SCAN_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); \ - allocAndInitHipData(y, m_y, iend); - -#define SCAN_DATA_TEARDOWN_HIP \ - getHipData(m_y, y, iend); \ - deallocHipData(x); \ - deallocHipData(y); - void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -49,8 +40,6 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) if ( vid == Base_HIP ) { - SCAN_DATA_SETUP_HIP; - hipStream_t stream = 0; RAJA::operators::plus binary_op; @@ -116,12 +105,8 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) // Free temporary storage deallocHipDeviceData(temp_storage); - SCAN_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - SCAN_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -130,8 +115,6 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } stopTimer(); - SCAN_DATA_TEARDOWN_HIP; - } else { getCout() << "\n SCAN : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/algorithm/SCAN-OMPTarget.cpp b/src/algorithm/SCAN-OMPTarget.cpp index f7f18588a..05347e7f9 100644 --- a/src/algorithm/SCAN-OMPTarget.cpp +++ b/src/algorithm/SCAN-OMPTarget.cpp @@ -27,18 +27,6 @@ namespace algorithm // const size_t threads_per_team = 256; -#define SCAN_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ - \ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); - -#define SCAN_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_y, y, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); - void SCAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -54,8 +42,6 @@ void SCAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune case Base_OpenMPTarget : { - SCAN_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -73,8 +59,6 @@ void SCAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } stopTimer(); - SCAN_DATA_TEARDOWN_OMP_TARGET; - break; } diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 8cb06557f..dd843a532 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace algorithm { -#define SORT_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend*run_reps); - -#define SORT_DATA_TEARDOWN_CUDA \ - getCudaData(m_x, x, iend*run_reps); \ - deallocCudaData(x); - void SORT::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -39,8 +32,6 @@ void SORT::runCudaVariant(VariantID vid, 
size_t RAJAPERF_UNUSED_ARG(tune_idx)) if ( vid == RAJA_CUDA ) { - SORT_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -49,8 +40,6 @@ void SORT::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } stopTimer(); - SORT_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n SORT : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index 5da029718..176a4ac77 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace algorithm { -#define SORT_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend*run_reps); - -#define SORT_DATA_TEARDOWN_HIP \ - getHipData(m_x, x, iend*run_reps); \ - deallocHipData(x); - void SORT::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -39,8 +32,6 @@ void SORT::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) if ( vid == RAJA_HIP ) { - SORT_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -49,8 +40,6 @@ void SORT::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } stopTimer(); - SORT_DATA_TEARDOWN_HIP; - } else { getCout() << "\n SORT : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 9042d3b70..23769806b 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -21,16 +21,6 @@ namespace rajaperf namespace algorithm { -#define SORTPAIRS_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend*run_reps); \ - allocAndInitCudaData(i, m_i, iend*run_reps); - -#define SORTPAIRS_DATA_TEARDOWN_CUDA \ - getCudaData(m_x, x, iend*run_reps); \ - getCudaData(m_i, i, iend*run_reps); \ - deallocCudaData(x); \ - deallocCudaData(i); - void SORTPAIRS::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -42,8 +32,6 @@ void SORTPAIRS::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id if ( vid == RAJA_CUDA ) { - SORTPAIRS_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -52,8 +40,6 @@ void SORTPAIRS::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id } stopTimer(); - SORTPAIRS_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n SORTPAIRS : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index c84459b5f..7c71117c3 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -21,16 +21,6 @@ namespace rajaperf namespace algorithm { -#define SORTPAIRS_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend*run_reps); \ - allocAndInitHipData(i, m_i, iend*run_reps); - -#define SORTPAIRS_DATA_TEARDOWN_HIP \ - getHipData(m_x, x, iend*run_reps); \ - getHipData(m_i, i, iend*run_reps); \ - deallocHipData(x); \ - deallocHipData(i); - void SORTPAIRS::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -42,8 +32,6 @@ void SORTPAIRS::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx if ( vid == RAJA_HIP ) { - SORTPAIRS_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -52,8 +40,6 @@ void SORTPAIRS::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx } stopTimer(); - SORTPAIRS_DATA_TEARDOWN_HIP; - } else { getCout() << "\n SORTPAIRS : Unknown Hip variant id = " << vid << std::endl; } From b9253d8a9eb179e53b481c20a8df92eba0e4f6c2 Mon Sep 
17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 21:37:33 -0700 Subject: [PATCH 087/137] remove basic cuda/hip/omptarget data setup --- src/basic/DAXPY-Cuda.cpp | 21 --------------- src/basic/DAXPY-Hip.cpp | 21 --------------- src/basic/DAXPY-OMPTarget.cpp | 20 -------------- src/basic/DAXPY_ATOMIC-Cuda.cpp | 21 --------------- src/basic/DAXPY_ATOMIC-Hip.cpp | 21 --------------- src/basic/DAXPY_ATOMIC-OMPTarget.cpp | 20 -------------- src/basic/IF_QUAD-Cuda.cpp | 31 ---------------------- src/basic/IF_QUAD-Hip.cpp | 28 ------------------- src/basic/IF_QUAD-OMPTarget.cpp | 26 ------------------ src/basic/INDEXLIST-Cuda.cpp | 14 ---------- src/basic/INDEXLIST-Hip.cpp | 14 ---------- src/basic/INDEXLIST-OMPTarget.cpp | 16 ----------- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 9 ++----- src/basic/INDEXLIST_3LOOP-Hip.cpp | 9 ++----- src/basic/INDEXLIST_3LOOP-OMPTarget.cpp | 10 ++----- src/basic/INIT3-Cuda.cpp | 29 -------------------- src/basic/INIT3-Hip.cpp | 29 -------------------- src/basic/INIT3-OMPTarget.cpp | 28 ------------------- src/basic/INIT_VIEW1D-Cuda.cpp | 19 ------------- src/basic/INIT_VIEW1D-Hip.cpp | 19 ------------- src/basic/INIT_VIEW1D-OMPTarget.cpp | 18 ------------- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 19 ------------- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 19 ------------- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 18 ------------- src/basic/MAT_MAT_SHARED-Cuda.cpp | 26 ------------------ src/basic/MAT_MAT_SHARED-Hip.cpp | 26 ------------------ src/basic/MULADDSUB-Cuda.cpp | 29 -------------------- src/basic/MULADDSUB-Hip.cpp | 29 -------------------- src/basic/MULADDSUB-OMPTarget.cpp | 28 ------------------- src/basic/NESTED_INIT-Cuda.cpp | 19 ------------- src/basic/NESTED_INIT-Hip.cpp | 19 ------------- src/basic/NESTED_INIT-OMPTarget.cpp | 18 ------------- src/basic/PI_ATOMIC-Cuda.cpp | 18 ------------- src/basic/PI_ATOMIC-Hip.cpp | 18 ------------- src/basic/PI_ATOMIC-OMPTarget.cpp | 17 ------------ src/basic/REDUCE3_INT-Cuda.cpp | 14 ---------- src/basic/REDUCE3_INT-Hip.cpp | 14 ---------- src/basic/REDUCE3_INT-OMPTarget.cpp | 17 ------------ src/basic/REDUCE_STRUCT-Cuda.cpp | 17 ------------ src/basic/REDUCE_STRUCT-Hip.cpp | 16 ----------- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 22 --------------- src/basic/TRAP_INT-Cuda.cpp | 13 --------- src/basic/TRAP_INT-Hip.cpp | 13 --------- src/basic/TRAP_INT-OMPTarget.cpp | 11 -------- 44 files changed, 6 insertions(+), 857 deletions(-) diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 4ccd5581c..c6efc9f61 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace basic { -#define DAXPY_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend); \ - allocAndInitCudaData(y, m_y, iend); - -#define DAXPY_DATA_TEARDOWN_CUDA \ - getCudaData(m_y, y, iend); \ - deallocCudaData(x); \ - deallocCudaData(y); - template < size_t block_size > __launch_bounds__(block_size) __global__ void daxpy(Real_ptr y, Real_ptr x, @@ -54,8 +45,6 @@ void DAXPY::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - DAXPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -67,12 +56,8 @@ void DAXPY::runCudaVariantImpl(VariantID vid) } stopTimer(); - DAXPY_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - DAXPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -86,12 +71,8 @@ void DAXPY::runCudaVariantImpl(VariantID vid) } stopTimer(); - DAXPY_DATA_TEARDOWN_CUDA; - } else 
if ( vid == RAJA_CUDA ) { - DAXPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -103,8 +84,6 @@ void DAXPY::runCudaVariantImpl(VariantID vid) } stopTimer(); - DAXPY_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 0cb5ae3a5..2e8f92c75 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace basic { -#define DAXPY_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); \ - allocAndInitHipData(y, m_y, iend); - -#define DAXPY_DATA_TEARDOWN_HIP \ - getHipData(m_y, y, iend); \ - deallocHipData(x); \ - deallocHipData(y); - template < size_t block_size > __launch_bounds__(block_size) __global__ void daxpy(Real_ptr y, Real_ptr x, @@ -55,8 +46,6 @@ void DAXPY::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - DAXPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -68,12 +57,8 @@ void DAXPY::runHipVariantImpl(VariantID vid) } stopTimer(); - DAXPY_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - DAXPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -89,12 +74,8 @@ void DAXPY::runHipVariantImpl(VariantID vid) } stopTimer(); - DAXPY_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - DAXPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -106,8 +87,6 @@ void DAXPY::runHipVariantImpl(VariantID vid) } stopTimer(); - DAXPY_DATA_TEARDOWN_HIP; - } else { getCout() << "\n DAXPY : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 6dcc2a1c0..387a4c40d 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -26,18 +26,6 @@ namespace basic // const size_t threads_per_team = 256; -#define DAXPY_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); - -#define DAXPY_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_y, y, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); - void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -49,8 +37,6 @@ void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun if ( vid == Base_OpenMPTarget ) { - DAXPY_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -63,12 +49,8 @@ void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } stopTimer(); - DAXPY_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - DAXPY_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,8 +62,6 @@ void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } stopTimer(); - DAXPY_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n DAXPY : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 85cd771e7..17f59b0c4 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace basic { -#define DAXPY_ATOMIC_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend); \ 
- allocAndInitCudaData(y, m_y, iend); - -#define DAXPY_ATOMIC_DATA_TEARDOWN_CUDA \ - getCudaData(m_y, y, iend); \ - deallocCudaData(x); \ - deallocCudaData(y); - template < size_t block_size > __launch_bounds__(block_size) __global__ void daxpy_atomic(Real_ptr y, Real_ptr x, @@ -54,8 +45,6 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - DAXPY_ATOMIC_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -67,12 +56,8 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - DAXPY_ATOMIC_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -86,12 +71,8 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - DAXPY_ATOMIC_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -103,8 +84,6 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n DAXPY_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 137cb32f0..418a98c36 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace basic { -#define DAXPY_ATOMIC_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); \ - allocAndInitHipData(y, m_y, iend); - -#define DAXPY_ATOMIC_DATA_TEARDOWN_HIP \ - getHipData(m_y, y, iend); \ - deallocHipData(x); \ - deallocHipData(y); - template < size_t block_size > __launch_bounds__(block_size) __global__ void daxpy_atomic(Real_ptr y, Real_ptr x, @@ -54,8 +45,6 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - DAXPY_ATOMIC_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -67,12 +56,8 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - DAXPY_ATOMIC_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,12 +73,8 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - DAXPY_ATOMIC_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -105,8 +86,6 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_HIP; - } else { getCout() << "\n DAXPY_ATOMIC : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp index 3c05239bf..bc6b08932 100644 --- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -26,18 +26,6 @@ namespace basic // const size_t threads_per_team = 256; -#define DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); - -#define DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_y, y, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); - void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -49,8 +37,6 @@ void 
DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ if ( vid == Base_OpenMPTarget ) { - DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -64,12 +50,8 @@ void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -81,8 +63,6 @@ void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n DAXPY_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 0938b7ebf..b5dc90583 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -21,22 +21,6 @@ namespace rajaperf namespace basic { -#define IF_QUAD_DATA_SETUP_CUDA \ - allocAndInitCudaData(a, m_a, iend); \ - allocAndInitCudaData(b, m_b, iend); \ - allocAndInitCudaData(c, m_c, iend); \ - allocAndInitCudaData(x1, m_x1, iend); \ - allocAndInitCudaData(x2, m_x2, iend); - -#define IF_QUAD_DATA_TEARDOWN_CUDA \ - getCudaData(m_x1, x1, iend); \ - getCudaData(m_x2, x2, iend); \ - deallocCudaData(a); \ - deallocCudaData(b); \ - deallocCudaData(c); \ - deallocCudaData(x1); \ - deallocCudaData(x2); - template < size_t block_size > __launch_bounds__(block_size) __global__ void ifquad(Real_ptr x1, Real_ptr x2, @@ -62,8 +46,6 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - IF_QUAD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -73,13 +55,8 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) } stopTimer(); - - IF_QUAD_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - IF_QUAD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,13 +69,8 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) } stopTimer(); - - IF_QUAD_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - IF_QUAD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -109,9 +81,6 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) } stopTimer(); - - IF_QUAD_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 878a32547..e041919d4 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -21,22 +21,6 @@ namespace rajaperf namespace basic { -#define IF_QUAD_DATA_SETUP_HIP \ - allocAndInitHipData(a, m_a, iend); \ - allocAndInitHipData(b, m_b, iend); \ - allocAndInitHipData(c, m_c, iend); \ - allocAndInitHipData(x1, m_x1, iend); \ - allocAndInitHipData(x2, m_x2, iend); - -#define IF_QUAD_DATA_TEARDOWN_HIP \ - getHipData(m_x1, x1, iend); \ - getHipData(m_x2, x2, iend); \ - deallocHipData(a); \ - deallocHipData(b); \ - deallocHipData(c); \ - deallocHipData(x1); \ - deallocHipData(x2); - template < size_t block_size > __launch_bounds__(block_size) __global__ void ifquad(Real_ptr x1, Real_ptr x2, @@ -62,8 +46,6 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - IF_QUAD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -75,12 +57,8 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) } stopTimer(); - 
IF_QUAD_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - IF_QUAD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -96,12 +74,8 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) } stopTimer(); - IF_QUAD_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - IF_QUAD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -113,8 +87,6 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) } stopTimer(); - IF_QUAD_DATA_TEARDOWN_HIP; - } else { getCout() << "\n IF_QUAD : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index 34acba2a4..d6232ec13 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -26,24 +26,6 @@ namespace basic // const size_t threads_per_team = 256; -#define IF_QUAD_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ - allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ - allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); \ - allocAndInitOpenMPDeviceData(x1, m_x1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(x2, m_x2, iend, did, hid); - -#define IF_QUAD_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_x1, x1, iend, hid, did); \ - getOpenMPDeviceData(m_x2, x2, iend, hid, did); \ - deallocOpenMPDeviceData(a, did); \ - deallocOpenMPDeviceData(b, did); \ - deallocOpenMPDeviceData(c, did); \ - deallocOpenMPDeviceData(x1, did); \ - deallocOpenMPDeviceData(x2, did); void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -55,8 +37,6 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t if ( vid == Base_OpenMPTarget ) { - IF_QUAD_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -69,12 +49,8 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } stopTimer(); - IF_QUAD_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - IF_QUAD_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -86,8 +62,6 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } stopTimer(); - IF_QUAD_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 6a8b05642..1fbac7291 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -33,16 +33,6 @@ namespace basic const size_t items_per_thread = 15; -#define INDEXLIST_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend); \ - allocAndInitCudaData(list, m_list, iend); - -#define INDEXLIST_DATA_TEARDOWN_CUDA \ - getCudaData(m_list, list, iend); \ - deallocCudaData(x); \ - deallocCudaData(list); - - // perform a grid scan on val and returns the result at each thread // in exclusive and inclusive, note that val is used as scratch space template < size_t block_size, size_t items_per_thread > @@ -267,8 +257,6 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - INDEXLIST_DATA_SETUP_CUDA; - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); const size_t shmem_size = 0; @@ -303,8 +291,6 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) deallocCudaDeviceData(grid_counts); 
deallocCudaDeviceData(block_readys); - INDEXLIST_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index e38332046..38ccb0f09 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -33,16 +33,6 @@ namespace basic const size_t items_per_thread = 8; -#define INDEXLIST_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); \ - allocAndInitHipData(list, m_list, iend); - -#define INDEXLIST_DATA_TEARDOWN_HIP \ - getHipData(m_list, list, iend); \ - deallocHipData(x); \ - deallocHipData(list); - - // perform a grid scan on val and returns the result at each thread // in exclusive and inclusive, note that val is used as scratch space template < size_t block_size, size_t items_per_thread > @@ -267,8 +257,6 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - INDEXLIST_DATA_SETUP_HIP; - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); const size_t shmem_size = 0; @@ -303,8 +291,6 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) deallocHipDeviceData(grid_counts); deallocHipDeviceData(block_readys); - INDEXLIST_DATA_TEARDOWN_HIP; - } else { getCout() << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } diff --git a/src/basic/INDEXLIST-OMPTarget.cpp b/src/basic/INDEXLIST-OMPTarget.cpp index 36980a013..85e15dbb4 100644 --- a/src/basic/INDEXLIST-OMPTarget.cpp +++ b/src/basic/INDEXLIST-OMPTarget.cpp @@ -25,18 +25,6 @@ namespace basic // const size_t threads_per_team = 256; -#define INDEXLIST_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ - \ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(list, m_list, iend, did, hid); - -#define INDEXLIST_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_list, list, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(list, did); - #endif @@ -55,8 +43,6 @@ void INDEXLIST::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG case Base_OpenMPTarget : { - INDEXLIST_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -79,8 +65,6 @@ void INDEXLIST::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - INDEXLIST_DATA_TEARDOWN_OMP_TARGET; - break; } diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 92bb08f9e..8e086ab6d 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -23,15 +23,10 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_CUDA \ Index_type* counts; \ - allocCudaData(counts, iend+1); \ - allocAndInitCudaData(x, m_x, iend); \ - allocAndInitCudaData(list, m_list, iend); + allocCudaData(counts, iend+1); #define INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA \ - deallocCudaData(counts); \ - getCudaData(m_list, list, iend); \ - deallocCudaData(x); \ - deallocCudaData(list); + deallocCudaData(counts); template < size_t block_size > diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 73f554853..65f63db77 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -23,15 +23,10 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_HIP \ Index_type* counts; \ - allocHipData(counts, iend+1); \ - allocAndInitHipData(x, m_x, iend); \ - allocAndInitHipData(list, m_list, iend); + allocHipData(counts, 
iend+1); #define INDEXLIST_3LOOP_DATA_TEARDOWN_HIP \ - deallocHipData(counts); \ - getHipData(m_list, list, iend); \ - deallocHipData(x); \ - deallocHipData(list); + deallocHipData(counts); template < size_t block_size > diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp index 3acf16049..d73c193cb 100644 --- a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -26,19 +26,13 @@ namespace basic const size_t threads_per_team = 256; #define INDEXLIST_3LOOP_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ int did = omp_get_default_device(); \ \ Index_type* counts = nullptr; \ - allocOpenMPDeviceData(counts, iend+1, did); \ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(list, m_list, iend, did, hid); + allocOpenMPDeviceData(counts, iend+1, did); #define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(counts, did); \ - getOpenMPDeviceData(m_list, list, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(list, did); + deallocOpenMPDeviceData(counts, did); #endif diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index 21252172e..b5109cd56 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -21,23 +21,6 @@ namespace rajaperf namespace basic { -#define INIT3_DATA_SETUP_CUDA \ - allocAndInitCudaData(out1, m_out1, iend); \ - allocAndInitCudaData(out2, m_out2, iend); \ - allocAndInitCudaData(out3, m_out3, iend); \ - allocAndInitCudaData(in1, m_in1, iend); \ - allocAndInitCudaData(in2, m_in2, iend); - -#define INIT3_DATA_TEARDOWN_CUDA \ - getCudaData(m_out1, out1, iend); \ - getCudaData(m_out2, out2, iend); \ - getCudaData(m_out3, out3, iend); \ - deallocCudaData(out1); \ - deallocCudaData(out2); \ - deallocCudaData(out3); \ - deallocCudaData(in1); \ - deallocCudaData(in2); - template < size_t block_size > __launch_bounds__(block_size) __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, @@ -63,8 +46,6 @@ void INIT3::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - INIT3_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -76,12 +57,8 @@ void INIT3::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT3_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - INIT3_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -95,12 +72,8 @@ void INIT3::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT3_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - INIT3_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -112,8 +85,6 @@ void INIT3::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT3_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index ea16991a0..51f27ddf5 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -21,23 +21,6 @@ namespace rajaperf namespace basic { -#define INIT3_DATA_SETUP_HIP \ - allocAndInitHipData(out1, m_out1, iend); \ - allocAndInitHipData(out2, m_out2, iend); \ - allocAndInitHipData(out3, m_out3, iend); \ - allocAndInitHipData(in1, m_in1, iend); \ - allocAndInitHipData(in2, m_in2, iend); - -#define INIT3_DATA_TEARDOWN_HIP \ - getHipData(m_out1, out1, iend); \ - getHipData(m_out2, out2, iend); \ - getHipData(m_out3, out3, iend); \ - deallocHipData(out1); \ - 
deallocHipData(out2); \ - deallocHipData(out3); \ - deallocHipData(in1); \ - deallocHipData(in2); - template < size_t block_size > __launch_bounds__(block_size) __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, @@ -63,8 +46,6 @@ void INIT3::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - INIT3_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -76,12 +57,8 @@ void INIT3::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT3_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - INIT3_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -97,12 +74,8 @@ void INIT3::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT3_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - INIT3_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -114,8 +87,6 @@ void INIT3::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT3_DATA_TEARDOWN_HIP; - } else { getCout() << "\n INIT3 : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index a64432836..825730bdc 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -26,26 +26,6 @@ namespace basic // const size_t threads_per_team = 256; -#define INIT3_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); - -#define INIT3_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ - getOpenMPDeviceData(m_out2, out2, iend, hid, did); \ - getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ - deallocOpenMPDeviceData(out1, did); \ - deallocOpenMPDeviceData(out2, did); \ - deallocOpenMPDeviceData(out3, did); \ - deallocOpenMPDeviceData(in1, did); \ - deallocOpenMPDeviceData(in2, did); - void INIT3::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -57,8 +37,6 @@ void INIT3::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun if ( vid == Base_OpenMPTarget ) { - INIT3_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -71,12 +49,8 @@ void INIT3::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } stopTimer(); - INIT3_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - INIT3_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,8 +62,6 @@ void INIT3::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } stopTimer(); - INIT3_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index a2b677c26..2ee729dc7 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace basic { -#define INIT_VIEW1D_DATA_SETUP_CUDA \ - allocAndInitCudaData(a, m_a, getActualProblemSize()); - -#define INIT_VIEW1D_DATA_TEARDOWN_CUDA \ - getCudaData(m_a, a, getActualProblemSize()); \ - deallocCudaData(a); - template < size_t block_size > __launch_bounds__(block_size) 
__global__ void initview1d(Real_ptr a, @@ -53,8 +46,6 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - INIT_VIEW1D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -65,12 +56,8 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - INIT_VIEW1D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -84,12 +71,8 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - INIT_VIEW1D_DATA_SETUP_CUDA; - INIT_VIEW1D_VIEW_RAJA; startTimer(); @@ -103,8 +86,6 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index ef108688e..7498c4a85 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace basic { -#define INIT_VIEW1D_DATA_SETUP_HIP \ - allocAndInitHipData(a, m_a, iend); - -#define INIT_VIEW1D_DATA_TEARDOWN_HIP \ - getHipData(m_a, a, iend); \ - deallocHipData(a); - template < size_t block_size > __launch_bounds__(block_size) __global__ void initview1d(Real_ptr a, @@ -53,8 +46,6 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - INIT_VIEW1D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +57,8 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - INIT_VIEW1D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,12 +74,8 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - INIT_VIEW1D_DATA_SETUP_HIP; - INIT_VIEW1D_VIEW_RAJA; startTimer(); @@ -106,8 +89,6 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_HIP; - } else { getCout() << "\n INIT_VIEW1D : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index 1a6105b0f..d9ad636e1 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -26,16 +26,6 @@ namespace basic // const size_t threads_per_team = 256; -#define INIT_VIEW1D_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, getActualProblemSize(), did, hid); - -#define INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_a, a, getActualProblemSize(), hid, did); \ - deallocOpenMPDeviceData(a, did); - void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -47,8 +37,6 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A if ( vid == Base_OpenMPTarget ) { - INIT_VIEW1D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -61,12 +49,8 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - INIT_VIEW1D_DATA_SETUP_OMP_TARGET; - 
INIT_VIEW1D_VIEW_RAJA; startTimer(); @@ -80,8 +64,6 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 1d7e005e6..75d9816e2 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace basic { -#define INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA \ - allocAndInitCudaData(a, m_a, getActualProblemSize()); - -#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA \ - getCudaData(m_a, a, getActualProblemSize()); \ - deallocCudaData(a); - template < size_t block_size > __launch_bounds__(block_size) __global__ void initview1d_offset(Real_ptr a, @@ -54,8 +47,6 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -68,12 +59,8 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,12 +74,8 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; - INIT_VIEW1D_OFFSET_VIEW_RAJA; startTimer(); @@ -106,8 +89,6 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n INIT_VIEW1D_OFFSET : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 359e8469a..dec2d02ac 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace basic { -#define INIT_VIEW1D_OFFSET_DATA_SETUP_HIP \ - allocAndInitHipData(a, m_a, getActualProblemSize()); - -#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP \ - getHipData(m_a, a, getActualProblemSize()); \ - deallocHipData(a); - template < size_t block_size > __launch_bounds__(block_size) __global__ void initview1d_offset(Real_ptr a, @@ -54,8 +47,6 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -67,12 +58,8 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,12 +75,8 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_HIP; - INIT_VIEW1D_OFFSET_VIEW_RAJA; startTimer(); @@ -107,8 +90,6 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; - } else { getCout() << "\n INIT_VIEW1D_OFFSET : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index 
2ece34afe..d045462d7 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -26,16 +26,6 @@ namespace basic // const size_t threads_per_team = 256; -#define INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, getActualProblemSize(), did, hid); - -#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_a, a, getActualProblemSize(), hid, did); \ - deallocOpenMPDeviceData(a, did); - void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -47,8 +37,6 @@ void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U if ( vid == Base_OpenMPTarget ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -61,12 +49,8 @@ void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET; - INIT_VIEW1D_OFFSET_VIEW_RAJA; startTimer(); @@ -80,8 +64,6 @@ void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n INIT_VIEW1D_OFFSET : Unknown OMP Targetvariant id = " << vid << std::endl; } diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index 7dc05d6d0..a5866441f 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -19,20 +19,6 @@ namespace rajaperf { namespace basic { -#define MAT_MAT_SHARED_DATA_SETUP_CUDA \ - const Index_type NN = m_N * m_N; \ - allocAndInitCudaData(A, m_A, NN); \ - allocAndInitCudaData(B, m_B, NN); \ - allocAndInitCudaData(C, m_C, NN); - -#define MAT_MAT_SHARED_DATA_TEARDOWN_CUDA \ - getCudaData(m_A, A, NN); \ - getCudaData(m_B, B, NN); \ - getCudaData(m_C, C, NN); \ - deallocCudaData(A); \ - deallocCudaData(B); \ - deallocCudaData(C); - template < Index_type tile_size > __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, @@ -81,8 +67,6 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) if (vid == Base_CUDA) { - MAT_MAT_SHARED_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,12 +76,8 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) } stopTimer(); - MAT_MAT_SHARED_DATA_TEARDOWN_CUDA; - } else if (vid == Lambda_CUDA) { - MAT_MAT_SHARED_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -194,12 +174,8 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) } stopTimer(); - MAT_MAT_SHARED_DATA_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - MAT_MAT_SHARED_DATA_SETUP_CUDA; - constexpr bool async = true; using launch_policy = RAJA::LaunchPolicy>; @@ -287,8 +263,6 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) } // loop over kernel reps stopTimer(); - MAT_MAT_SHARED_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n MAT_MAT_SHARED : Unknown Cuda variant id = " << vid << std::endl; diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index 36001e168..39e8813f6 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -19,20 +19,6 @@ namespace rajaperf { namespace basic { -#define MAT_MAT_SHARED_DATA_SETUP_HIP \ 
- const Index_type NN = m_N * m_N; \ - allocAndInitHipData(A, m_A, NN); \ - allocAndInitHipData(B, m_B, NN); \ - allocAndInitHipData(C, m_C, NN); - -#define MAT_MAT_SHARED_DATA_TEARDOWN_HIP \ - getHipData(m_A, A, NN); \ - getHipData(m_B, B, NN); \ - getHipData(m_C, C, NN); \ - deallocHipData(A); \ - deallocHipData(B); \ - deallocHipData(C); - template < Index_type tile_size > __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, @@ -81,8 +67,6 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) if (vid == Base_HIP) { - MAT_MAT_SHARED_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -93,12 +77,8 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } stopTimer(); - MAT_MAT_SHARED_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { - MAT_MAT_SHARED_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -199,12 +179,8 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } stopTimer(); - MAT_MAT_SHARED_DATA_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - MAT_MAT_SHARED_DATA_SETUP_HIP; - constexpr bool async = true; using launch_policy = RAJA::LaunchPolicy>; @@ -291,8 +267,6 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } // loop over kernel reps stopTimer(); - MAT_MAT_SHARED_DATA_TEARDOWN_HIP; - } else { getCout() << "\n MAT_MAT_SHARED : Unknown Hip variant id = " << vid << std::endl; diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 9b70ff2d5..29233f8a1 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -21,23 +21,6 @@ namespace rajaperf namespace basic { -#define MULADDSUB_DATA_SETUP_CUDA \ - allocAndInitCudaData(out1, m_out1, iend); \ - allocAndInitCudaData(out2, m_out2, iend); \ - allocAndInitCudaData(out3, m_out3, iend); \ - allocAndInitCudaData(in1, m_in1, iend); \ - allocAndInitCudaData(in2, m_in2, iend); - -#define MULADDSUB_DATA_TEARDOWN_CUDA \ - getCudaData(m_out1, out1, iend); \ - getCudaData(m_out2, out2, iend); \ - getCudaData(m_out3, out3, iend); \ - deallocCudaData(out1); \ - deallocCudaData(out2); \ - deallocCudaData(out3); \ - deallocCudaData(in1); \ - deallocCudaData(in2); - template < size_t block_size > __launch_bounds__(block_size) __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, @@ -63,8 +46,6 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - MULADDSUB_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -76,12 +57,8 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) } stopTimer(); - MULADDSUB_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - MULADDSUB_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -95,12 +72,8 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) } stopTimer(); - MULADDSUB_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - MULADDSUB_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -112,8 +85,6 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) } stopTimer(); - MULADDSUB_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 039ad995d..0f5a25a15 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -21,23 +21,6 @@ namespace rajaperf namespace basic { -#define MULADDSUB_DATA_SETUP_HIP \ - 
allocAndInitHipData(out1, m_out1, iend); \ - allocAndInitHipData(out2, m_out2, iend); \ - allocAndInitHipData(out3, m_out3, iend); \ - allocAndInitHipData(in1, m_in1, iend); \ - allocAndInitHipData(in2, m_in2, iend); - -#define MULADDSUB_DATA_TEARDOWN_HIP \ - getHipData(m_out1, out1, iend); \ - getHipData(m_out2, out2, iend); \ - getHipData(m_out3, out3, iend); \ - deallocHipData(out1); \ - deallocHipData(out2); \ - deallocHipData(out3); \ - deallocHipData(in1); \ - deallocHipData(in2); - template < size_t block_size > __launch_bounds__(block_size) __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, @@ -63,8 +46,6 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - MULADDSUB_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -76,12 +57,8 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) } stopTimer(); - MULADDSUB_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - MULADDSUB_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -97,12 +74,8 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) } stopTimer(); - MULADDSUB_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - MULADDSUB_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -114,8 +87,6 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) } stopTimer(); - MULADDSUB_DATA_TEARDOWN_HIP; - } else { getCout() << "\n MULADDSUB : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index c2487b04f..af691d008 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -26,26 +26,6 @@ namespace basic // const size_t threads_per_team = 256; -#define MULADDSUB_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); - -#define MULADDSUB_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ - getOpenMPDeviceData(m_out2, out2, iend, hid, did); \ - getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ - deallocOpenMPDeviceData(out1, did); \ - deallocOpenMPDeviceData(out2, did); \ - deallocOpenMPDeviceData(out3, did); \ - deallocOpenMPDeviceData(in1, did); \ - deallocOpenMPDeviceData(in2, did); - void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -57,8 +37,6 @@ void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG if ( vid == Base_OpenMPTarget ) { - MULADDSUB_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -71,12 +49,8 @@ void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - MULADDSUB_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - MULADDSUB_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,8 +62,6 @@ void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - MULADDSUB_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; } diff --git 
a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index ee1160dbc..294994113 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -41,13 +41,6 @@ namespace basic static_cast(RAJA_DIVIDE_CEILING_INT(nk, k_block_sz))); -#define NESTED_INIT_DATA_SETUP_CUDA \ - allocAndInitCudaData(array, m_array, m_array_length); - -#define NESTED_INIT_DATA_TEARDOWN_CUDA \ - getCudaData(m_array, array, m_array_length); \ - deallocCudaData(array); - template< size_t i_block_size, size_t j_block_size, size_t k_block_size > __launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init(Real_ptr array, @@ -87,8 +80,6 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - NESTED_INIT_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -103,12 +94,8 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) } stopTimer(); - NESTED_INIT_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - NESTED_INIT_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -126,12 +113,8 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) } stopTimer(); - NESTED_INIT_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - NESTED_INIT_DATA_SETUP_CUDA; - using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernelFixedAsync __launch_bounds__(block_size) __global__ void int_predict(Real_ptr px, @@ -55,8 +48,6 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - INT_PREDICT_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -71,12 +62,8 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) } stopTimer(); - INT_PREDICT_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - INT_PREDICT_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,8 +75,6 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) } stopTimer(); - INT_PREDICT_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n INT_PREDICT : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 1313aa402..8cc7007e4 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -21,13 +21,6 @@ namespace rajaperf namespace lcals { -#define INT_PREDICT_DATA_SETUP_HIP \ - allocAndInitHipData(px, m_px, m_array_length); - -#define INT_PREDICT_DATA_TEARDOWN_HIP \ - getHipData(m_px, px, m_array_length); \ - deallocHipData(px); - template < size_t block_size > __launch_bounds__(block_size) __global__ void int_predict(Real_ptr px, @@ -55,8 +48,6 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - INT_PREDICT_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -71,12 +62,8 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) } stopTimer(); - INT_PREDICT_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - INT_PREDICT_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,8 +75,6 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) } stopTimer(); - INT_PREDICT_DATA_TEARDOWN_HIP; - } else { getCout() << "\n INT_PREDICT : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp index f166b971f..4172c1822 100644 --- a/src/lcals/INT_PREDICT-OMPTarget.cpp +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -26,16 +26,6 @@ namespace lcals 
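// ---------------------------------------------------------------------------
// The *_DATA_SETUP_OMP_TARGET / *_DATA_TEARDOWN_OMP_TARGET macros removed
// throughout this series pair omp_get_initial_device()/omp_get_default_device()
// with allocAndInitOpenMPDeviceData, getOpenMPDeviceData and
// deallocOpenMPDeviceData. The stand-alone sketch below shows what such helper
// calls plausibly reduce to in terms of the standard OpenMP device-memory
// routines; the helper bodies here are an assumption for illustration, since
// only the call sites appear in the diffs.
// ---------------------------------------------------------------------------
#include <omp.h>
#include <cstdio>
#include <vector>

// Assumed shape of a setup helper: allocate on device 'did', fill from host 'hid'.
static double* allocAndInitDeviceArray(double* hptr, size_t len, int did, int hid)
{
  double* dptr = static_cast<double*>(omp_target_alloc(len * sizeof(double), did));
  if (dptr) {
    omp_target_memcpy(dptr, hptr, len * sizeof(double),
                      0, 0, did, hid);  // dst offset, src offset, dst device, src device
  }
  return dptr;
}

// Assumed shape of a teardown helper: copy results back to the host...
static void getDeviceArray(double* hptr, double* dptr, size_t len, int hid, int did)
{
  omp_target_memcpy(hptr, dptr, len * sizeof(double), 0, 0, hid, did);
}

// ...and release the device buffer.
static void deallocDeviceArray(double* dptr, int did)
{
  omp_target_free(dptr, did);
}

int main()
{
  const size_t iend = 1000;
  std::vector<double> m_a(iend, 0.0);       // host-side array

  int hid = omp_get_initial_device();       // host device id
  int did = omp_get_default_device();       // default offload target

  double* a = allocAndInitDeviceArray(m_a.data(), iend, did, hid);  // was *_DATA_SETUP_OMP_TARGET

  #pragma omp target teams distribute parallel for is_device_ptr(a) device(did)
  for (size_t i = 0; i < iend; ++i) {
    a[i] = 1.0;                             // placeholder for the offloaded kernel body
  }

  getDeviceArray(m_a.data(), a, iend, hid, did);  // was *_DATA_TEARDOWN_OMP_TARGET:
  deallocDeviceArray(a, did);                     // copy back, then free

  std::printf("a[0] = %f\n", m_a[0]);
  return 0;
}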
// const size_t threads_per_team = 256; -#define INT_PREDICT_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(px, m_px, m_array_length, did, hid); - -#define INT_PREDICT_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_px, px, m_array_length, hid, did); \ - deallocOpenMPDeviceData(px, did); - void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -47,8 +37,6 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A if ( vid == Base_OpenMPTarget ) { - INT_PREDICT_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -61,12 +49,8 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - INT_PREDICT_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -78,8 +62,6 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n INT_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index e3818fcce..dc5b000e1 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -22,21 +22,6 @@ namespace rajaperf namespace lcals { -#define PLANCKIAN_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, iend); \ - allocAndInitCudaData(y, m_y, iend); \ - allocAndInitCudaData(u, m_u, iend); \ - allocAndInitCudaData(v, m_v, iend); \ - allocAndInitCudaData(w, m_w, iend); - -#define PLANCKIAN_DATA_TEARDOWN_CUDA \ - getCudaData(m_w, w, iend); \ - deallocCudaData(x); \ - deallocCudaData(y); \ - deallocCudaData(u); \ - deallocCudaData(v); \ - deallocCudaData(w); - template < size_t block_size > __launch_bounds__(block_size) __global__ void planckian(Real_ptr x, Real_ptr y, @@ -61,8 +46,6 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - PLANCKIAN_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -75,12 +58,8 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - PLANCKIAN_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,8 +71,6 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n PLANCKIAN : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index eeb716c07..cfaccfa40 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -22,21 +22,6 @@ namespace rajaperf namespace lcals { -#define PLANCKIAN_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, iend); \ - allocAndInitHipData(y, m_y, iend); \ - allocAndInitHipData(u, m_u, iend); \ - allocAndInitHipData(v, m_v, iend); \ - allocAndInitHipData(w, m_w, iend); - -#define PLANCKIAN_DATA_TEARDOWN_HIP \ - getHipData(m_w, w, iend); \ - deallocHipData(x); \ - deallocHipData(y); \ - deallocHipData(u); \ - deallocHipData(v); \ - deallocHipData(w); - template < size_t block_size > __launch_bounds__(block_size) __global__ void planckian(Real_ptr x, Real_ptr y, @@ -61,8 +46,6 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { 
- PLANCKIAN_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -75,12 +58,8 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - PLANCKIAN_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,8 +71,6 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_HIP; - } else { getCout() << "\n PLANCKIAN : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp index 04dd16e9d..fb0f41cef 100644 --- a/src/lcals/PLANCKIAN-OMPTarget.cpp +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -27,24 +27,6 @@ namespace lcals // const size_t threads_per_team = 256; -#define PLANCKIAN_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); \ - allocAndInitOpenMPDeviceData(u, m_u, iend, did, hid); \ - allocAndInitOpenMPDeviceData(v, m_v, iend, did, hid); \ - allocAndInitOpenMPDeviceData(w, m_w, iend, did, hid); - -#define PLANCKIAN_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_w, w, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); \ - deallocOpenMPDeviceData(u, did); \ - deallocOpenMPDeviceData(v, did); \ - deallocOpenMPDeviceData(w, did); - void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -56,8 +38,6 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG if ( vid == Base_OpenMPTarget ) { - PLANCKIAN_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -70,12 +50,8 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - PLANCKIAN_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,8 +63,6 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n PLANCKIAN : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 8cff00ef5..d21d351b1 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -21,19 +21,6 @@ namespace rajaperf namespace lcals { -#define TRIDIAG_ELIM_DATA_SETUP_CUDA \ - allocAndInitCudaData(xout, m_xout, m_N); \ - allocAndInitCudaData(xin, m_xin, m_N); \ - allocAndInitCudaData(y, m_y, m_N); \ - allocAndInitCudaData(z, m_z, m_N); - -#define TRIDIAG_ELIM_DATA_TEARDOWN_CUDA \ - getCudaData(m_xout, xout, m_N); \ - deallocCudaData(xout); \ - deallocCudaData(xin); \ - deallocCudaData(y); \ - deallocCudaData(z); - template < size_t block_size > __launch_bounds__(block_size) __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, @@ -57,8 +44,6 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - TRIDIAG_ELIM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -71,12 +56,8 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) } stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - 
TRIDIAG_ELIM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,8 +69,6 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) } stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n TRIDIAG_ELIM : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 78d898f70..bb5f0bc8d 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -21,19 +21,6 @@ namespace rajaperf namespace lcals { -#define TRIDIAG_ELIM_DATA_SETUP_HIP \ - allocAndInitHipData(xout, m_xout, m_N); \ - allocAndInitHipData(xin, m_xin, m_N); \ - allocAndInitHipData(y, m_y, m_N); \ - allocAndInitHipData(z, m_z, m_N); - -#define TRIDIAG_ELIM_DATA_TEARDOWN_HIP \ - getHipData(m_xout, xout, m_N); \ - deallocHipData(xout); \ - deallocHipData(xin); \ - deallocHipData(y); \ - deallocHipData(z); - template < size_t block_size > __launch_bounds__(block_size) __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, @@ -57,8 +44,6 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - TRIDIAG_ELIM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -70,12 +55,8 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) } stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - TRIDIAG_ELIM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,8 +68,6 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) } stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_HIP; - } else { getCout() << "\n TRIDIAG_ELIM : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp index 7cf12fd3f..59a8a323c 100644 --- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp @@ -26,22 +26,6 @@ namespace lcals // const size_t threads_per_team = 256; -#define TRIDIAG_ELIM_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(xout, m_xout, m_N, did, hid); \ - allocAndInitOpenMPDeviceData(xin, m_xin, m_N, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, m_N, did, hid); \ - allocAndInitOpenMPDeviceData(z, m_z, m_N, did, hid); - -#define TRIDIAG_ELIM_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_xout, xout, m_N, hid, did); \ - deallocOpenMPDeviceData(xout, did); \ - deallocOpenMPDeviceData(xin, did); \ - deallocOpenMPDeviceData(y, did); \ - deallocOpenMPDeviceData(z, did); - void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -53,8 +37,6 @@ void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ if ( vid == Base_OpenMPTarget ) { - TRIDIAG_ELIM_DATA_SETUP_OMP_TARGET - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -67,12 +49,8 @@ void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_OMP_TARGET - } else if ( vid == RAJA_OpenMPTarget ) { - TRIDIAG_ELIM_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -84,8 +62,6 @@ void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_OMP_TARGET - } else { getCout() << "\n TRIDIAG_ELIM : Unknown OMP Tagretvariant id = " << vid << std::endl; } From 
415798de18f8733097f50df312dc2977250312f9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 21:23:16 -0700 Subject: [PATCH 090/137] Remove polybench cuda/hip/omptarget data setup --- src/polybench/POLYBENCH_2MM-Cuda.cpp | 29 --------------- src/polybench/POLYBENCH_2MM-Hip.cpp | 28 --------------- src/polybench/POLYBENCH_2MM-OMPTarget.cpp | 28 --------------- src/polybench/POLYBENCH_3MM-Cuda.cpp | 32 ----------------- src/polybench/POLYBENCH_3MM-Hip.cpp | 32 ----------------- src/polybench/POLYBENCH_3MM-OMPTarget.cpp | 31 ---------------- src/polybench/POLYBENCH_ADI-Cuda.cpp | 26 -------------- src/polybench/POLYBENCH_ADI-Hip.cpp | 27 -------------- src/polybench/POLYBENCH_ADI-OMPTarget.cpp | 25 ------------- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 27 -------------- src/polybench/POLYBENCH_ATAX-Hip.cpp | 27 -------------- src/polybench/POLYBENCH_ATAX-OMPTarget.cpp | 26 -------------- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 26 -------------- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 26 -------------- src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp | 25 ------------- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 23 ------------ .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 22 ------------ .../POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp | 22 ------------ src/polybench/POLYBENCH_GEMM-Cuda.cpp | 25 ------------- src/polybench/POLYBENCH_GEMM-Hip.cpp | 25 ------------- src/polybench/POLYBENCH_GEMM-OMPTarget.cpp | 24 ------------- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 36 ------------------- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 36 ------------------- src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp | 36 ------------------- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 23 ------------ src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 23 ------------ src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp | 26 -------------- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 25 ------------- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 25 ------------- src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp | 23 ------------ src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 20 ----------- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 20 ----------- .../POLYBENCH_JACOBI_1D-OMPTarget.cpp | 23 ------------ src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 24 ------------- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 24 ------------- .../POLYBENCH_JACOBI_2D-OMPTarget.cpp | 23 ------------ src/polybench/POLYBENCH_MVT-Cuda.cpp | 26 -------------- src/polybench/POLYBENCH_MVT-Hip.cpp | 26 -------------- src/polybench/POLYBENCH_MVT-OMPTarget.cpp | 29 --------------- 39 files changed, 1024 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index d10ca613d..4a22d2b09 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -44,23 +44,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_2MM_DATA_SETUP_CUDA \ - allocAndInitCudaData(tmp, m_tmp, m_ni * m_nj); \ - allocAndInitCudaData(A, m_A, m_ni * m_nk); \ - allocAndInitCudaData(B, m_B, m_nk * m_nj); \ - allocAndInitCudaData(C, m_C, m_nj * m_nl); \ - allocAndInitCudaData(D, m_D, m_ni * m_nl); - - -#define POLYBENCH_2MM_TEARDOWN_CUDA \ - getCudaData(m_D, D, m_ni * m_nl); \ - deallocCudaData(tmp); \ - deallocCudaData(A); \ - deallocCudaData(B); \ - deallocCudaData(C); \ - deallocCudaData(D); - - template < size_t in_block_size, size_t out_block_size > __launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, @@ -133,8 +116,6 @@ void 
POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_2MM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -155,12 +136,8 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_2MM_TEARDOWN_CUDA; - } else if (vid == Lambda_CUDA) { - POLYBENCH_2MM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -195,12 +172,8 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_2MM_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_2MM_DATA_SETUP_CUDA; - POLYBENCH_2MM_VIEWS_RAJA; using EXEC_POL = @@ -268,8 +241,6 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_2MM_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_2MM : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 5f50f1424..d29fb1e97 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -44,22 +44,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_2MM_DATA_SETUP_HIP \ - allocAndInitHipData(tmp, m_tmp, m_ni * m_nj); \ - allocAndInitHipData(A, m_A, m_ni * m_nk); \ - allocAndInitHipData(B, m_B, m_nk * m_nj); \ - allocAndInitHipData(C, m_C, m_nj * m_nl); \ - allocAndInitHipData(D, m_D, m_ni * m_nl); - - -#define POLYBENCH_2MM_TEARDOWN_HIP \ - getHipData(m_D, D, m_ni * m_nl); \ - deallocHipData(tmp); \ - deallocHipData(A); \ - deallocHipData(B); \ - deallocHipData(C); \ - deallocHipData(D); - template < size_t in_block_size, size_t out_block_size > __launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, @@ -132,8 +116,6 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_2MM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -156,12 +138,8 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_2MM_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { - POLYBENCH_2MM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -198,12 +176,8 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_2MM_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_2MM_DATA_SETUP_HIP; - POLYBENCH_2MM_VIEWS_RAJA; using EXEC_POL = @@ -271,8 +245,6 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_2MM_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_2MM : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp index a5010753e..79d6f96c0 100644 --- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp @@ -21,26 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_2MM_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(tmp, m_tmp, m_ni * m_nj, did, hid); \ - allocAndInitOpenMPDeviceData(A, m_A, m_ni * m_nk, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_B, m_nk * m_nj, did, hid); \ - allocAndInitOpenMPDeviceData(C, m_C, m_nj * m_nl, did, hid); \ - allocAndInitOpenMPDeviceData(D, m_D, m_ni * m_nl, did, hid); - - -#define POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_D, D, m_ni * m_nl, hid, did); \ - 
deallocOpenMPDeviceData(tmp, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); \ - deallocOpenMPDeviceData(C, did); \ - deallocOpenMPDeviceData(D, did); - - void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -49,8 +29,6 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED if ( vid == Base_OpenMPTarget ) { - POLYBENCH_2MM_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -81,12 +59,8 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - POLYBENCH_2MM_DATA_SETUP_OMP_TARGET; - POLYBENCH_2MM_VIEWS_RAJA; using EXEC_POL = @@ -141,8 +115,6 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_2MM : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index eee8a762a..8a0415e4c 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -49,26 +49,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_3MM_DATA_SETUP_CUDA \ - allocAndInitCudaData(A, m_A, m_ni * m_nk); \ - allocAndInitCudaData(B, m_B, m_nk * m_nj); \ - allocAndInitCudaData(C, m_C, m_nj * m_nm); \ - allocAndInitCudaData(D, m_D, m_nm * m_nl); \ - allocAndInitCudaData(E, m_E, m_ni * m_nj); \ - allocAndInitCudaData(F, m_F, m_nj * m_nl); \ - allocAndInitCudaData(G, m_G, m_ni * m_nl); - - -#define POLYBENCH_3MM_TEARDOWN_CUDA \ - getCudaData(m_G, G, m_ni * m_nl); \ - deallocCudaData(A); \ - deallocCudaData(B); \ - deallocCudaData(C); \ - deallocCudaData(D); \ - deallocCudaData(E); \ - deallocCudaData(F); \ - deallocCudaData(G); - template < size_t in_block_size, size_t out_block_size > __launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, @@ -170,8 +150,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_3MM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -198,12 +176,8 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_3MM_TEARDOWN_CUDA; - } else if (vid == Lambda_CUDA) { - POLYBENCH_3MM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -251,12 +225,8 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_3MM_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_3MM_DATA_SETUP_CUDA; - POLYBENCH_3MM_VIEWS_RAJA; using EXEC_POL = @@ -346,8 +316,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_3MM_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_3MM : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 0d08f3dbe..977e43ec6 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -49,26 +49,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_3MM_DATA_SETUP_HIP \ - allocAndInitHipData(A, m_A, m_ni * m_nk); \ - allocAndInitHipData(B, m_B, m_nk * m_nj); \ - allocAndInitHipData(C, m_C, m_nj * m_nm); \ - allocAndInitHipData(D, m_D, m_nm * 
m_nl); \ - allocAndInitHipData(E, m_E, m_ni * m_nj); \ - allocAndInitHipData(F, m_F, m_nj * m_nl); \ - allocAndInitHipData(G, m_G, m_ni * m_nl); - - -#define POLYBENCH_3MM_TEARDOWN_HIP \ - getHipData(m_G, G, m_ni * m_nl); \ - deallocHipData(A); \ - deallocHipData(B); \ - deallocHipData(C); \ - deallocHipData(D); \ - deallocHipData(E); \ - deallocHipData(F); \ - deallocHipData(G); - template < size_t in_block_size, size_t out_block_size > __launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, @@ -169,8 +149,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_3MM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -200,12 +178,8 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_3MM_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { - POLYBENCH_3MM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -256,12 +230,8 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_3MM_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_3MM_DATA_SETUP_HIP; - POLYBENCH_3MM_VIEWS_RAJA; using EXEC_POL = @@ -351,8 +321,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_3MM_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_3MM : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp index a760a590a..c25a49dee 100644 --- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp @@ -21,29 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_3MM_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(A, m_A, m_ni * m_nk, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_B, m_nk * m_nj, did, hid); \ - allocAndInitOpenMPDeviceData(C, m_C, m_nj * m_nm, did, hid); \ - allocAndInitOpenMPDeviceData(D, m_D, m_nm * m_nl, did, hid); \ - allocAndInitOpenMPDeviceData(E, m_E, m_ni * m_nj, did, hid); \ - allocAndInitOpenMPDeviceData(F, m_F, m_nj * m_nl, did, hid); \ - allocAndInitOpenMPDeviceData(G, m_G, m_ni * m_nl, did, hid); - - -#define POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_G, G, m_ni * m_nl, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); \ - deallocOpenMPDeviceData(C, did); \ - deallocOpenMPDeviceData(D, did); \ - deallocOpenMPDeviceData(E, did); \ - deallocOpenMPDeviceData(F, did); \ - deallocOpenMPDeviceData(G, did); - void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -52,8 +29,6 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED if ( vid == Base_OpenMPTarget ) { - POLYBENCH_3MM_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -96,12 +71,8 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - POLYBENCH_3MM_DATA_SETUP_OMP_TARGET; - POLYBENCH_3MM_VIEWS_RAJA; using EXEC_POL = @@ -182,8 +153,6 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n 
POLYBENCH_3MM : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index 2e2b03972..b676dfd08 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -21,20 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_ADI_DATA_SETUP_CUDA \ - allocAndInitCudaData(U, m_U, m_n * m_n); \ - allocAndInitCudaData(V, m_V, m_n * m_n); \ - allocAndInitCudaData(P, m_P, m_n * m_n); \ - allocAndInitCudaData(Q, m_Q, m_n * m_n); - -#define POLYBENCH_ADI_TEARDOWN_CUDA \ - getCudaData(m_U, U, m_n * m_n); \ - deallocCudaData(U); \ - deallocCudaData(V); \ - deallocCudaData(P); \ - deallocCudaData(Q); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void adi1(const Index_type n, @@ -96,8 +82,6 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_ADI_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -120,12 +104,8 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ADI_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_ADI_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -166,12 +146,8 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ADI_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_ADI_DATA_SETUP_CUDA; - POLYBENCH_ADI_VIEWS_RAJA; using EXEC_POL = @@ -241,8 +217,6 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_ADI_TEARDOWN_CUDA - } else { getCout() << "\n POLYBENCH_ADI : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index c224d8741..fb79054a2 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -21,21 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_ADI_DATA_SETUP_HIP \ - allocAndInitHipData(U, m_U, m_n * m_n); \ - allocAndInitHipData(V, m_V, m_n * m_n); \ - allocAndInitHipData(P, m_P, m_n * m_n); \ - allocAndInitHipData(Q, m_Q, m_n * m_n); - - -#define POLYBENCH_ADI_TEARDOWN_HIP \ - getHipData(m_U, U, m_n * m_n); \ - deallocHipData(U); \ - deallocHipData(V); \ - deallocHipData(P); \ - deallocHipData(Q); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void adi1(const Index_type n, @@ -97,8 +82,6 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_ADI_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -125,12 +108,8 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ADI_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_ADI_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -175,12 +154,8 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ADI_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_ADI_DATA_SETUP_HIP; - POLYBENCH_ADI_VIEWS_RAJA; using EXEC_POL = @@ -250,8 +225,6 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_ADI_TEARDOWN_HIP - } else { getCout() << "\n POLYBENCH_ADI : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index cb159522b..c67e5a20a 
100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -26,23 +26,6 @@ namespace polybench // const size_t threads_per_team = 256; -#define POLYBENCH_ADI_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(U, m_U, m_n * m_n, did, hid); \ - allocAndInitOpenMPDeviceData(V, m_V, m_n * m_n, did, hid); \ - allocAndInitOpenMPDeviceData(P, m_P, m_n * m_n, did, hid); \ - allocAndInitOpenMPDeviceData(Q, m_Q, m_n * m_n, did, hid); - -#define POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_U, U, m_n * m_n, hid, did); \ - deallocOpenMPDeviceData(U, did); \ - deallocOpenMPDeviceData(V, did); \ - deallocOpenMPDeviceData(P, did); \ - deallocOpenMPDeviceData(Q, did); - - void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -51,8 +34,6 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED if ( vid == Base_OpenMPTarget ) { - POLYBENCH_ADI_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -89,12 +70,8 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } // run_reps stopTimer(); - POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - POLYBENCH_ADI_DATA_SETUP_OMP_TARGET; - POLYBENCH_ADI_VIEWS_RAJA; using EXEC_POL = @@ -159,8 +136,6 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } // run_reps stopTimer(); - POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_ADI : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 53b13aa2d..ab3839735 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -21,21 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_ATAX_DATA_SETUP_CUDA \ - allocAndInitCudaData(tmp, m_tmp, N); \ - allocAndInitCudaData(y, m_y, N); \ - allocAndInitCudaData(x, m_x, N); \ - allocAndInitCudaData(A, m_A, N * N); - - -#define POLYBENCH_ATAX_TEARDOWN_CUDA \ - getCudaData(m_y, y, N); \ - deallocCudaData(tmp); \ - deallocCudaData(y); \ - deallocCudaData(x); \ - deallocCudaData(A); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, @@ -90,8 +75,6 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_ATAX_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -106,12 +89,8 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_ATAX_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -142,12 +121,8 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_ATAX_DATA_SETUP_CUDA; - POLYBENCH_ATAX_VIEWS_RAJA; using EXEC_POL1 = @@ -223,8 +198,6 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_ATAX : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp 
b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 6bebcfc0d..f233ab227 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -21,21 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_ATAX_DATA_SETUP_HIP \ - allocAndInitHipData(tmp, m_tmp, N); \ - allocAndInitHipData(y, m_y, N); \ - allocAndInitHipData(x, m_x, N); \ - allocAndInitHipData(A, m_A, N * N); - - -#define POLYBENCH_ATAX_TEARDOWN_HIP \ - getHipData(m_y, y, N); \ - deallocHipData(tmp); \ - deallocHipData(y); \ - deallocHipData(x); \ - deallocHipData(A); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, @@ -90,8 +75,6 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_ATAX_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -110,12 +93,8 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_ATAX_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -150,12 +129,8 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_ATAX_DATA_SETUP_HIP; - POLYBENCH_ATAX_VIEWS_RAJA; using EXEC_POL1 = @@ -230,8 +205,6 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_ATAX : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index ed2f83ae7..e9e13e9cd 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -26,24 +26,6 @@ namespace polybench // const size_t threads_per_team = 256; -#define POLYBENCH_ATAX_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(tmp, m_tmp, N, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, N, did, hid); \ - allocAndInitOpenMPDeviceData(x, m_x, N, did, hid); \ - allocAndInitOpenMPDeviceData(A, m_A, N * N, did, hid); - - -#define POLYBENCH_ATAX_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_y, y, N, hid, did); \ - deallocOpenMPDeviceData(tmp, did); \ - deallocOpenMPDeviceData(y, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(A, did); - - void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -52,8 +34,6 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE if ( vid == Base_OpenMPTarget ) { - POLYBENCH_ATAX_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,12 +60,8 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_OMP_TARGET; - } else if (vid == RAJA_OpenMPTarget) { - POLYBENCH_ATAX_DATA_SETUP_OMP_TARGET; - POLYBENCH_ATAX_VIEWS_RAJA; using EXEC_POL1 = @@ -150,8 +126,6 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE } stopTimer(); - POLYBENCH_ATAX_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_ATAX : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp 
b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 4e6b56a0c..d5ee6f167 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -39,20 +39,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_FDTD_2D_DATA_SETUP_CUDA \ - allocAndInitCudaData(hz, m_hz, m_nx * m_ny); \ - allocAndInitCudaData(ex, m_ex, m_nx * m_ny); \ - allocAndInitCudaData(ey, m_ey, m_nx * m_ny); \ - allocAndInitCudaData(fict, m_fict, m_tsteps); - - -#define POLYBENCH_FDTD_2D_TEARDOWN_CUDA \ - getCudaData(m_hz, hz, m_nx * m_ny); \ - deallocCudaData(ex); \ - deallocCudaData(ey); \ - deallocCudaData(fict); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_fdtd2d_1(Real_ptr ey, Real_ptr fict, @@ -164,8 +150,6 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_FDTD_2D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -196,12 +180,8 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_FDTD_2D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -247,12 +227,8 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_FDTD_2D_DATA_SETUP_CUDA; - POLYBENCH_FDTD_2D_VIEWS_RAJA; using EXEC_POL1 = RAJA::cuda_exec; @@ -313,8 +289,6 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_FDTD_2D : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index 8e7602ea7..4738fd4d9 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -38,20 +38,6 @@ namespace polybench static_cast(RAJA_DIVIDE_CEILING_INT(nx, i_block_sz)), \ static_cast(1)); -#define POLYBENCH_FDTD_2D_DATA_SETUP_HIP \ - allocAndInitHipData(hz, m_hz, m_nx * m_ny); \ - allocAndInitHipData(ex, m_ex, m_nx * m_ny); \ - allocAndInitHipData(ey, m_ey, m_nx * m_ny); \ - allocAndInitHipData(fict, m_fict, m_tsteps); - - -#define POLYBENCH_FDTD_2D_TEARDOWN_HIP \ - getHipData(m_hz, hz, m_nx * m_ny); \ - deallocHipData(ex); \ - deallocHipData(ey); \ - deallocHipData(fict); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_fdtd2d_1(Real_ptr ey, Real_ptr fict, @@ -163,8 +149,6 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_FDTD_2D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -199,12 +183,8 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_FDTD_2D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -259,12 +239,8 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_FDTD_2D_DATA_SETUP_HIP; - POLYBENCH_FDTD_2D_VIEWS_RAJA; using EXEC_POL1 = RAJA::hip_exec; @@ -325,8 +301,6 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) } // run_reps stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_HIP; - } else { getCout() << "\n 
POLYBENCH_FDTD_2D : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index b8e51f834..c34d939ad 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -26,23 +26,6 @@ namespace polybench // const size_t threads_per_team = 256; -#define POLYBENCH_FDTD_2D_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(hz, m_hz, m_nx * m_ny, did, hid); \ - allocAndInitOpenMPDeviceData(ex, m_ex, m_nx * m_ny, did, hid); \ - allocAndInitOpenMPDeviceData(ey, m_ey, m_nx * m_ny, did, hid); \ - allocAndInitOpenMPDeviceData(fict, m_fict, m_tsteps, did, hid); - - -#define POLYBENCH_FDTD_2D_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_hz, hz, m_nx * m_ny, hid, did); \ - deallocOpenMPDeviceData(ex, did); \ - deallocOpenMPDeviceData(ey, did); \ - deallocOpenMPDeviceData(fict, did); - - void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -51,8 +34,6 @@ void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN if ( vid == Base_OpenMPTarget ) { - POLYBENCH_FDTD_2D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -93,12 +74,8 @@ void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_OMP_TARGET; - } else if (vid == RAJA_OpenMPTarget) { - POLYBENCH_FDTD_2D_DATA_SETUP_OMP_TARGET; - POLYBENCH_FDTD_2D_VIEWS_RAJA; using EXEC_POL1 = RAJA::omp_target_parallel_for_exec; @@ -150,8 +127,6 @@ void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } // run_reps stopTimer(); - POLYBENCH_FDTD_2D_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_FDTD_2D : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 6bee94639..3e1cddca4 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -37,17 +37,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_CUDA \ - allocAndInitCudaData(pin, m_pin, m_N * m_N); \ - allocAndInitCudaData(pout, m_pout, m_N * m_N); - - -#define POLYBENCH_FLOYD_WARSHALL_TEARDOWN_CUDA \ - getCudaData(m_pout, pout, m_N * m_N); \ - deallocCudaData(pin); \ - deallocCudaData(pout); - - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, @@ -85,8 +74,6 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -105,12 +92,8 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -131,12 +114,8 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_CUDA; - 
POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; using EXEC_POL = @@ -172,8 +151,6 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 0c7fdfa1c..e8bd61f48 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -37,16 +37,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_HIP \ - allocAndInitHipData(pin, m_pin, m_N * m_N); \ - allocAndInitHipData(pout, m_pout, m_N * m_N); - - -#define POLYBENCH_FLOYD_WARSHALL_TEARDOWN_HIP \ - getHipData(m_pout, pout, m_N * m_N); \ - deallocHipData(pin); \ - deallocHipData(pout); - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, @@ -84,8 +74,6 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -105,12 +93,8 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -135,12 +119,8 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_HIP; - POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; using EXEC_POL = @@ -176,8 +156,6 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index 036220707..e0e5d6d54 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -21,20 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(pin, m_pin, m_N * m_N, did, hid); \ - allocAndInitOpenMPDeviceData(pout, m_pout, m_N * m_N, did, hid); - - -#define POLYBENCH_FLOYD_WARSHALL_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_pout, pout, m_N * m_N, hid, did); \ - deallocOpenMPDeviceData(pin, did); \ - deallocOpenMPDeviceData(pout, did); - - void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -43,8 +29,6 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t RAJA if ( vid == Base_OpenMPTarget ) { - POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -63,12 +47,8 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t RAJA } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - 
POLYBENCH_FLOYD_WARSHALL_DATA_SETUP_OMP_TARGET; - POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; using EXEC_POL = @@ -95,8 +75,6 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t RAJA } stopTimer(); - POLYBENCH_FLOYD_WARSHALL_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index b9a12aa8d..9a2886be4 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -39,19 +39,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_GEMM_DATA_SETUP_CUDA \ - allocAndInitCudaData(A, m_A, ni*nk); \ - allocAndInitCudaData(B, m_B, nk*nj); \ - allocAndInitCudaData(C, m_C, ni*nj); - - -#define POLYBENCH_GEMM_TEARDOWN_CUDA \ - getCudaData(m_C, C, ni*nj); \ - deallocCudaData(A); \ - deallocCudaData(B); \ - deallocCudaData(C); - - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, @@ -94,8 +81,6 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_GEMM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -111,12 +96,8 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMM_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_GEMM_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -139,12 +120,8 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMM_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_GEMM_DATA_SETUP_CUDA; - POLYBENCH_GEMM_VIEWS_RAJA; using EXEC_POL = @@ -198,8 +175,6 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMM_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_GEMM : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index df14cc5e8..90b2450ca 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -39,19 +39,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_GEMM_DATA_SETUP_HIP \ - allocAndInitHipData(A, m_A, ni*nk); \ - allocAndInitHipData(B, m_B, nk*nj); \ - allocAndInitHipData(C, m_C, ni*nj); - - -#define POLYBENCH_GEMM_TEARDOWN_HIP \ - getHipData(m_C, C, ni*nj); \ - deallocHipData(A); \ - deallocHipData(B); \ - deallocHipData(C); - - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, @@ -94,8 +81,6 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_GEMM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -111,12 +96,8 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMM_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_GEMM_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -140,12 +121,8 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMM_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_GEMM_DATA_SETUP_HIP; - POLYBENCH_GEMM_VIEWS_RAJA; using EXEC_POL = @@ -199,8 +176,6 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) } 
stopTimer(); - POLYBENCH_GEMM_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_GEMM : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index 6a6c9f25a..a1d618b5b 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -21,22 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_GEMM_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(A, m_A, ni*nk, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_B, nk*nj, did, hid); \ - allocAndInitOpenMPDeviceData(C, m_C, ni*nj, did, hid); - - -#define POLYBENCH_GEMM_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_C, C, ni*nj, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); \ - deallocOpenMPDeviceData(C, did); - - void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -45,8 +29,6 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE if ( vid == Base_OpenMPTarget ) { - POLYBENCH_GEMM_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +48,8 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE } stopTimer(); - POLYBENCH_GEMM_TEARDOWN_OMP_TARGET; - } else if (vid == RAJA_OpenMPTarget) { - POLYBENCH_GEMM_DATA_SETUP_OMP_TARGET; - POLYBENCH_GEMM_VIEWS_RAJA; using EXEC_POL = @@ -116,8 +94,6 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE } stopTimer(); - POLYBENCH_GEMM_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_GEMM : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 46e695623..ed40a7355 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -39,30 +39,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_GEMVER_DATA_SETUP_CUDA \ - allocAndInitCudaData(A, m_A, m_n * m_n); \ - allocAndInitCudaData(u1, m_u1, m_n); \ - allocAndInitCudaData(v1, m_v1, m_n); \ - allocAndInitCudaData(u2, m_u2, m_n); \ - allocAndInitCudaData(v2, m_v2, m_n); \ - allocAndInitCudaData(w, m_w, m_n); \ - allocAndInitCudaData(x, m_x, m_n); \ - allocAndInitCudaData(y, m_y, m_n); \ - allocAndInitCudaData(z, m_z, m_n); - - -#define POLYBENCH_GEMVER_TEARDOWN_CUDA \ - getCudaData(m_w, w, m_n); \ - deallocCudaData(A); \ - deallocCudaData(u1); \ - deallocCudaData(v1); \ - deallocCudaData(u2); \ - deallocCudaData(v2); \ - deallocCudaData(w); \ - deallocCudaData(x); \ - deallocCudaData(y); \ - deallocCudaData(z); - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1(Real_ptr A, @@ -155,8 +131,6 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_GEMVER_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -187,12 +161,8 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMVER_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_GEMVER_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -241,12 +211,8 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) 
} stopTimer(); - POLYBENCH_GEMVER_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_GEMVER_DATA_SETUP_CUDA; - POLYBENCH_GEMVER_VIEWS_RAJA; using EXEC_POL1 = @@ -335,8 +301,6 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMVER_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_GEMVER : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 6579d5173..4d74dc50a 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -39,30 +39,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_GEMVER_DATA_SETUP_HIP \ - allocAndInitHipData(A, m_A, m_n * m_n); \ - allocAndInitHipData(u1, m_u1, m_n); \ - allocAndInitHipData(v1, m_v1, m_n); \ - allocAndInitHipData(u2, m_u2, m_n); \ - allocAndInitHipData(v2, m_v2, m_n); \ - allocAndInitHipData(w, m_w, m_n); \ - allocAndInitHipData(x, m_x, m_n); \ - allocAndInitHipData(y, m_y, m_n); \ - allocAndInitHipData(z, m_z, m_n); - - -#define POLYBENCH_GEMVER_TEARDOWN_HIP \ - getHipData(m_w, w, m_n); \ - deallocHipData(A); \ - deallocHipData(u1); \ - deallocHipData(v1); \ - deallocHipData(u2); \ - deallocHipData(v2); \ - deallocHipData(w); \ - deallocHipData(x); \ - deallocHipData(y); \ - deallocHipData(z); - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1(Real_ptr A, @@ -155,8 +131,6 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_GEMVER_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -188,12 +162,8 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMVER_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_GEMVER_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -249,12 +219,8 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMVER_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_GEMVER_DATA_SETUP_HIP; - POLYBENCH_GEMVER_VIEWS_RAJA; using EXEC_POL1 = @@ -343,8 +309,6 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GEMVER_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_GEMVER : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index 39b7b50c8..b12be578a 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -26,34 +26,6 @@ namespace polybench // const size_t threads_per_team = 256; -#define POLYBENCH_GEMVER_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(A, m_A, m_n * m_n, did, hid); \ - allocAndInitOpenMPDeviceData(u1, m_u1, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(v1, m_v1, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(u2, m_u2, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(v2, m_v2, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(w, m_w, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(x, m_x, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(z, m_z, m_n, did, hid); - -#define POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_w, w, m_n, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - 
deallocOpenMPDeviceData(u1, did); \ - deallocOpenMPDeviceData(v1, did); \ - deallocOpenMPDeviceData(u2, did); \ - deallocOpenMPDeviceData(v2, did); \ - deallocOpenMPDeviceData(w, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); \ - deallocOpenMPDeviceData(z, did); - - - void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -62,8 +34,6 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU if ( vid == Base_OpenMPTarget ) { - POLYBENCH_GEMVER_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -104,12 +74,8 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU } // end run_reps stopTimer(); - POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - POLYBENCH_GEMVER_DATA_SETUP_OMP_TARGET; - POLYBENCH_GEMVER_VIEWS_RAJA; using EXEC_POL1 = @@ -184,8 +150,6 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU } stopTimer(); - POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_GEMVER : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 4ab40203a..a15f48542 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -21,21 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_GESUMMV_DATA_SETUP_CUDA \ - allocAndInitCudaData(x, m_x, N); \ - allocAndInitCudaData(y, m_y, N); \ - allocAndInitCudaData(A, m_A, N*N); \ - allocAndInitCudaData(B, m_B, N*N); - - -#define POLYBENCH_GESUMMV_TEARDOWN_CUDA \ - getCudaData(m_y, y, N); \ - deallocCudaData(x); \ - deallocCudaData(y); \ - deallocCudaData(A); \ - deallocCudaData(B); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_gesummv(Real_ptr x, Real_ptr y, @@ -64,8 +49,6 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_GESUMMV_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,12 +63,8 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GESUMMV_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_GESUMMV_DATA_SETUP_CUDA; - POLYBENCH_GESUMMV_VIEWS_RAJA; using EXEC_POL = @@ -130,8 +109,6 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GESUMMV_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_GESUMMV : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 55285f0d5..c06ed3822 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -21,21 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_GESUMMV_DATA_SETUP_HIP \ - allocAndInitHipData(x, m_x, N); \ - allocAndInitHipData(y, m_y, N); \ - allocAndInitHipData(A, m_A, N*N); \ - allocAndInitHipData(B, m_B, N*N); - - -#define POLYBENCH_GESUMMV_TEARDOWN_HIP \ - getHipData(m_y, y, N); \ - deallocHipData(x); \ - deallocHipData(y); \ - deallocHipData(A); \ - deallocHipData(B); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_gesummv(Real_ptr x, Real_ptr y, @@ -64,8 +49,6 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) if ( vid 
== Base_HIP ) { - POLYBENCH_GESUMMV_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -82,12 +65,8 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GESUMMV_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_GESUMMV_DATA_SETUP_HIP; - POLYBENCH_GESUMMV_VIEWS_RAJA; using EXEC_POL = @@ -132,8 +111,6 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_GESUMMV_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_GESUMMV : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index f25e85cfb..86e73b293 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -26,24 +26,6 @@ namespace polybench // const size_t threads_per_team = 256; -#define POLYBENCH_GESUMMV_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, N, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, N, did, hid); \ - allocAndInitOpenMPDeviceData(A, m_A, N*N, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_B, N*N, did, hid); - - -#define POLYBENCH_GESUMMV_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_y, y, N, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); - - void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -52,8 +34,6 @@ void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN if ( vid == Base_OpenMPTarget ) { - POLYBENCH_GESUMMV_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -70,12 +50,8 @@ void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } stopTimer(); - POLYBENCH_GESUMMV_TEARDOWN_OMP_TARGET; - } else if (vid == RAJA_OpenMPTarget) { - POLYBENCH_GESUMMV_DATA_SETUP_OMP_TARGET; - POLYBENCH_GESUMMV_VIEWS_RAJA; using EXEC_POL = @@ -115,8 +91,6 @@ void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } stopTimer(); - POLYBENCH_GESUMMV_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_GESUMMV : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 93515e260..14533d6e6 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -40,19 +40,6 @@ namespace polybench static_cast(RAJA_DIVIDE_CEILING_INT(N-2, i_block_sz))); -#define POLYBENCH_HEAT_3D_DATA_SETUP_CUDA \ - allocAndInitCudaData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitCudaData(B, m_Binit, m_N*m_N*m_N); \ - static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); - - -#define POLYBENCH_HEAT_3D_TEARDOWN_CUDA \ - getCudaData(m_A, A, m_N*m_N*m_N); \ - getCudaData(m_B, B, m_N*m_N*m_N); \ - deallocCudaData(A); \ - deallocCudaData(B); - - template < size_t k_block_size, size_t j_block_size, size_t i_block_size > __launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_1(Real_ptr A, Real_ptr B, Index_type N) @@ -102,8 +89,6 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_HEAT_3D_DATA_SETUP_CUDA; - startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -125,12 +110,8 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_HEAT_3D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -160,12 +141,8 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_HEAT_3D_DATA_SETUP_CUDA; - POLYBENCH_HEAT_3D_VIEWS_RAJA; using EXEC_POL = @@ -214,8 +191,6 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_HEAT_3D : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 7fb09bbf6..5a0541649 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -40,19 +40,6 @@ namespace polybench static_cast(RAJA_DIVIDE_CEILING_INT(N-2, i_block_sz))); -#define POLYBENCH_HEAT_3D_DATA_SETUP_HIP \ - allocAndInitHipData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitHipData(B, m_Binit, m_N*m_N*m_N); \ - static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); - - -#define POLYBENCH_HEAT_3D_TEARDOWN_HIP \ - getHipData(m_A, A, m_N*m_N*m_N); \ - getHipData(m_B, B, m_N*m_N*m_N); \ - deallocHipData(A); \ - deallocHipData(B); - - template < size_t k_block_size, size_t j_block_size, size_t i_block_size > __launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_1(Real_ptr A, Real_ptr B, Index_type N) @@ -102,8 +89,6 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_HEAT_3D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -127,12 +112,8 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_HEAT_3D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -167,12 +148,8 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_HEAT_3D_DATA_SETUP_HIP; - POLYBENCH_HEAT_3D_VIEWS_RAJA; using EXEC_POL = @@ -220,8 +197,6 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_HEAT_3D : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index 112f07ee1..7a70c3f87 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -21,21 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_HEAT_3D_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(A, m_Ainit, m_N*m_N*m_N, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_Binit, m_N*m_N*m_N, did, hid); - - -#define POLYBENCH_HEAT_3D_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_A, A, m_N*m_N*m_N, hid, did); \ - getOpenMPDeviceData(m_B, B, m_N*m_N*m_N, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); - - void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -44,8 +29,6 @@ void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN if ( vid == Base_OpenMPTarget ) { - POLYBENCH_HEAT_3D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -76,12 +59,8 @@ void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_OMP_TARGET; - } else if (vid == RAJA_OpenMPTarget) { - POLYBENCH_HEAT_3D_DATA_SETUP_OMP_TARGET; - POLYBENCH_HEAT_3D_VIEWS_RAJA; using EXEC_POL = @@ -118,8 +97,6 @@ void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } stopTimer(); - POLYBENCH_HEAT_3D_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_HEAT_3D : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index 567b75a24..a5d181345 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -21,18 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_JACOBI_1D_DATA_SETUP_CUDA \ - allocAndInitCudaData(A, m_Ainit, m_N); \ - allocAndInitCudaData(B, m_Binit, m_N); - - -#define POLYBENCH_JACOBI_1D_TEARDOWN_CUDA \ - getCudaData(m_A, A, m_N); \ - getCudaData(m_B, B, m_N); \ - deallocCudaData(A); \ - deallocCudaData(B); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_jacobi_1D_1(Real_ptr A, Real_ptr B, Index_type N) @@ -65,8 +53,6 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_JACOBI_1D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -85,12 +71,8 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_1D_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_JACOBI_1D_DATA_SETUP_CUDA; - using EXEC_POL = RAJA::cuda_exec; startTimer(); @@ -113,8 +95,6 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_1D_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_JACOBI_1D : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index bd1ccebfd..505b8cb91 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -21,18 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_JACOBI_1D_DATA_SETUP_HIP \ - allocAndInitHipData(A, m_Ainit, m_N); \ - allocAndInitHipData(B, m_Binit, m_N); - - -#define POLYBENCH_JACOBI_1D_TEARDOWN_HIP \ - getHipData(m_A, A, m_N); \ - getHipData(m_B, B, m_N); \ - deallocHipData(A); \ - deallocHipData(B); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_jacobi_1D_1(Real_ptr A, Real_ptr B, Index_type N) @@ -65,8 +53,6 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_JACOBI_1D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,12 +73,8 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_1D_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_JACOBI_1D_DATA_SETUP_HIP; - using EXEC_POL = RAJA::hip_exec; startTimer(); @@ -115,8 +97,6 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) } stopTimer(); - 
POLYBENCH_JACOBI_1D_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_JACOBI_1D : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index 13b9b5375..39a2423df 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -26,21 +26,6 @@ namespace polybench // const size_t threads_per_team = 256; -#define POLYBENCH_JACOBI_1D_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(A, m_Ainit, m_N, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_Binit, m_N, did, hid); - - -#define POLYBENCH_JACOBI_1D_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_A, A, m_N, hid, did); \ - getOpenMPDeviceData(m_B, B, m_N, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); - - void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -49,8 +34,6 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_ if ( vid == Base_OpenMPTarget ) { - POLYBENCH_JACOBI_1D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -72,12 +55,8 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_ } stopTimer(); - POLYBENCH_JACOBI_1D_TEARDOWN_OMP_TARGET; - } else if (vid == RAJA_OpenMPTarget ) { - POLYBENCH_JACOBI_1D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -98,8 +77,6 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_ } stopTimer(); - POLYBENCH_JACOBI_1D_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_JACOBI_1D : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 19e83da1c..61258a6f5 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -39,18 +39,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA \ - allocAndInitCudaData(A, m_Ainit, m_N*m_N); \ - allocAndInitCudaData(B, m_Binit, m_N*m_N); - - -#define POLYBENCH_JACOBI_2D_TEARDOWN_CUDA \ - getCudaData(m_A, A, m_N*m_N); \ - getCudaData(m_B, B, m_N*m_N); \ - deallocCudaData(A); \ - deallocCudaData(B); - - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_1(Real_ptr A, Real_ptr B, Index_type N) @@ -97,8 +85,6 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -120,12 +106,8 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -155,12 +137,8 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA; - POLYBENCH_JACOBI_2D_VIEWS_RAJA; using EXEC_POL = @@ -204,8 +182,6 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) } stopTimer(); - 
POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 53d93759c..9f4ba7009 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -39,18 +39,6 @@ namespace polybench static_cast(1)); -#define POLYBENCH_JACOBI_2D_DATA_SETUP_HIP \ - allocAndInitHipData(A, m_Ainit, m_N*m_N); \ - allocAndInitHipData(B, m_Binit, m_N*m_N); - - -#define POLYBENCH_JACOBI_2D_TEARDOWN_HIP \ - getHipData(m_A, A, m_N*m_N); \ - getHipData(m_B, B, m_N*m_N); \ - deallocHipData(A); \ - deallocHipData(B); - - template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_1(Real_ptr A, Real_ptr B, Index_type N) @@ -97,8 +85,6 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_JACOBI_2D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -122,12 +108,8 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_2D_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - POLYBENCH_JACOBI_2D_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -161,12 +143,8 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_2D_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_JACOBI_2D_DATA_SETUP_HIP; - POLYBENCH_JACOBI_2D_VIEWS_RAJA; using EXEC_POL = @@ -210,8 +188,6 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_JACOBI_2D_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index 752115bce..97806cfac 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -21,21 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(A, m_Ainit, m_N*m_N, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_Binit, m_N*m_N, did, hid); - - -#define POLYBENCH_JACOBI_2D_TEARDOWN_CUDA \ - getOpenMPDeviceData(m_A, A, m_N*m_N, hid, did); \ - getOpenMPDeviceData(m_B, B, m_N*m_N, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); - - void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -44,8 +29,6 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_ if ( vid == Base_OpenMPTarget ) { - POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -72,12 +55,8 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_ } stopTimer(); - POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; - } else if (vid == RAJA_OpenMPTarget) { - POLYBENCH_JACOBI_2D_DATA_SETUP_CUDA; - POLYBENCH_JACOBI_2D_VIEWS_RAJA; using EXEC_POL = @@ -112,8 +91,6 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_ } stopTimer(); - POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_JACOBI_2D : Unknown OMP Target 
variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 94b27616d..d4d92e644 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -21,24 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_MVT_DATA_SETUP_CUDA \ - allocAndInitCudaData(x1, m_x1, N); \ - allocAndInitCudaData(x2, m_x2, N); \ - allocAndInitCudaData(y1, m_y1, N); \ - allocAndInitCudaData(y2, m_y2, N); \ - allocAndInitCudaData(A, m_A, N * N); - - -#define POLYBENCH_MVT_TEARDOWN_CUDA \ - getCudaData(m_x1, x1, N); \ - getCudaData(m_x2, x2, N); \ - deallocCudaData(x1); \ - deallocCudaData(x2); \ - deallocCudaData(y1); \ - deallocCudaData(y2); \ - deallocCudaData(A); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, @@ -81,8 +63,6 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - POLYBENCH_MVT_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -97,12 +77,8 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_MVT_TEARDOWN_CUDA; - } else if (vid == RAJA_CUDA) { - POLYBENCH_MVT_DATA_SETUP_CUDA; - POLYBENCH_MVT_VIEWS_RAJA; using EXEC_POL = @@ -171,8 +147,6 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_MVT_TEARDOWN_CUDA; - } else { getCout() << "\n POLYBENCH_MVT : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 65ef62764..59fdeaab9 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -21,24 +21,6 @@ namespace rajaperf namespace polybench { -#define POLYBENCH_MVT_DATA_SETUP_HIP \ - allocAndInitHipData(x1, m_x1, N); \ - allocAndInitHipData(x2, m_x2, N); \ - allocAndInitHipData(y1, m_y1, N); \ - allocAndInitHipData(y2, m_y2, N); \ - allocAndInitHipData(A, m_A, N * N); - - -#define POLYBENCH_MVT_TEARDOWN_HIP \ - getHipData(m_x1, x1, N); \ - getHipData(m_x2, x2, N); \ - deallocHipData(x1); \ - deallocHipData(x2); \ - deallocHipData(y1); \ - deallocHipData(y2); \ - deallocHipData(A); - - template < size_t block_size > __launch_bounds__(block_size) __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, @@ -81,8 +63,6 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - POLYBENCH_MVT_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -101,12 +81,8 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_MVT_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { - POLYBENCH_MVT_DATA_SETUP_HIP; - POLYBENCH_MVT_VIEWS_RAJA; using EXEC_POL = @@ -169,8 +145,6 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) } stopTimer(); - POLYBENCH_MVT_TEARDOWN_HIP; - } else { getCout() << "\n POLYBENCH_MVT : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index 724ec3348..c9ff17751 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -26,27 +26,6 @@ namespace polybench // const size_t threads_per_team = 256; -#define POLYBENCH_MVT_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x1, m_x1, N, did, hid); \ - 
allocAndInitOpenMPDeviceData(x2, m_x2, N, did, hid); \ - allocAndInitOpenMPDeviceData(y1, m_y1, N, did, hid); \ - allocAndInitOpenMPDeviceData(y2, m_y2, N, did, hid); \ - allocAndInitOpenMPDeviceData(A, m_A, N * N, did, hid); - - -#define POLYBENCH_MVT_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_x1, x1, N, hid, did); \ - getOpenMPDeviceData(m_x2, x2, N, hid, did); \ - deallocOpenMPDeviceData(x1, did); \ - deallocOpenMPDeviceData(x2, did); \ - deallocOpenMPDeviceData(y1, did); \ - deallocOpenMPDeviceData(y2, did); \ - deallocOpenMPDeviceData(A, did); - - void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -55,8 +34,6 @@ void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED if ( vid == Base_OpenMPTarget ) { - POLYBENCH_MVT_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -83,12 +60,8 @@ void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_MVT_TEARDOWN_OMP_TARGET; - } else if (vid == RAJA_OpenMPTarget) { - POLYBENCH_MVT_DATA_SETUP_OMP_TARGET; - POLYBENCH_MVT_VIEWS_RAJA; using EXEC_POL = @@ -146,8 +119,6 @@ void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_MVT_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n POLYBENCH_MVT : Unknown OMP Target variant id = " << vid << std::endl; } From 2225ab0dbb9e6f5ee1f56a877e181d7a409e3208 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Mar 2023 21:25:09 -0700 Subject: [PATCH 091/137] Remove stream cuda/hip/omptarget data setup --- src/stream/ADD-Cuda.cpp | 23 ----------------------- src/stream/ADD-Hip.cpp | 23 ----------------------- src/stream/ADD-OMPTarget.cpp | 22 ---------------------- src/stream/COPY-Cuda.cpp | 21 --------------------- src/stream/COPY-Hip.cpp | 21 --------------------- src/stream/COPY-OMPTarget.cpp | 21 --------------------- src/stream/DOT-Cuda.cpp | 16 ---------------- src/stream/DOT-Hip.cpp | 16 ---------------- src/stream/DOT-OMPTarget.cpp | 19 ------------------- src/stream/MUL-Cuda.cpp | 21 --------------------- src/stream/MUL-Hip.cpp | 21 --------------------- src/stream/MUL-OMPTarget.cpp | 20 -------------------- src/stream/TRIAD-Cuda.cpp | 23 ----------------------- src/stream/TRIAD-Hip.cpp | 23 ----------------------- src/stream/TRIAD-OMPTarget.cpp | 22 ---------------------- 15 files changed, 312 deletions(-) diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 5dc4e031e..2cecdb6f4 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -22,17 +22,6 @@ namespace rajaperf namespace stream { -#define ADD_DATA_SETUP_CUDA \ - allocAndInitCudaData(a, m_a, iend); \ - allocAndInitCudaData(b, m_b, iend); \ - allocAndInitCudaData(c, m_c, iend); - -#define ADD_DATA_TEARDOWN_CUDA \ - getCudaData(m_c, c, iend); \ - deallocCudaData(a); \ - deallocCudaData(b); \ - deallocCudaData(c); - template < size_t block_size > __launch_bounds__(block_size) __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, @@ -56,8 +45,6 @@ void ADD::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - ADD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -69,12 +56,8 @@ void ADD::runCudaVariantImpl(VariantID vid) } stopTimer(); - ADD_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - ADD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -88,12 
+71,8 @@ void ADD::runCudaVariantImpl(VariantID vid) } stopTimer(); - ADD_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - ADD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -105,8 +84,6 @@ void ADD::runCudaVariantImpl(VariantID vid) } stopTimer(); - ADD_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n ADD : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index f9018cfd3..e384974ca 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -21,17 +21,6 @@ namespace rajaperf namespace stream { -#define ADD_DATA_SETUP_HIP \ - allocAndInitHipData(a, m_a, iend); \ - allocAndInitHipData(b, m_b, iend); \ - allocAndInitHipData(c, m_c, iend); - -#define ADD_DATA_TEARDOWN_HIP \ - getHipData(m_c, c, iend); \ - deallocHipData(a); \ - deallocHipData(b); \ - deallocHipData(c); - template < size_t block_size > __launch_bounds__(block_size) __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, @@ -55,8 +44,6 @@ void ADD::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - ADD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -68,12 +55,8 @@ void ADD::runHipVariantImpl(VariantID vid) } stopTimer(); - ADD_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - ADD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -89,12 +72,8 @@ void ADD::runHipVariantImpl(VariantID vid) } stopTimer(); - ADD_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - ADD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -106,8 +85,6 @@ void ADD::runHipVariantImpl(VariantID vid) } stopTimer(); - ADD_DATA_TEARDOWN_HIP; - } else { getCout() << "\n ADD : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index 33c12e5cc..6e4302446 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -26,20 +26,6 @@ namespace stream // const size_t threads_per_team = 256; -#define ADD_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ - allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ - allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); - -#define ADD_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_c, c, iend, hid, did); \ - deallocOpenMPDeviceData(a, did); \ - deallocOpenMPDeviceData(b, did); \ - deallocOpenMPDeviceData(c, did); - void ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -50,8 +36,6 @@ void ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ if ( vid == Base_OpenMPTarget ) { - ADD_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -64,12 +48,8 @@ void ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } stopTimer(); - ADD_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - ADD_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -81,8 +61,6 @@ void ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } stopTimer(); - ADD_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n ADD : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp 
index 06a52186d..7ec37f5ea 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace stream { -#define COPY_DATA_SETUP_CUDA \ - allocAndInitCudaData(a, m_a, iend); \ - allocAndInitCudaData(c, m_c, iend); - -#define COPY_DATA_TEARDOWN_CUDA \ - getCudaData(m_c, c, iend); \ - deallocCudaData(a); \ - deallocCudaData(c); - template < size_t block_size > __launch_bounds__(block_size) __global__ void copy(Real_ptr c, Real_ptr a, @@ -53,8 +44,6 @@ void COPY::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - COPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +55,8 @@ void COPY::runCudaVariantImpl(VariantID vid) } stopTimer(); - COPY_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - COPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -85,12 +70,8 @@ void COPY::runCudaVariantImpl(VariantID vid) } stopTimer(); - COPY_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - COPY_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -102,8 +83,6 @@ void COPY::runCudaVariantImpl(VariantID vid) } stopTimer(); - COPY_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n COPY : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 3428e00ce..7a6e321cb 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace stream { -#define COPY_DATA_SETUP_HIP \ - allocAndInitHipData(a, m_a, iend); \ - allocAndInitHipData(c, m_c, iend); - -#define COPY_DATA_TEARDOWN_HIP \ - getHipData(m_c, c, iend); \ - deallocHipData(a); \ - deallocHipData(c); - template < size_t block_size > __launch_bounds__(block_size) __global__ void copy(Real_ptr c, Real_ptr a, @@ -53,8 +44,6 @@ void COPY::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - COPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +55,8 @@ void COPY::runHipVariantImpl(VariantID vid) } stopTimer(); - COPY_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - COPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,12 +72,8 @@ void COPY::runHipVariantImpl(VariantID vid) } stopTimer(); - COPY_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - COPY_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -104,8 +85,6 @@ void COPY::runHipVariantImpl(VariantID vid) } stopTimer(); - COPY_DATA_TEARDOWN_HIP; - } else { getCout() << "\n COPY : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index f4266bde7..a9250c4cd 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -26,19 +26,6 @@ namespace stream // const size_t threads_per_team = 256; -#define COPY_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ - allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); - -#define COPY_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_c, c, iend, hid, did); \ - deallocOpenMPDeviceData(a, did); \ - deallocOpenMPDeviceData(c, did); - - void COPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -49,8 +36,6 @@ void 
COPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune if ( vid == Base_OpenMPTarget ) { - COPY_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -63,12 +48,8 @@ void COPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } stopTimer(); - COPY_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - COPY_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,8 +61,6 @@ void COPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } stopTimer(); - COPY_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n COPY : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 4cd0ff810..d4dc1d55c 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -22,14 +22,6 @@ namespace rajaperf namespace stream { -#define DOT_DATA_SETUP_CUDA \ - allocAndInitCudaData(a, m_a, iend); \ - allocAndInitCudaData(b, m_b, iend); - -#define DOT_DATA_TEARDOWN_CUDA \ - deallocCudaData(a); \ - deallocCudaData(b); - template < size_t block_size > __launch_bounds__(block_size) __global__ void dot(Real_ptr a, Real_ptr b, @@ -77,8 +69,6 @@ void DOT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - DOT_DATA_SETUP_CUDA; - Real_ptr dprod; allocAndInitCudaDeviceData(dprod, &m_dot_init, 1); @@ -100,14 +90,10 @@ void DOT::runCudaVariantImpl(VariantID vid) } stopTimer(); - DOT_DATA_TEARDOWN_CUDA; - deallocCudaDeviceData(dprod); } else if ( vid == RAJA_CUDA ) { - DOT_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -123,8 +109,6 @@ void DOT::runCudaVariantImpl(VariantID vid) } stopTimer(); - DOT_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index f609822f0..27f724d1f 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -22,14 +22,6 @@ namespace rajaperf namespace stream { -#define DOT_DATA_SETUP_HIP \ - allocAndInitHipData(a, m_a, iend); \ - allocAndInitHipData(b, m_b, iend); - -#define DOT_DATA_TEARDOWN_HIP \ - deallocHipData(a); \ - deallocHipData(b); - template < size_t block_size > __launch_bounds__(block_size) __global__ void dot(Real_ptr a, Real_ptr b, @@ -78,8 +70,6 @@ void DOT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - DOT_DATA_SETUP_HIP; - Real_ptr dprod; allocAndInitHipDeviceData(dprod, &m_dot_init, 1); @@ -102,14 +92,10 @@ void DOT::runHipVariantImpl(VariantID vid) } stopTimer(); - DOT_DATA_TEARDOWN_HIP; - deallocHipDeviceData(dprod); } else if ( vid == RAJA_HIP ) { - DOT_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -125,8 +111,6 @@ void DOT::runHipVariantImpl(VariantID vid) } stopTimer(); - DOT_DATA_TEARDOWN_HIP; - } else { getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index ddbcf7b4d..7ab5d578e 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -26,17 +26,6 @@ namespace stream // const size_t threads_per_team = 256; -#define DOT_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ - allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); - -#define DOT_DATA_TEARDOWN_OMP_TARGET \ - 
deallocOpenMPDeviceData(a, did); \ - deallocOpenMPDeviceData(b, did); - void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -47,8 +36,6 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ if ( vid == Base_OpenMPTarget ) { - DOT_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +53,8 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } stopTimer(); - DOT_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - DOT_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,8 +70,6 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } stopTimer(); - DOT_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 0e8b0af61..40fb2f4a8 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace stream { -#define MUL_DATA_SETUP_CUDA \ - allocAndInitCudaData(b, m_b, iend); \ - allocAndInitCudaData(c, m_c, iend); - -#define MUL_DATA_TEARDOWN_CUDA \ - getCudaData(m_b, b, iend); \ - deallocCudaData(b); \ - deallocCudaData(c) - template < size_t block_size > __launch_bounds__(block_size) __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, @@ -53,8 +44,6 @@ void MUL::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - MUL_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +55,8 @@ void MUL::runCudaVariantImpl(VariantID vid) } stopTimer(); - MUL_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - MUL_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -85,12 +70,8 @@ void MUL::runCudaVariantImpl(VariantID vid) } stopTimer(); - MUL_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - MUL_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -102,8 +83,6 @@ void MUL::runCudaVariantImpl(VariantID vid) } stopTimer(); - MUL_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n MUL : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 37978e091..83e3196a8 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -21,15 +21,6 @@ namespace rajaperf namespace stream { -#define MUL_DATA_SETUP_HIP \ - allocAndInitHipData(b, m_b, iend); \ - allocAndInitHipData(c, m_c, iend); - -#define MUL_DATA_TEARDOWN_HIP \ - getHipData(m_b, b, iend); \ - deallocHipData(b); \ - deallocHipData(c) - template < size_t block_size > __launch_bounds__(block_size) __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, @@ -53,8 +44,6 @@ void MUL::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - MUL_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +55,8 @@ void MUL::runHipVariantImpl(VariantID vid) } stopTimer(); - MUL_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - MUL_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,12 +72,8 @@ void MUL::runHipVariantImpl(VariantID vid) } stopTimer(); - MUL_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - MUL_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { 
@@ -104,8 +85,6 @@ void MUL::runHipVariantImpl(VariantID vid) } stopTimer(); - MUL_DATA_TEARDOWN_HIP; - } else { getCout() << "\n MUL : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index ab02bbccc..c5f20d6b3 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -26,18 +26,6 @@ namespace stream // const size_t threads_per_team = 256; -#define MUL_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ - allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); - -#define MUL_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_b, b, iend, hid, did); \ - deallocOpenMPDeviceData(b, did); \ - deallocOpenMPDeviceData(c, did); - void MUL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -48,8 +36,6 @@ void MUL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ if ( vid == Base_OpenMPTarget ) { - MUL_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -62,12 +48,8 @@ void MUL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } stopTimer(); - MUL_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - MUL_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -79,8 +61,6 @@ void MUL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } stopTimer(); - MUL_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n MUL : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 45c06d785..386ecb627 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -21,17 +21,6 @@ namespace rajaperf namespace stream { -#define TRIAD_DATA_SETUP_CUDA \ - allocAndInitCudaData(a, m_a, iend); \ - allocAndInitCudaData(b, m_b, iend); \ - allocAndInitCudaData(c, m_c, iend); - -#define TRIAD_DATA_TEARDOWN_CUDA \ - getCudaData(m_a, a, iend); \ - deallocCudaData(a); \ - deallocCudaData(b); \ - deallocCudaData(c); - template < size_t block_size > __launch_bounds__(block_size) __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, @@ -55,8 +44,6 @@ void TRIAD::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - TRIAD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -68,12 +55,8 @@ void TRIAD::runCudaVariantImpl(VariantID vid) } stopTimer(); - TRIAD_DATA_TEARDOWN_CUDA; - } else if ( vid == Lambda_CUDA ) { - TRIAD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,12 +70,8 @@ void TRIAD::runCudaVariantImpl(VariantID vid) } stopTimer(); - TRIAD_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { - TRIAD_DATA_SETUP_CUDA; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -104,8 +83,6 @@ void TRIAD::runCudaVariantImpl(VariantID vid) } stopTimer(); - TRIAD_DATA_TEARDOWN_CUDA; - } else { getCout() << "\n TRIAD : Unknown Cuda variant id = " << vid << std::endl; } diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 83c349158..ff4007436 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -21,17 +21,6 @@ namespace rajaperf namespace stream { -#define TRIAD_DATA_SETUP_HIP \ - allocAndInitHipData(a, m_a, iend); \ - 
allocAndInitHipData(b, m_b, iend); \ - allocAndInitHipData(c, m_c, iend); - -#define TRIAD_DATA_TEARDOWN_HIP \ - getHipData(m_a, a, iend); \ - deallocHipData(a); \ - deallocHipData(b); \ - deallocHipData(c); - template < size_t block_size > __launch_bounds__(block_size) __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, @@ -55,8 +44,6 @@ void TRIAD::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - TRIAD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -68,12 +55,8 @@ void TRIAD::runHipVariantImpl(VariantID vid) } stopTimer(); - TRIAD_DATA_TEARDOWN_HIP; - } else if ( vid == Lambda_HIP ) { - TRIAD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -89,12 +72,8 @@ void TRIAD::runHipVariantImpl(VariantID vid) } stopTimer(); - TRIAD_DATA_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - TRIAD_DATA_SETUP_HIP; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -106,8 +85,6 @@ void TRIAD::runHipVariantImpl(VariantID vid) } stopTimer(); - TRIAD_DATA_TEARDOWN_HIP; - } else { getCout() << "\n TRIAD : Unknown Hip variant id = " << vid << std::endl; } diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index 5d63d1c24..dfea3158d 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -26,20 +26,6 @@ namespace stream // const size_t threads_per_team = 256; -#define TRIAD_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ - allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ - allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); - -#define TRIAD_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_a, a, iend, hid, did); \ - deallocOpenMPDeviceData(a, did); \ - deallocOpenMPDeviceData(b, did); \ - deallocOpenMPDeviceData(c, did); - void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -50,8 +36,6 @@ void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun if ( vid == Base_OpenMPTarget ) { - TRIAD_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -64,12 +48,8 @@ void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } stopTimer(); - TRIAD_DATA_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - TRIAD_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -81,8 +61,6 @@ void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } stopTimer(); - TRIAD_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n TRIAD : Unknown OMP Target variant id = " << vid << std::endl; } From 75753bfccb677746c021a28f0fbcf8362fc9a54e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Mar 2023 08:17:16 -0700 Subject: [PATCH 092/137] Remove VariantId from DataUtils routines Split omp first touch routine and apply during allocation to Omp memory only. 
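A minimal sketch of the resulting allocation path (illustrative only; it assumes the
DataSpace enum and the detail:: routines declared in DataUtils.hpp in this patch, and the
OpenMP guard mirrors the RAJA_ENABLE_OPENMP/RUN_OPENMP macros used below):

  // Callers drop the VariantID argument; first touch happens only for Omp memory.
  template < typename T >
  void allocData(DataSpace dataSpace, T& ptr, int len, int align)
  {
    switch (dataSpace) {
      case DataSpace::Host:
        detail::allocHostData(ptr, len, align);
        break;
#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
      case DataSpace::Omp:
        detail::allocHostData(ptr, len, align);
        detail::touchOmpData(ptr, len);  // omp parallel first touch
        break;
#endif
      default:
        break;  // device data spaces are unchanged by this sketch
    }
  }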
--- src/common/CudaDataUtils.hpp | 3 +- src/common/DataUtils.cpp | 212 +++++++-------------------- src/common/DataUtils.hpp | 188 +++++++++++++----------- src/common/HipDataUtils.hpp | 3 +- src/common/KernelBase.hpp | 89 ++++------- src/common/OpenMPTargetDataUtils.hpp | 3 +- 6 files changed, 184 insertions(+), 314 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0ea0f0428..6d49fa0fd 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -119,9 +119,8 @@ __device__ inline Index_type lambda_cuda_get_index() /* * Copy memory len bytes from src to dst. */ -inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid) +inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len) { - (void)vid; cudaErrchk( cudaMemcpy( dst_ptr, src_ptr, len, cudaMemcpyDefault ) ); } diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index cdc67a7e5..11eeea04a 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -44,84 +44,35 @@ void incDataInitCount() /* * Copy memory len bytes from src to dst. */ -void copyHostData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid) +void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) { - (void)vid; std::memcpy(dst_ptr, src_ptr, len); } -/* - * Allocate and initialize aligned data arrays. - */ -void allocAndInitData(Int_ptr& ptr, int len, int align, VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initData(ptr, len, vid); -} - -void allocAndInitData(Real_ptr& ptr, int len, int align, VariantID vid ) -{ - allocHostData(ptr, len, align, vid); - initData(ptr, len, vid); -} - -void allocAndInitDataConst(Real_ptr& ptr, int len, int align, Real_type val, - VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initDataConst(ptr, len, val, vid); -} - -void allocAndInitDataRandSign(Real_ptr& ptr, int len, int align, VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initDataRandSign(ptr, len, vid); -} - -void allocAndInitDataRandValue(Real_ptr& ptr, int len, int align, VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initDataRandValue(ptr, len, vid); -} - -void allocAndInitData(Complex_ptr& ptr, int len, int align, VariantID vid) -{ - allocHostData(ptr, len, align, vid); - initData(ptr, len, vid); -} - /* * Allocate data arrays of given type. */ -void allocHostData(Int_ptr& ptr, int len, int align, - VariantID vid) +void allocHostData(Int_ptr& ptr, int len, int align) { - (void)vid; ptr = RAJA::allocate_aligned_type( align, len*sizeof(Int_type)); } /// -void allocHostData(Index_type*& ptr, int len, int align, - VariantID vid) +void allocHostData(Index_type*& ptr, int len, int align) { - (void)vid; ptr = RAJA::allocate_aligned_type( align, len*sizeof(Index_type)); } -void allocHostData(Real_ptr& ptr, int len, int align, - VariantID vid) +void allocHostData(Real_ptr& ptr, int len, int align) { - (void)vid; ptr = RAJA::allocate_aligned_type( align, len*sizeof(Real_type)); } -void allocHostData(Complex_ptr& ptr, int len, int align, - VariantID vid) +void allocHostData(Complex_ptr& ptr, int len, int align) { - (void)vid; ptr = RAJA::allocate_aligned_type( align, len*sizeof(Complex_type)); } @@ -130,40 +81,32 @@ void allocHostData(Complex_ptr& ptr, int len, int align, /* * Free data arrays of given type. 
*/ -void deallocHostData(Int_ptr& ptr, - VariantID vid) +void deallocHostData(Int_ptr& ptr) { - (void)vid; if (ptr) { RAJA::free_aligned(ptr); ptr = nullptr; } } -void deallocHostData(Index_type*& ptr, - VariantID vid) +void deallocHostData(Index_type*& ptr) { - (void)vid; if (ptr) { RAJA::free_aligned(ptr); ptr = nullptr; } } -void deallocHostData(Real_ptr& ptr, - VariantID vid) +void deallocHostData(Real_ptr& ptr) { - (void)vid; if (ptr) { RAJA::free_aligned(ptr); ptr = nullptr; } } -void deallocHostData(Complex_ptr& ptr, - VariantID vid) +void deallocHostData(Complex_ptr& ptr) { - (void)vid; if (ptr) { RAJA::free_aligned(ptr); ptr = nullptr; @@ -172,25 +115,53 @@ void deallocHostData(Complex_ptr& ptr, /* - * \brief Initialize Int_type data array to - * randomly signed positive and negative values. + * \brief Touch Int_type data array with omp threads. */ -void initData(Int_ptr& ptr, int len, VariantID vid) +void touchOmpData(Int_ptr& ptr, int len) { - (void) vid; +// First touch... +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + #pragma omp parallel for + for (int i = 0; i < len; ++i) { + ptr[i] = -987654321; + }; +#endif +} +/* + * \brief Touch Real_type data array with omp threads. + */ +void touchOmpData(Real_ptr& ptr, int len) +{ // First touch... #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || - vid == RAJA_OpenMP ) { - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = 0; - }; - } + #pragma omp parallel for + for (int i = 0; i < len; ++i) { + ptr[i] = -(i + 1.11111111)/(i + 1.23456789); + }; +#endif +} + +/* + * \brief Touch Complex_type data array with omp threads. + */ +void touchOmpData(Complex_ptr& ptr, int len) +{ +// First touch... +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + #pragma omp parallel for + for (int i = 0; i < len; ++i) { + ptr[i] = -(i + 1.11111111)/(i + 1.23456789); + }; #endif +} +/* + * \brief Initialize Int_type data array to + * randomly signed positive and negative values. + */ +void initData(Int_ptr& ptr, int len) +{ srand(4793); Real_type signfact = 0.0; @@ -216,24 +187,10 @@ void initData(Int_ptr& ptr, int len, VariantID vid) * positive values (0.0, 1.0) based on their array position * (index) and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len, VariantID vid) +void initData(Real_ptr& ptr, int len) { - (void) vid; - Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); -// first touch... -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || - vid == RAJA_OpenMP ) { - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = factor*(i + 1.1)/(i + 1.12345); - }; - } -#endif - for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -244,24 +201,8 @@ void initData(Real_ptr& ptr, int len, VariantID vid) /* * Initialize Real_type data array to constant values. */ -void initDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid) +void initDataConst(Real_ptr& ptr, int len, Real_type val) { - -// first touch... 
-#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || - vid == RAJA_OpenMP ) { - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = 0; - }; - } -#else - (void) vid; -#endif - for (int i = 0; i < len; ++i) { ptr[i] = val; }; @@ -272,22 +213,8 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val, /* * Initialize Real_type data array with random sign. */ -void initDataRandSign(Real_ptr& ptr, int len, VariantID vid) +void initDataRandSign(Real_ptr& ptr, int len) { - (void) vid; - -// First touch... -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || - vid == RAJA_OpenMP ) { - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = 0.0; - }; - } -#endif - Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); srand(4793); @@ -304,22 +231,8 @@ void initDataRandSign(Real_ptr& ptr, int len, VariantID vid) /* * Initialize Real_type data array with random values. */ -void initDataRandValue(Real_ptr& ptr, int len, VariantID vid) +void initDataRandValue(Real_ptr& ptr, int len) { - (void) vid; - -// First touch... -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || - vid == RAJA_OpenMP ) { - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = 0.0; - }; - } -#endif - srand(4793); for (int i = 0; i < len; ++i) { @@ -332,24 +245,11 @@ void initDataRandValue(Real_ptr& ptr, int len, VariantID vid) /* * Initialize Complex_type data array. */ -void initData(Complex_ptr& ptr, int len, VariantID vid) +void initData(Complex_ptr& ptr, int len) { - (void) vid; - Complex_type factor = ( data_init_count % 2 ? Complex_type(0.1,0.2) : Complex_type(0.2,0.3) ); -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || - vid == RAJA_OpenMP ) { - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = factor*(i + 1.1)/(i + 1.12345); - }; - } -#endif - for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -360,10 +260,8 @@ void initData(Complex_ptr& ptr, int len, VariantID vid) /* * Initialize scalar data. */ -void initData(Real_type& d, VariantID vid) +void initData(Real_type& d) { - (void) vid; - Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); d = factor*1.1/1.12345; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 42bfe448d..8bc83b6f7 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -47,31 +47,46 @@ void resetDataInitCount(); */ void incDataInitCount(); -void copyHostData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid); +void copyHostData(void* dst_ptr, const void* src_ptr, size_t len); /*! * \brief Allocate data arrays. */ -void allocHostData(Int_ptr& ptr, int len, int align, VariantID vid); +void allocHostData(Int_ptr& ptr, int len, int align); /// -void allocHostData(Index_type*& ptr, int len, int align, VariantID vid); +void allocHostData(Index_type*& ptr, int len, int align); /// -void allocHostData(Real_ptr& ptr, int len, int align, VariantID vid); +void allocHostData(Real_ptr& ptr, int len, int align); /// -void allocHostData(Complex_ptr& ptr, int len, int align, VariantID vid); +void allocHostData(Complex_ptr& ptr, int len, int align); /*! * \brief Free data arrays. 
*/ -void deallocHostData(Int_ptr& ptr, VariantID vid); +void deallocHostData(Int_ptr& ptr); /// -void deallocHostData(Index_type*& ptr, VariantID vid); +void deallocHostData(Index_type*& ptr); /// -void deallocHostData(Real_ptr& ptr, VariantID vid); +void deallocHostData(Real_ptr& ptr); /// -void deallocHostData(Complex_ptr& ptr, VariantID vid); +void deallocHostData(Complex_ptr& ptr); +/*! + * \brief Touch Int_type data array with omp threads. + */ +void touchOmpData(Int_ptr& ptr, int len); + +/*! + * \brief Touch Real_type data array with omp threads. + */ +void touchOmpData(Real_ptr& ptr, int len); + +/*! + * \brief Touch Complex_type data array with omp threads. + */ +void touchOmpData(Complex_ptr& ptr, int len); + /*! * \brief Initialize Int_type data array. * @@ -79,8 +94,7 @@ void deallocHostData(Complex_ptr& ptr, VariantID vid); * Then, two randomly-chosen entries are reset, one to * a value > 1, one to a value < -1. */ -void initData(Int_ptr& ptr, int len, - VariantID vid); +void initData(Int_ptr& ptr, int len); /*! * \brief Initialize Real_type data array. @@ -89,16 +103,14 @@ void initData(Int_ptr& ptr, int len, * in the interval (0.0, 1.0) based on their array position (index) * and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len, - VariantID vid); +void initData(Real_ptr& ptr, int len); /*! * \brief Initialize Real_type data array. * * Array entries are set to given constant value. */ -void initDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid); +void initDataConst(Real_ptr& ptr, int len, Real_type val); /*! * \brief Initialize Real_type data array with random sign. @@ -106,16 +118,14 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val, * Array entries are initialized in the same way as the method * initData(Real_ptr& ptr...) above, but with random sign. */ -void initDataRandSign(Real_ptr& ptr, int len, - VariantID vid); +void initDataRandSign(Real_ptr& ptr, int len); /*! * \brief Initialize Real_type data array with random values. * * Array entries are initialized with random values in the interval [0.0, 1.0]. */ -void initDataRandValue(Real_ptr& ptr, int len, - VariantID vid); +void initDataRandValue(Real_ptr& ptr, int len); /*! * \brief Initialize Complex_type data array. @@ -123,8 +133,7 @@ void initDataRandValue(Real_ptr& ptr, int len, * Real and imaginary array entries are initialized in the same way as the * method allocAndInitData(Real_ptr& ptr...) above. */ -void initData(Complex_ptr& ptr, int len, - VariantID vid); +void initData(Complex_ptr& ptr, int len); /*! * \brief Initialize Real_type scalar data. @@ -132,8 +141,7 @@ void initData(Complex_ptr& ptr, int len, * Data is set similarly to an array enttry in the method * initData(Real_ptr& ptr...) above. */ -void initData(Real_type& d, - VariantID vid); +void initData(Real_type& d); /*! @@ -153,6 +161,9 @@ long double calcChecksum(Real_ptr d, int len, long double calcChecksum(Complex_ptr d, int len, Real_type scale_factor); +} // closing brace for detail namespace + + /*! * \brief Get an host accessible data space for this dataSpace. * @@ -276,81 +287,88 @@ inline bool isHipDataSpace(DataSpace dataSpace) * \brief Allocate data array (ptr). 
*/ template -inline void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID vid) +inline void allocData(DataSpace dataSpace, T& ptr, int len, int align) { switch (dataSpace) { case DataSpace::Host: + { + detail::allocHostData(ptr, len, align); + } break; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) case DataSpace::Omp: { - allocHostData(ptr, len, align, vid); + detail::allocHostData(ptr, len, align); + detail::touchOmpData(ptr, len); } break; +#endif #if defined(RAJA_ENABLE_TARGET_OPENMP) case DataSpace::OmpTarget: { - allocOpenMPDeviceData(ptr, len); + detail::allocOpenMPDeviceData(ptr, len); } break; #endif #if defined(RAJA_ENABLE_CUDA) case DataSpace::CudaPinned: { - allocCudaPinnedData(ptr, len); + detail::allocCudaPinnedData(ptr, len); } break; case DataSpace::CudaManaged: { - allocCudaManagedData(ptr, len); + detail::allocCudaManagedData(ptr, len); } break; case DataSpace::CudaDevice: { - allocCudaDeviceData(ptr, len); + detail::allocCudaDeviceData(ptr, len); } break; #endif #if defined(RAJA_ENABLE_HIP) case DataSpace::HipHostAdviseFine: { - allocHostData(ptr, len, align, vid); - adviseHipFineData(ptr, len); + detail::allocHostData(ptr, len, align); + detail::adviseHipFineData(ptr, len); } break; case DataSpace::HipHostAdviseCoarse: { - allocHostData(ptr, len, align, vid); - adviseHipCoarseData(ptr, len); + detail::allocHostData(ptr, len, align); + detail::adviseHipCoarseData(ptr, len); } break; case DataSpace::HipPinned: { - allocHipPinnedData(ptr, len); + detail::allocHipPinnedData(ptr, len); } break; case DataSpace::HipPinnedFine: { - allocHipPinnedFineData(ptr, len); + detail::allocHipPinnedFineData(ptr, len); } break; case DataSpace::HipPinnedCoarse: { - allocHipPinnedCoarseData(ptr, len); + detail::allocHipPinnedCoarseData(ptr, len); } break; case DataSpace::HipManaged: { - allocHipManagedData(ptr, len); + detail::allocHipManagedData(ptr, len); } break; case DataSpace::HipManagedAdviseFine: { - allocHipManagedData(ptr, len); - adviseHipFineData(ptr, len); + detail::allocHipManagedData(ptr, len); + detail::adviseHipFineData(ptr, len); } break; case DataSpace::HipManagedAdviseCoarse: { - allocHipManagedData(ptr, len); - adviseHipCoarseData(ptr, len); + detail::allocHipManagedData(ptr, len); + detail::adviseHipCoarseData(ptr, len); } break; case DataSpace::HipDevice: { - allocHipDeviceData(ptr, len); + detail::allocHipDeviceData(ptr, len); } break; case DataSpace::HipDeviceFine: { - allocHipDeviceFineData(ptr, len); + detail::allocHipDeviceFineData(ptr, len); } break; #endif @@ -365,7 +383,7 @@ inline void allocData(DataSpace dataSpace, T& ptr, int len, int align, VariantID * \brief Deallocate data array (ptr). 
*/ template -inline void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) +inline void deallocData(DataSpace dataSpace, T& ptr) { switch (dataSpace) { case DataSpace::Host: @@ -373,28 +391,28 @@ inline void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) case DataSpace::HipHostAdviseFine: case DataSpace::HipHostAdviseCoarse: { - deallocHostData(ptr, vid); + detail::deallocHostData(ptr); } break; #if defined(RAJA_ENABLE_TARGET_OPENMP) case DataSpace::OmpTarget: { - deallocOpenMPDeviceData(ptr); + detail::deallocOpenMPDeviceData(ptr); } break; #endif #if defined(RAJA_ENABLE_CUDA) case DataSpace::CudaPinned: { - deallocCudaPinnedData(ptr); + detail::deallocCudaPinnedData(ptr); } break; case DataSpace::CudaManaged: { - deallocCudaManagedData(ptr); + detail::deallocCudaManagedData(ptr); } break; case DataSpace::CudaDevice: { - deallocCudaDeviceData(ptr); + detail::deallocCudaDeviceData(ptr); } break; #endif @@ -403,18 +421,18 @@ inline void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) case DataSpace::HipPinnedFine: case DataSpace::HipPinnedCoarse: { - deallocHipPinnedData(ptr); + detail::deallocHipPinnedData(ptr); } break; case DataSpace::HipManaged: case DataSpace::HipManagedAdviseFine: case DataSpace::HipManagedAdviseCoarse: { - deallocHipManagedData(ptr); + detail::deallocHipManagedData(ptr); } break; case DataSpace::HipDevice: case DataSpace::HipDeviceFine: { - deallocHipDeviceData(ptr); + detail::deallocHipDeviceData(ptr); } break; #endif @@ -431,11 +449,11 @@ inline void deallocData(DataSpace dataSpace, T& ptr, VariantID vid) template inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, DataSpace src_dataSpace, const T* src_ptr, - int len, VariantID vid) + int len) { if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && hostAccessibleDataSpace(src_dataSpace) == src_dataSpace) { - copyHostData(dst_ptr, src_ptr, sizeof(T)*len, vid); + detail::copyHostData(dst_ptr, src_ptr, sizeof(T)*len); } #if defined(RAJA_ENABLE_TARGET_OPENMP) @@ -445,7 +463,7 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, : omp_get_initial_device(); auto src_did = isOpenMPTargetDataSpace(src_dataSpace) ? 
omp_get_default_device() : omp_get_initial_device(); - copyOpenMPTargetData(dst_ptr, src_ptr, sizeof(T)*len, vid, + detail::copyOpenMPTargetData(dst_ptr, src_ptr, sizeof(T)*len, dst_did, src_did); } #endif @@ -453,14 +471,14 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, #if defined(RAJA_ENABLE_CUDA) else if (isCudaDataSpace(dst_dataSpace) || isCudaDataSpace(src_dataSpace)) { - copyCudaData(dst_ptr, src_ptr, sizeof(T)*len, vid); + detail::copyCudaData(dst_ptr, src_ptr, sizeof(T)*len); } #endif #if defined(RAJA_ENABLE_HIP) else if (isHipDataSpace(dst_dataSpace) || isHipDataSpace(src_dataSpace)) { - copyHipData(dst_ptr, src_ptr, sizeof(T)*len, vid); + detail::copyHipData(dst_ptr, src_ptr, sizeof(T)*len); } #endif @@ -474,17 +492,17 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, */ template inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, int len, int align, VariantID vid) + T*& ptr, int len, int align) { if (new_dataSpace != old_dataSpace) { T* new_ptr = nullptr; - allocData(new_dataSpace, new_ptr, len, align, vid); + allocData(new_dataSpace, new_ptr, len, align); - copyData(new_dataSpace, new_ptr, old_dataSpace, ptr, len, vid); + copyData(new_dataSpace, new_ptr, old_dataSpace, ptr, len); - deallocData(old_dataSpace, ptr, vid); + deallocData(old_dataSpace, ptr); ptr = new_ptr; } @@ -494,17 +512,16 @@ inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, * \brief Allocate and initialize data array. */ template -inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align, - VariantID vid) +inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); - allocData(init_dataSpace, ptr, len, align, vid); + allocData(init_dataSpace, ptr, len, align); - initData(ptr, len, vid); + detail::initData(ptr, len); if (init_dataSpace != dataSpace) { - moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + moveData(dataSpace, init_dataSpace, ptr, len, align); } } @@ -517,16 +534,16 @@ inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align, */ template inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int align, - Real_type val, VariantID vid) + Real_type val) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); - allocData(init_dataSpace, ptr, len, align, vid); + allocData(init_dataSpace, ptr, len, align); - initDataConst(ptr, len, val, vid); + detail::initDataConst(ptr, len, val); if (init_dataSpace != dataSpace) { - moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + moveData(dataSpace, init_dataSpace, ptr, len, align); } } @@ -536,17 +553,16 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int ali * Array is initialized using method initDataRandSign(Real_ptr& ptr...) below. 
*/ template -inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int align, - VariantID vid) +inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); - allocData(init_dataSpace, ptr, len, align, vid); + allocData(init_dataSpace, ptr, len, align); - initDataRandSign(ptr, len, vid); + detail::initDataRandSign(ptr, len); if (init_dataSpace != dataSpace) { - moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + moveData(dataSpace, init_dataSpace, ptr, len, align); } } @@ -557,17 +573,16 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int * Array is initialized using method initDataRandValue(Real_ptr& ptr...) below. */ template -inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int align, - VariantID vid) +inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); - allocData(init_dataSpace, ptr, len, align, vid); + allocData(init_dataSpace, ptr, len, align); - initDataRandValue(ptr, len, vid); + detail::initDataRandValue(ptr, len); if (init_dataSpace != dataSpace) { - moveData(dataSpace, init_dataSpace, ptr, len, align, vid); + moveData(dataSpace, init_dataSpace, ptr, len, align); } } @@ -576,33 +591,30 @@ inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int */ template inline long double calcChecksum(DataSpace dataSpace, T* ptr, int len, int align, - Real_type scale_factor, VariantID vid) + Real_type scale_factor) { T* check_ptr = ptr; T* copied_ptr = nullptr; DataSpace check_dataSpace = hostAccessibleDataSpace(dataSpace); if (check_dataSpace != dataSpace) { - allocData(check_dataSpace, copied_ptr, len, align, vid); + allocData(check_dataSpace, copied_ptr, len, align); - copyData(check_dataSpace, copied_ptr, dataSpace, ptr, len, vid); + copyData(check_dataSpace, copied_ptr, dataSpace, ptr, len); check_ptr = copied_ptr; } - auto val = calcChecksum(check_ptr, len, scale_factor); + auto val = detail::calcChecksum(check_ptr, len, scale_factor); if (check_dataSpace != dataSpace) { - deallocData(check_dataSpace, copied_ptr, vid); + deallocData(check_dataSpace, copied_ptr); } return val; } -} // closing brace for detail namespace - - /*! * \brief Holds a RajaPool object and provides access to it via a * std allocator compliant type. diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 38abfb2fb..6debbe54f 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -115,9 +115,8 @@ inline int getHipDevice() /* * Copy memory len bytes from src to dst. 
*/ -inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid) +inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len) { - (void)vid; hipErrchk( hipMemcpy( dst_ptr, src_ptr, len, hipMemcpyDefault ) ); } diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 6fc731540..cc5f18470 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -252,102 +252,65 @@ class KernelBase } template - void allocData(T*& ptr, int len, - VariantID vid) + void allocData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocData(getDataSpace(vid), - ptr, len, getDataAlignment(), vid); + rajaperf::allocData(getDataSpace(vid), + ptr, len, getDataAlignment()); } template - void allocAndInitData(T*& ptr, int len, - VariantID vid) + void allocAndInitData(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitData(getDataSpace(vid), - ptr, len, getDataAlignment(), vid); + rajaperf::allocAndInitData(getDataSpace(vid), + ptr, len, getDataAlignment()); } template - void allocAndInitDataConst(T*& ptr, int len, T val, - VariantID vid) + void allocAndInitDataConst(T*& ptr, int len, T val, VariantID vid) { - rajaperf::detail::allocAndInitDataConst(getDataSpace(vid), - ptr, len, getDataAlignment(), val, vid); + rajaperf::allocAndInitDataConst(getDataSpace(vid), + ptr, len, getDataAlignment(), val); } template - void allocAndInitDataRandSign(T*& ptr, int len, - VariantID vid) + void allocAndInitDataRandSign(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitDataRandSign(getDataSpace(vid), - ptr, len, getDataAlignment(), vid); + rajaperf::allocAndInitDataRandSign(getDataSpace(vid), + ptr, len, getDataAlignment()); } template - void allocAndInitDataRandValue(T*& ptr, int len, - VariantID vid) + void allocAndInitDataRandValue(T*& ptr, int len, VariantID vid) { - rajaperf::detail::allocAndInitDataRandValue(getDataSpace(vid), - ptr, len, getDataAlignment(), vid); + rajaperf::allocAndInitDataRandValue(getDataSpace(vid), + ptr, len, getDataAlignment()); } template - void deallocData(T*& ptr, - VariantID vid) + void deallocData(T*& ptr, VariantID vid) { - rajaperf::detail::deallocData(getDataSpace(vid), ptr, vid); + rajaperf::deallocData(getDataSpace(vid), ptr); } template - void initData(T*& ptr, int len, - VariantID vid) + void initData(T& d, VariantID vid) { - rajaperf::detail::initData(getDataSpace(vid), ptr, len, vid); + (void)vid; + rajaperf::detail::initData(d); } template - void initDataConst(T*& ptr, int len, T val, - VariantID vid) + long double calcChecksum(T* ptr, int len, VariantID vid) { - rajaperf::detail::initDataConst(getDataSpace(vid), ptr, len, val, vid); + return rajaperf::calcChecksum(getDataSpace(vid), + ptr, len, getDataAlignment(), 1.0); } template - void initDataRandSign(T*& ptr, int len, - VariantID vid) + long double calcChecksum(T* ptr, int len, Real_type scale_factor, VariantID vid) { - rajaperf::detail::initDataRandSign(getDataSpace(vid), ptr, len, vid); - } - - template - void initDataRandValue(T*& ptr, int len, - VariantID vid) - { - rajaperf::detail::initDataRandValue(getDataSpace(vid), ptr, len, vid); - } - - template - void initData(T& d, - VariantID vid) - { - rajaperf::detail::initData(d, vid); - } - - template - long double calcChecksum(T* ptr, int len, - VariantID vid) - { - return rajaperf::detail::calcChecksum(getDataSpace(vid), - ptr, len, getDataAlignment(), 1.0, vid); - } - - template - long double calcChecksum(T* ptr, int len, - Real_type scale_factor, - VariantID vid) - { - 
return rajaperf::detail::calcChecksum(getDataSpace(vid), - ptr, len, getDataAlignment(), scale_factor, vid); + return rajaperf::calcChecksum(getDataSpace(vid), + ptr, len, getDataAlignment(), scale_factor); } void startTimer() diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 8c4b4b0a1..dfb8f4c00 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -30,10 +30,9 @@ namespace detail /* * Copy memory len bytes from src to dst. */ -inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, VariantID vid, +inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, int dst_did, int src_did) { - (void)vid; omp_target_memcpy( dst_ptr, src_ptr, len, 0, 0, dst_did, src_did ); } From ab4706fba2d435acab625496f73dfac2d80c6742 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Mar 2023 10:16:01 -0700 Subject: [PATCH 093/137] Use DataSpace memory methods for temporaries --- src/algorithm/REDUCE_SUM-Cuda.cpp | 12 +- src/algorithm/REDUCE_SUM-Hip.cpp | 12 +- src/algorithm/SCAN-Cuda.cpp | 4 +- src/algorithm/SCAN-Hip.cpp | 4 +- src/apps/FIR-Cuda.cpp | 5 +- src/apps/FIR-Hip.cpp | 5 +- src/apps/FIR-OMPTarget.cpp | 5 +- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 32 ++--- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 32 ++--- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 12 +- src/basic/INDEXLIST-Cuda.cpp | 16 +-- src/basic/INDEXLIST-Hip.cpp | 16 +-- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 12 +- src/basic/INDEXLIST_3LOOP-Hip.cpp | 12 +- src/basic/INDEXLIST_3LOOP-OMPTarget.cpp | 4 +- src/basic/PI_REDUCE-Cuda.cpp | 4 +- src/basic/PI_REDUCE-Hip.cpp | 4 +- src/basic/REDUCE3_INT-Cuda.cpp | 8 +- src/basic/REDUCE3_INT-Hip.cpp | 8 +- src/basic/REDUCE_STRUCT-Cuda.cpp | 4 +- src/basic/REDUCE_STRUCT-Hip.cpp | 4 +- src/basic/TRAP_INT-Cuda.cpp | 4 +- src/basic/TRAP_INT-Hip.cpp | 4 +- src/common/CudaDataUtils.hpp | 110 ++++++---------- src/common/DataUtils.cpp | 93 +------------- src/common/DataUtils.hpp | 103 ++++++++------- src/common/HipDataUtils.hpp | 145 ++++++++-------------- src/common/KernelBase.hpp | 12 ++ src/common/OpenMPTargetDataUtils.hpp | 68 +++------- src/stream/DOT-Cuda.cpp | 4 +- src/stream/DOT-Hip.cpp | 4 +- 31 files changed, 294 insertions(+), 468 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 7192129ef..c5be07c41 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -73,7 +73,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) int len = iend - ibegin; Real_type* sum_storage; - allocCudaPinnedData(sum_storage, 1); + allocData(DataSpace::CudaPinned, sum_storage, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -89,7 +89,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) // Allocate temporary storage unsigned char* temp_storage; - allocCudaDeviceData(temp_storage, temp_storage_bytes); + allocData(DataSpace::CudaDevice, temp_storage, temp_storage_bytes); d_temp_storage = temp_storage; @@ -113,8 +113,8 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) stopTimer(); // Free temporary storage - deallocCudaDeviceData(temp_storage); - deallocCudaPinnedData(sum_storage); + deallocData(DataSpace::CudaDevice, temp_storage); + deallocData(DataSpace::CudaPinned, sum_storage); } else { @@ -136,7 +136,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { Real_ptr dsum; - allocCudaDeviceData(dsum, 1); + allocData(DataSpace::CudaDevice, 
dsum, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -159,7 +159,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } stopTimer(); - deallocCudaDeviceData(dsum); + deallocData(DataSpace::CudaDevice, dsum); } else if ( vid == RAJA_CUDA ) { diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 3549f8dd3..3327d1991 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -78,7 +78,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) int len = iend - ibegin; Real_type* sum_storage; - allocHipPinnedData(sum_storage, 1); + allocData(DataSpace::HipPinned, sum_storage, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -105,7 +105,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) // Allocate temporary storage unsigned char* temp_storage; - allocHipDeviceData(temp_storage, temp_storage_bytes); + allocData(DataSpace::HipDevice, temp_storage, temp_storage_bytes); d_temp_storage = temp_storage; @@ -140,8 +140,8 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) stopTimer(); // Free temporary storage - deallocHipDeviceData(temp_storage); - deallocHipPinnedData(sum_storage); + deallocData(DataSpace::HipDevice, temp_storage); + deallocData(DataSpace::HipPinned, sum_storage); } else { @@ -163,7 +163,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { Real_ptr dsum; - allocHipDeviceData(dsum, 1); + allocData(DataSpace::HipDevice, dsum, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -185,7 +185,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } stopTimer(); - deallocHipDeviceData(dsum); + deallocData(DataSpace::HipDevice, dsum); } else if ( vid == RAJA_HIP ) { diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 61b6ee551..2320e9431 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -56,7 +56,7 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) // Allocate temporary storage unsigned char* temp_storage; - allocCudaDeviceData(temp_storage, temp_storage_bytes); + allocData(DataSpace::CudaDevice, temp_storage, temp_storage_bytes); d_temp_storage = temp_storage; startTimer(); @@ -76,7 +76,7 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) stopTimer(); // Free temporary storage - deallocCudaDeviceData(temp_storage); + deallocData(DataSpace::CudaDevice, temp_storage); } else if ( vid == RAJA_CUDA ) { diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index c1d994210..fdcdb455d 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -72,7 +72,7 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) // Allocate temporary storage unsigned char* temp_storage; - allocHipDeviceData(temp_storage, temp_storage_bytes); + allocData(DataSpace::HipDevice, temp_storage, temp_storage_bytes); d_temp_storage = temp_storage; startTimer(); @@ -103,7 +103,7 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) stopTimer(); // Free temporary storage - deallocHipDeviceData(temp_storage); + deallocData(DataSpace::HipDevice, temp_storage); } else if ( vid == RAJA_HIP ) { diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 269634f7b..32e6227e5 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -53,11 +53,12 @@ __global__ void fir(Real_ptr out, Real_ptr in, Real_ptr coeff; \ \ Real_ptr 
tcoeff = &coeff_array[0]; \ - allocAndInitCudaData(coeff, tcoeff, FIR_COEFFLEN); + allocData(DataSpace::CudaDevice, coeff, FIR_COEFFLEN); \ + copyData(DataSpace::CudaDevice, coeff, DataSpace::Host, tcoeff, FIR_COEFFLEN); #define FIR_DATA_TEARDOWN_CUDA \ - deallocCudaData(coeff); + deallocData(DataSpace::CudaDevice, coeff); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index ca0ab50b5..5147e3f16 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -53,11 +53,12 @@ __global__ void fir(Real_ptr out, Real_ptr in, Real_ptr coeff; \ \ Real_ptr tcoeff = &coeff_array[0]; \ - allocAndInitHipData(coeff, tcoeff, FIR_COEFFLEN); + allocAndInitData(DataSpace::HipDevice, coeff, FIR_COEFFLEN); \ + copyData(DataSpace::HipDevice, coeff, DataSpace::Host, tcoeff, FIR_COEFFLEN); #define FIR_DATA_TEARDOWN_HIP \ - deallocHipData(coeff); + deallocData(DataSpace::HipDevice, coeff); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 6b417bd6a..d5eeaddee 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -31,11 +31,12 @@ namespace apps Real_ptr coeff; \ \ Real_ptr tcoeff = &coeff_array[0]; \ - allocAndInitOpenMPDeviceData(coeff, tcoeff, FIR_COEFFLEN); + allocAndInitData(DataSpace::OmpTarget, coeff, FIR_COEFFLEN); \ + copyData(DataSpace::OmpTarget, coeff, DataSpace::Host, tcoeff, FIR_COEFFLEN); #define FIR_DATA_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(coeff, did); + deallocData(DataSpace::OmpTarget, coeff, did); void FIR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index aeee13c18..464cc114d 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -26,28 +26,28 @@ namespace apps Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocCudaPinnedData(pack_buffer_ptrs, num_neighbors * num_vars); \ - allocCudaPinnedData(pack_list_ptrs, num_neighbors * num_vars); \ - allocCudaPinnedData(pack_var_ptrs, num_neighbors * num_vars); \ - allocCudaPinnedData(pack_len_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocCudaPinnedData(unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocCudaPinnedData(unpack_list_ptrs, num_neighbors * num_vars); \ - allocCudaPinnedData(unpack_var_ptrs, num_neighbors * num_vars); \ - allocCudaPinnedData(unpack_len_ptrs, num_neighbors * num_vars); + allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); #define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ - deallocCudaPinnedData(pack_buffer_ptrs); \ - deallocCudaPinnedData(pack_list_ptrs); \ - deallocCudaPinnedData(pack_var_ptrs); \ - 
deallocCudaPinnedData(pack_len_ptrs); \ - deallocCudaPinnedData(unpack_buffer_ptrs); \ - deallocCudaPinnedData(unpack_list_ptrs); \ - deallocCudaPinnedData(unpack_var_ptrs); \ - deallocCudaPinnedData(unpack_len_ptrs); + deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ + deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ + deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ + deallocData(DataSpace::CudaPinned, pack_len_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index cbc7b868d..99ece760e 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -26,28 +26,28 @@ namespace apps Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocHipPinnedData(pack_buffer_ptrs, num_neighbors * num_vars); \ - allocHipPinnedData(pack_list_ptrs, num_neighbors * num_vars); \ - allocHipPinnedData(pack_var_ptrs, num_neighbors * num_vars); \ - allocHipPinnedData(pack_len_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocHipPinnedData(unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocHipPinnedData(unpack_list_ptrs, num_neighbors * num_vars); \ - allocHipPinnedData(unpack_var_ptrs, num_neighbors * num_vars); \ - allocHipPinnedData(unpack_len_ptrs, num_neighbors * num_vars); + allocData(DataSpace::HipPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, unpack_len_ptrs, num_neighbors * num_vars); #define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocHipPinnedData(pack_buffer_ptrs); \ - deallocHipPinnedData(pack_list_ptrs); \ - deallocHipPinnedData(pack_var_ptrs); \ - deallocHipPinnedData(pack_len_ptrs); \ - deallocHipPinnedData(unpack_buffer_ptrs); \ - deallocHipPinnedData(unpack_list_ptrs); \ - deallocHipPinnedData(unpack_var_ptrs); \ - deallocHipPinnedData(unpack_len_ptrs); + deallocData(DataSpace::HipPinned, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinned, pack_list_ptrs); \ + deallocData(DataSpace::HipPinned, pack_var_ptrs); \ + deallocData(DataSpace::HipPinned, pack_len_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 430f2f7f5..914f4d452 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -28,7 +28,7 @@ namespace apps #define 
HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ - allocOpenMPDeviceData(pack_ptrs, 4 * num_neighbors * num_vars, did); \ + allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ Int_ptr* pack_list_ptrs = reinterpret_cast(pack_ptrs) + 1 * num_neighbors * num_vars; \ Real_ptr* pack_var_ptrs = reinterpret_cast(pack_ptrs) + 2 * num_neighbors * num_vars; \ @@ -39,7 +39,7 @@ namespace apps Real_ptr* h_pack_var_ptrs = reinterpret_cast(h_pack_ptrs) + 2 * num_neighbors * num_vars; \ Index_type* h_pack_len_ptrs = reinterpret_cast(h_pack_ptrs) + 3 * num_neighbors * num_vars; \ void** unpack_ptrs; \ - allocOpenMPDeviceData(unpack_ptrs, 4 * num_neighbors * num_vars, did); \ + allocData(DataSpace::OmpTarget, unpack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs = reinterpret_cast(unpack_ptrs) + 0 * num_neighbors * num_vars; \ Int_ptr* unpack_list_ptrs = reinterpret_cast(unpack_ptrs) + 1 * num_neighbors * num_vars; \ Real_ptr* unpack_var_ptrs = reinterpret_cast(unpack_ptrs) + 2 * num_neighbors * num_vars; \ @@ -51,15 +51,15 @@ namespace apps Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; #define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ - copyOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars, did, hid); + initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); #define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ - copyOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars, did, hid); + initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); #define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(pack_ptrs, did); \ + deallocData(DataSpace::OmpTarget, pack_ptrs); \ delete[] h_pack_ptrs; \ - deallocOpenMPDeviceData(unpack_ptrs, did); \ + deallocData(DataSpace::OmpTarget, unpack_ptrs); \ delete[] h_unpack_ptrs; diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 1fbac7291..b476d4255 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -261,13 +261,13 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) const size_t shmem_size = 0; Index_type* len; - allocCudaPinnedData(len, 1); + allocData(DataSpace::CudaPinned, len, 1); Index_type* block_counts; - allocCudaDeviceData(block_counts, grid_size); + allocData(DataSpace::CudaDevice, block_counts, grid_size); Index_type* grid_counts; - allocCudaDeviceData(grid_counts, grid_size); + allocData(DataSpace::CudaDevice, grid_counts, grid_size); unsigned* block_readys; - allocCudaDeviceData(block_readys, grid_size); + allocData(DataSpace::CudaDevice, block_readys, grid_size); cudaErrchk( cudaMemset(block_readys, 0, sizeof(unsigned)*grid_size) ); startTimer(); @@ -286,10 +286,10 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) } stopTimer(); - deallocCudaPinnedData(len); - deallocCudaDeviceData(block_counts); - deallocCudaDeviceData(grid_counts); - deallocCudaDeviceData(block_readys); + deallocData(DataSpace::CudaPinned, len); + deallocData(DataSpace::CudaDevice, block_counts); + deallocData(DataSpace::CudaDevice, grid_counts); + deallocData(DataSpace::CudaDevice, block_readys); } else { getCout() << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 38ccb0f09..3ae245616 100644 --- 
a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -261,13 +261,13 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) const size_t shmem_size = 0; Index_type* len; - allocHipPinnedData(len, 1); + allocData(DataSpace::HipPinned, len, 1); Index_type* block_counts; - allocHipDeviceData(block_counts, grid_size); + allocData(DataSpace::HipDevice, block_counts, grid_size); Index_type* grid_counts; - allocHipDeviceData(grid_counts, grid_size); + allocData(DataSpace::HipDevice, grid_counts, grid_size); unsigned* block_readys; - allocHipDeviceData(block_readys, grid_size); + allocData(DataSpace::HipDevice, block_readys, grid_size); hipErrchk( hipMemset(block_readys, 0, sizeof(unsigned)*grid_size) ); startTimer(); @@ -286,10 +286,10 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocHipPinnedData(len); - deallocHipDeviceData(block_counts); - deallocHipDeviceData(grid_counts); - deallocHipDeviceData(block_readys); + deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipDevice, block_counts); + deallocData(DataSpace::HipDevice, grid_counts); + deallocData(DataSpace::HipDevice, block_readys); } else { getCout() << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 8e086ab6d..b8fde073c 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -23,10 +23,10 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_CUDA \ Index_type* counts; \ - allocCudaData(counts, iend+1); + allocData(DataSpace::CudaDevice, counts, iend+1); #define INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA \ - deallocCudaData(counts); + deallocData(DataSpace::CudaDevice, counts); template < size_t block_size > @@ -72,7 +72,7 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_CUDA; Index_type* len; - allocCudaPinnedData(len, 1); + allocData(DataSpace::CudaPinned, len, 1); cudaStream_t stream = RAJA::resources::Cuda::get_default().get_stream(); @@ -91,7 +91,7 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) stream)); unsigned char* temp_storage; - allocCudaDeviceData(temp_storage, temp_storage_bytes); + allocData(DataSpace::CudaDevice, temp_storage, temp_storage_bytes); d_temp_storage = temp_storage; startTimer(); @@ -121,8 +121,8 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) } stopTimer(); - deallocCudaDeviceData(temp_storage); - deallocCudaPinnedData(len); + deallocData(DataSpace::CudaDevice, temp_storage); + deallocData(DataSpace::CudaPinned, len); INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 65f63db77..b53cb9450 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -23,10 +23,10 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_HIP \ Index_type* counts; \ - allocHipData(counts, iend+1); + allocData(DataSpace::HipDevice, counts, iend+1); #define INDEXLIST_3LOOP_DATA_TEARDOWN_HIP \ - deallocHipData(counts); + deallocData(DataSpace::HipDevice, counts); template < size_t block_size > @@ -72,7 +72,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; Index_type* len; - allocHipPinnedData(len, 1); + allocData(DataSpace::HipPinned, len, 1); hipStream_t stream = RAJA::resources::Hip::get_default().get_stream(); @@ -102,7 +102,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) #endif unsigned char* temp_storage; - allocHipDeviceData(temp_storage, 
temp_storage_bytes); + allocData(DataSpace::HipDevice, temp_storage, temp_storage_bytes); d_temp_storage = temp_storage; startTimer(); @@ -143,8 +143,8 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocHipDeviceData(temp_storage); - deallocHipPinnedData(len); + deallocData(DataSpace::HipDevice, temp_storage); + deallocData(DataSpace::HipPinned, len); INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp index d73c193cb..e30c7a712 100644 --- a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -29,10 +29,10 @@ namespace basic int did = omp_get_default_device(); \ \ Index_type* counts = nullptr; \ - allocOpenMPDeviceData(counts, iend+1, did); + allocData(DataSpace::OmpTarget, counts, iend+1); #define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(counts, did); + deallocData(DataSpace::OmpTarget, counts); #endif diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 3fb294cb1..495c6e8d3 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -70,7 +70,7 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { Real_ptr dpi; - allocAndInitCudaDeviceData(dpi, &m_pi_init, 1); + allocData(DataSpace::CudaDevice, dpi, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -93,7 +93,7 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) } stopTimer(); - deallocCudaDeviceData(dpi); + deallocData(DataSpace::CudaDevice, dpi); } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index c0ec0becf..2e8fc70c7 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -70,7 +70,7 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { Real_ptr dpi; - allocAndInitHipDeviceData(dpi, &m_pi_init, 1); + allocData(DataSpace::HipDevice, dpi, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,7 +92,7 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocHipDeviceData(dpi); + deallocData(DataSpace::HipDevice, dpi); } else if ( vid == RAJA_HIP ) { diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index f2b023dcd..10fd2a4a5 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -85,10 +85,10 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { Int_ptr vmem_init; - allocCudaPinnedData(vmem_init, 3); + allocData(DataSpace::CudaPinned, vmem_init, 3); Int_ptr vmem; - allocCudaDeviceData(vmem, 3); + allocData(DataSpace::CudaDevice, vmem, 3); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -118,8 +118,8 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) } stopTimer(); - deallocCudaDeviceData(vmem); - deallocCudaPinnedData(vmem_init); + deallocData(DataSpace::CudaDevice, vmem); + deallocData(DataSpace::CudaPinned, vmem_init); } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 33a8e7f7a..f5fa49149 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -85,10 +85,10 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { Int_ptr vmem_init; - allocHipPinnedData(vmem_init, 3); + allocData(DataSpace::HipPinned, vmem_init, 3); Int_ptr vmem; - allocHipDeviceData(vmem, 3); + allocData(DataSpace::HipDevice, vmem, 3); 
startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -118,8 +118,8 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocHipDeviceData(vmem); - deallocHipPinnedData(vmem_init); + deallocData(DataSpace::HipDevice, vmem); + deallocData(DataSpace::HipPinned, vmem_init); } else if ( vid == RAJA_HIP ) { diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 677427128..229ed129a 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -107,7 +107,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocCudaDeviceData(mem,6); + allocData(DataSpace::CudaDevice, mem,6); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -138,7 +138,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) } stopTimer(); - deallocCudaDeviceData(mem); + deallocData(DataSpace::CudaDevice, mem); } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index be26c830d..4d0acacaa 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -108,7 +108,7 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocHipDeviceData(mem,6); + allocData(DataSpace::HipDevice, mem,6); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -141,7 +141,7 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocHipDeviceData(mem); + deallocData(DataSpace::HipDevice, mem); } else if ( vid == RAJA_HIP ) { diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 150e5dfc5..8890192e4 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -90,7 +90,7 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { Real_ptr sumx; - allocAndInitCudaDeviceData(sumx, &m_sumx_init, 1); + allocData(DataSpace::CudaDevice, sumx, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -114,7 +114,7 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) } stopTimer(); - deallocCudaDeviceData(sumx); + deallocData(DataSpace::CudaDevice, sumx); } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 0c6c32abf..9d9bb9e67 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -90,7 +90,7 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { Real_ptr sumx; - allocAndInitHipDeviceData(sumx, &m_sumx_init, 1); + allocData(DataSpace::HipDevice, sumx, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -113,7 +113,7 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocHipDeviceData(sumx); + deallocData(DataSpace::HipDevice, sumx); } else if ( vid == RAJA_HIP ) { diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 6d49fa0fd..413503d53 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -29,9 +29,6 @@ namespace rajaperf { -namespace detail -{ - /*! * \brief Device timer, returns a time in ns from an arbitrary starting point. * Note that this time is consistent across the whole device. @@ -116,6 +113,9 @@ __device__ inline Index_type lambda_cuda_get_index() } +namespace detail +{ + /* * Copy memory len bytes from src to dst. 
*/ @@ -125,123 +125,91 @@ inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len) cudaMemcpyDefault ) ); } -/*! - * \brief Copy given hptr (host) data to CUDA device (dptr). - * - * Method assumes both host and device data arrays are allocated - * and of propoer size for copy operation to succeed. - */ -template -void initCudaDeviceData(T& dptr, const T hptr, int len) -{ - cudaErrchk( cudaMemcpy( dptr, hptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyHostToDevice ) ); -} - /*! * \brief Allocate CUDA device data array (dptr). */ -template -void allocCudaDeviceData(T& dptr, int len) +inline void* allocCudaDeviceData(size_t len) { - cudaErrchk( cudaMalloc( (void**)&dptr, - len * sizeof(typename std::remove_pointer::type) ) ); + void* dptr = nullptr; + cudaErrchk( cudaMalloc( &dptr, len ) ); + return dptr; } /*! * \brief Allocate CUDA managed data array (dptr). */ -template -void allocCudaManagedData(T& mptr, int len) +inline void* allocCudaManagedData(size_t len) { - cudaErrchk( cudaMallocManaged( (void**)&mptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemAttachGlobal ) ); + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + return mptr; } /*! * \brief Allocate CUDA pinned data array (pptr). */ -template -void allocCudaPinnedData(T& pptr, int len) -{ - cudaErrchk( cudaHostAlloc( (void**)&pptr, - len * sizeof(typename std::remove_pointer::type), - cudaHostAllocMapped ) ); -} - -/*! - * \brief Allocate CUDA device data array (dptr) and copy given hptr (host) - * data to device array. - */ -template -void allocAndInitCudaDeviceData(T& dptr, const T hptr, int len) +inline void* allocCudaPinnedData(size_t len) { - allocCudaDeviceData(dptr, len); - initCudaDeviceData(dptr, hptr, len); + void* pptr = nullptr; + cudaErrchk( cudaHostAlloc( &pptr, len, cudaHostAllocMapped ) ); + return pptr; } -/*! - * \brief Copy given dptr (CUDA device) data to host (hptr). - * - * Method assumes both host and device data arrays are allocated - * and of propoer size for copy operation to succeed. - */ -template -void getCudaDeviceData(T& hptr, const T dptr, int len) -{ - cudaErrchk( cudaMemcpy( hptr, dptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyDeviceToHost ) ); -} /*! * \brief Free device data array. */ -template -void deallocCudaDeviceData(T& dptr) +inline void deallocCudaDeviceData(void* dptr) { cudaErrchk( cudaFree( dptr ) ); - dptr = nullptr; } /*! * \brief Free managed data array. */ -template -void deallocCudaManagedData(T& mptr) +inline void deallocCudaManagedData(void* mptr) { cudaErrchk( cudaFree( mptr ) ); - mptr = nullptr; } /*! * \brief Free pinned data array. */ -template -void deallocCudaPinnedData(T& pptr) +inline void deallocCudaPinnedData(void* pptr) { cudaErrchk( cudaFreeHost( pptr ) ); - pptr = nullptr; } +} // closing brace for detail namespace + + /*! - * \brief Copy given cptr (CUDA) data to host (hptr). + * \brief Copy given hptr (host) data to CUDA device (dptr). * * Method assumes both host and device data arrays are allocated * and of propoer size for copy operation to succeed. 
*/ template -void getCudaData(T& hptr, const T cptr, int len) +void initCudaDeviceData(T& dptr, const T hptr, int len) { - cudaErrchk( cudaMemcpy( hptr, cptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyDefault ) ); + cudaErrchk( cudaMemcpy( dptr, hptr, + len * sizeof(typename std::remove_pointer::type), + cudaMemcpyHostToDevice ) ); } - -} // closing brace for detail namespace +/*! + * \brief Copy given dptr (CUDA device) data to host (hptr). + * + * Method assumes both host and device data arrays are allocated + * and of propoer size for copy operation to succeed. + */ +template +void getCudaDeviceData(T& hptr, const T dptr, int len) +{ + cudaErrchk( cudaMemcpy( hptr, dptr, + len * sizeof(typename std::remove_pointer::type), + cudaMemcpyDeviceToHost ) ); +} } // closing brace for rajaperf namespace diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 11eeea04a..317555060 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -53,109 +53,24 @@ void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) /* * Allocate data arrays of given type. */ -void allocHostData(Int_ptr& ptr, int len, int align) +void* allocHostData(size_t len, size_t align) { - ptr = RAJA::allocate_aligned_type( - align, len*sizeof(Int_type)); -} -/// -void allocHostData(Index_type*& ptr, int len, int align) -{ - ptr = RAJA::allocate_aligned_type( - align, len*sizeof(Index_type)); -} - -void allocHostData(Real_ptr& ptr, int len, int align) -{ - ptr = RAJA::allocate_aligned_type( - align, len*sizeof(Real_type)); -} - -void allocHostData(Complex_ptr& ptr, int len, int align) -{ - ptr = RAJA::allocate_aligned_type( - align, len*sizeof(Complex_type)); + return RAJA::allocate_aligned_type( + align, len); } /* * Free data arrays of given type. */ -void deallocHostData(Int_ptr& ptr) -{ - if (ptr) { - RAJA::free_aligned(ptr); - ptr = nullptr; - } -} - -void deallocHostData(Index_type*& ptr) -{ - if (ptr) { - RAJA::free_aligned(ptr); - ptr = nullptr; - } -} - -void deallocHostData(Real_ptr& ptr) -{ - if (ptr) { - RAJA::free_aligned(ptr); - ptr = nullptr; - } -} - -void deallocHostData(Complex_ptr& ptr) +void deallocHostData(void* ptr) { if (ptr) { RAJA::free_aligned(ptr); - ptr = nullptr; } } -/* - * \brief Touch Int_type data array with omp threads. - */ -void touchOmpData(Int_ptr& ptr, int len) -{ -// First touch... -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = -987654321; - }; -#endif -} - -/* - * \brief Touch Real_type data array with omp threads. - */ -void touchOmpData(Real_ptr& ptr, int len) -{ -// First touch... -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = -(i + 1.11111111)/(i + 1.23456789); - }; -#endif -} - -/* - * \brief Touch Complex_type data array with omp threads. - */ -void touchOmpData(Complex_ptr& ptr, int len) -{ -// First touch... -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = -(i + 1.11111111)/(i + 1.23456789); - }; -#endif -} - /* * \brief Initialize Int_type data array to * randomly signed positive and negative values. diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 8bc83b6f7..a1ed34142 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -52,40 +52,30 @@ void copyHostData(void* dst_ptr, const void* src_ptr, size_t len); /*! * \brief Allocate data arrays. 
*/ -void allocHostData(Int_ptr& ptr, int len, int align); -/// -void allocHostData(Index_type*& ptr, int len, int align); -/// -void allocHostData(Real_ptr& ptr, int len, int align); -/// -void allocHostData(Complex_ptr& ptr, int len, int align); +void* allocHostData(int len, int align); /*! * \brief Free data arrays. */ -void deallocHostData(Int_ptr& ptr); -/// -void deallocHostData(Index_type*& ptr); -/// -void deallocHostData(Real_ptr& ptr); -/// -void deallocHostData(Complex_ptr& ptr); +void deallocHostData(void* ptr); -/*! - * \brief Touch Int_type data array with omp threads. - */ -void touchOmpData(Int_ptr& ptr, int len); - -/*! - * \brief Touch Real_type data array with omp threads. - */ -void touchOmpData(Real_ptr& ptr, int len); - -/*! - * \brief Touch Complex_type data array with omp threads. +/* + * \brief Touch data array with omp threads. */ -void touchOmpData(Complex_ptr& ptr, int len); +template < typename T > +void touchOmpData(T* ptr, int len) +{ +// First touch... +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + #pragma omp parallel for + for (int i = 0; i < len; ++i) { + ptr[i] = std::numeric_limits::max() - i; + }; +#else + (void)ptr; (void)len; +#endif +} /*! * \brief Initialize Int_type data array. @@ -287,88 +277,91 @@ inline bool isHipDataSpace(DataSpace dataSpace) * \brief Allocate data array (ptr). */ template -inline void allocData(DataSpace dataSpace, T& ptr, int len, int align) +inline void allocData(DataSpace dataSpace, T& ptr_ref, int len, int align) { + void* ptr = nullptr; + size_t nbytes = len*sizeof(std::remove_pointer_t); + switch (dataSpace) { case DataSpace::Host: { - detail::allocHostData(ptr, len, align); + ptr = detail::allocHostData(nbytes, align); } break; #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) case DataSpace::Omp: { - detail::allocHostData(ptr, len, align); - detail::touchOmpData(ptr, len); + ptr = detail::allocHostData(nbytes, align); + detail::touchOmpData(static_cast(ptr), len); } break; #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) case DataSpace::OmpTarget: { - detail::allocOpenMPDeviceData(ptr, len); + ptr = detail::allocOpenMPDeviceData(nbytes); } break; #endif #if defined(RAJA_ENABLE_CUDA) case DataSpace::CudaPinned: { - detail::allocCudaPinnedData(ptr, len); + ptr = detail::allocCudaPinnedData(nbytes); } break; case DataSpace::CudaManaged: { - detail::allocCudaManagedData(ptr, len); + ptr = detail::allocCudaManagedData(nbytes); } break; case DataSpace::CudaDevice: { - detail::allocCudaDeviceData(ptr, len); + ptr = detail::allocCudaDeviceData(nbytes); } break; #endif #if defined(RAJA_ENABLE_HIP) case DataSpace::HipHostAdviseFine: { - detail::allocHostData(ptr, len, align); - detail::adviseHipFineData(ptr, len); + ptr = detail::allocHostData(nbytes, align); + detail::adviseHipFineData(ptr, nbytes); } break; case DataSpace::HipHostAdviseCoarse: { - detail::allocHostData(ptr, len, align); - detail::adviseHipCoarseData(ptr, len); + ptr = detail::allocHostData(nbytes, align); + detail::adviseHipCoarseData(ptr, nbytes); } break; case DataSpace::HipPinned: { - detail::allocHipPinnedData(ptr, len); + ptr = detail::allocHipPinnedData(nbytes); } break; case DataSpace::HipPinnedFine: { - detail::allocHipPinnedFineData(ptr, len); + ptr = detail::allocHipPinnedFineData(nbytes); } break; case DataSpace::HipPinnedCoarse: { - detail::allocHipPinnedCoarseData(ptr, len); + ptr = detail::allocHipPinnedCoarseData(nbytes); } break; case DataSpace::HipManaged: { - detail::allocHipManagedData(ptr, len); + ptr = 
detail::allocHipManagedData(nbytes); } break; case DataSpace::HipManagedAdviseFine: { - detail::allocHipManagedData(ptr, len); - detail::adviseHipFineData(ptr, len); + ptr = detail::allocHipManagedData(nbytes); + detail::adviseHipFineData(ptr, nbytes); } break; case DataSpace::HipManagedAdviseCoarse: { - detail::allocHipManagedData(ptr, len); - detail::adviseHipCoarseData(ptr, len); + ptr = detail::allocHipManagedData(nbytes); + detail::adviseHipCoarseData(ptr, nbytes); } break; case DataSpace::HipDevice: { - detail::allocHipDeviceData(ptr, len); + ptr = detail::allocHipDeviceData(nbytes); } break; case DataSpace::HipDeviceFine: { - detail::allocHipDeviceFineData(ptr, len); + ptr = detail::allocHipDeviceFineData(nbytes); } break; #endif @@ -377,6 +370,7 @@ inline void allocData(DataSpace dataSpace, T& ptr, int len, int align) throw std::invalid_argument("allocData : Unknown data space"); } break; } + ptr_ref = static_cast(ptr); } /*! @@ -441,6 +435,7 @@ inline void deallocData(DataSpace dataSpace, T& ptr) throw std::invalid_argument("deallocData : Unknown data space"); } break; } + ptr = nullptr; } /*! @@ -451,9 +446,11 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, DataSpace src_dataSpace, const T* src_ptr, int len) { + size_t nbytes = len*sizeof(T); + if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && hostAccessibleDataSpace(src_dataSpace) == src_dataSpace) { - detail::copyHostData(dst_ptr, src_ptr, sizeof(T)*len); + detail::copyHostData(dst_ptr, src_ptr, nbytes); } #if defined(RAJA_ENABLE_TARGET_OPENMP) @@ -463,7 +460,7 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, : omp_get_initial_device(); auto src_did = isOpenMPTargetDataSpace(src_dataSpace) ? omp_get_default_device() : omp_get_initial_device(); - detail::copyOpenMPTargetData(dst_ptr, src_ptr, sizeof(T)*len, + detail::copyOpenMPTargetData(dst_ptr, src_ptr, nbytes, dst_did, src_did); } #endif @@ -471,14 +468,14 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, #if defined(RAJA_ENABLE_CUDA) else if (isCudaDataSpace(dst_dataSpace) || isCudaDataSpace(src_dataSpace)) { - detail::copyCudaData(dst_ptr, src_ptr, sizeof(T)*len); + detail::copyCudaData(dst_ptr, src_ptr, nbytes); } #endif #if defined(RAJA_ENABLE_HIP) else if (isHipDataSpace(dst_dataSpace) || isHipDataSpace(src_dataSpace)) { - detail::copyHipData(dst_ptr, src_ptr, sizeof(T)*len); + detail::copyHipData(dst_ptr, src_ptr, nbytes); } #endif diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 6debbe54f..9bd9b5da0 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -29,9 +29,6 @@ namespace rajaperf { -namespace detail -{ - /*! * \brief Simple forall hip kernel that runs a lambda. */ @@ -102,6 +99,10 @@ __device__ inline Index_type lambda_hip_get_index() { return blockIdx.z; } + +namespace detail +{ + /*! * \brief Get current hip device. */ @@ -121,180 +122,144 @@ inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len) hipMemcpyDefault ) ); } -/*! - * \brief Copy given hptr (host) data to HIP device (dptr). - * - * Method assumes both host and device data arrays are allocated - * and of propoer size for copy operation to succeed. - */ -template -void initHipDeviceData(T& dptr, const T hptr, int len) -{ - hipErrchk( hipMemcpy( dptr, hptr, - len * sizeof(typename std::remove_pointer::type), - hipMemcpyHostToDevice ) ); -} - /*! * \brief Allocate HIP device data array (dptr). 
*/ -template -void allocHipDeviceData(T& dptr, int len) +inline void* allocHipDeviceData(size_t len) { - hipErrchk( hipMalloc( (void**)&dptr, - len * sizeof(typename std::remove_pointer::type) ) ); + void* dptr = nullptr; + hipErrchk( hipMalloc( &dptr, len ) ); + return dptr; } /*! * \brief Allocate HIP fine-grained device data array (dfptr). */ -template -void allocHipDeviceFineData(T& dfptr, int len) +inline void* allocHipDeviceFineData(size_t len) { - hipErrchk( hipExtMallocWithFlags( (void**)&dfptr, - len * sizeof(typename std::remove_pointer::type), + void* dfptr = nullptr; + hipErrchk( hipExtMallocWithFlags( &dfptr, len, hipDeviceMallocFinegrained ) ); + return dfptr; } /*! * \brief Allocate HIP managed data array (mptr). */ -template -void allocHipManagedData(T& mptr, int len) +inline void* allocHipManagedData(size_t len) { - hipErrchk( hipMallocManaged( (void**)&mptr, - len * sizeof(typename std::remove_pointer::type), + void* mptr = nullptr; + hipErrchk( hipMallocManaged( &mptr, len, hipMemAttachGlobal ) ); + return mptr; } /*! * \brief Allocate HIP pinned data array (pptr). */ -template -void allocHipPinnedData(T& pptr, int len) +inline void* allocHipPinnedData(size_t len) { - hipErrchk( hipHostMalloc( (void**)&pptr, - len * sizeof(typename std::remove_pointer::type), + void* pptr = nullptr; + hipErrchk( hipHostMalloc( &pptr, len, hipHostMallocMapped ) ); + return pptr; } /*! * \brief Allocate HIP fine-grained pinned data array (pfptr). */ -template -void allocHipPinnedFineData(T& pfptr, int len) +inline void* allocHipPinnedFineData(size_t len) { - hipErrchk( hipHostMalloc( (void**)&pfptr, - len * sizeof(typename std::remove_pointer::type), + void* pfptr = nullptr; + hipErrchk( hipHostMalloc( &pfptr, len, hipHostMallocMapped | hipHostMallocCoherent ) ); + return pfptr; } /*! * \brief Allocate HIP coarse-grained pinned data array (pcptr). */ -template -void allocHipPinnedCoarseData(T& pcptr, int len) +inline void* allocHipPinnedCoarseData(size_t len) { - hipErrchk( hipHostMalloc( (void**)&pcptr, - len * sizeof(typename std::remove_pointer::type), + void* pcptr = nullptr; + hipErrchk( hipHostMalloc( &pcptr, len, hipHostMallocMapped | hipHostMallocNonCoherent ) ); + return pcptr; } /*! * \brief Apply mem advice to HIP data array (ptr). */ -template -void adviseHipData(T& ptr, int len, hipMemoryAdvise advice, int device) +inline void adviseHipData(void* ptr, int len, hipMemoryAdvise advice, int device) { - hipErrchk( hipMemAdvise( (void*)ptr, - len * sizeof(typename std::remove_pointer::type), - advice, device ) ); + hipErrchk( hipMemAdvise( ptr, len, advice, device ) ); } -template -void adviseHipCoarseData(T& ptr, int len) +inline void adviseHipCoarseData(void* ptr, size_t len) { adviseHipData(ptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); } -template -void adviseHipFineData(T& ptr, int len) +inline void adviseHipFineData(void* ptr, size_t len) { adviseHipData(ptr, len, hipMemAdviseUnsetCoarseGrain, getHipDevice()); } -/*! - * \brief Allocate HIP device data array (dptr) and copy given hptr (host) - * data to device array. - */ -template -void allocAndInitHipDeviceData(T& dptr, const T hptr, int len) -{ - allocHipDeviceData(dptr, len); - initHipDeviceData(dptr, hptr, len); -} - -/*! - * \brief Copy given dptr (HIP device) data to host (hptr). - * - * Method assumes both host and device data arrays are allocated - * and of propoer size for copy operation to succeed. 
- */ -template -void getHipDeviceData(T& hptr, const T dptr, int len) -{ - hipErrchk( hipMemcpy( hptr, dptr, - len * sizeof(typename std::remove_pointer::type), - hipMemcpyDeviceToHost ) ); -} /*! * \brief Free device data array. */ -template -void deallocHipDeviceData(T& dptr) +inline void deallocHipDeviceData(void* dptr) { hipErrchk( hipFree( dptr ) ); - dptr = nullptr; } /*! * \brief Free managed data array. */ -template -void deallocHipManagedData(T& mptr) +inline void deallocHipManagedData(void* mptr) { hipErrchk( hipFree( mptr ) ); - mptr = nullptr; } /*! * \brief Free pinned data array. */ -template -void deallocHipPinnedData(T& pptr) +inline void deallocHipPinnedData(void* pptr) { hipErrchk( hipHostFree( pptr ) ); - pptr = nullptr; } +} // closing brace for detail namespace + + /*! - * \brief Copy given cptr (HIP) data to host (hptr). + * \brief Copy given hptr (host) data to HIP device (dptr). + * + * Method assumes both host and device data arrays are allocated + * and of propoer size for copy operation to succeed. + */ +template +void initHipDeviceData(T& dptr, const T hptr, int len) +{ + hipErrchk( hipMemcpy( dptr, hptr, + len * sizeof(typename std::remove_pointer::type), + hipMemcpyHostToDevice ) ); +} +/*! + * \brief Copy given dptr (HIP device) data to host (hptr). * * Method assumes both host and device data arrays are allocated * and of propoer size for copy operation to succeed. */ template -void getHipData(T& hptr, const T cptr, int len) +void getHipDeviceData(T& hptr, const T dptr, int len) { - hipErrchk( hipMemcpy( hptr, cptr, + hipErrchk( hipMemcpy( hptr, dptr, len * sizeof(typename std::remove_pointer::type), - hipMemcpyDefault ) ); + hipMemcpyDeviceToHost ) ); } - -} // closing brace for detail namespace - } // closing brace for rajaperf namespace #endif // RAJA_ENABLE_HIP diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index cc5f18470..3291bd172 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -251,6 +251,18 @@ class KernelBase } } + template + void allocData(DataSpace dataSpace, T& ptr, int len) + { + rajaperf::allocData(dataSpace, + ptr, len, getDataAlignment()); + } + template + void deallocData(DataSpace dataSpace, T& ptr) + { + rajaperf::deallocData(dataSpace, ptr); + } + template void allocData(T*& ptr, int len, VariantID vid) { diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index dfb8f4c00..e27c695d2 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -38,21 +38,27 @@ inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, } /*! - * \brief Copy given hptr (host) data to device (dptr). - * - * Method assumes both host and device data arrays are allocated - * and of propoer size for copy operation to succeed. + * \brief Allocate device data array (dptr) and copy given hptr (host) + * data to device array. */ -template -void copyOpenMPDeviceData(T& dptr, const T hptr, int len, - int did = omp_get_default_device(), - int hid = omp_get_initial_device()) +inline void* allocOpenMPDeviceData(size_t len, + int did = omp_get_default_device()) { - omp_target_memcpy( dptr, hptr, - len * sizeof(typename std::remove_pointer::type), - 0, 0, did, hid ); + return omp_target_alloc( len, did); +} + +/*! + * \brief Free device data array. + */ +inline void deallocOpenMPDeviceData(void* dptr, + int did = omp_get_default_device()) +{ + omp_target_free( dptr, did ); } +} // closing brace for detail namespace + + /*! 
* \brief Copy given hptr (host) data to device (dptr). * @@ -67,32 +73,6 @@ void initOpenMPDeviceData(T& dptr, const T hptr, int len, copyOpenMPDeviceData(dptr, hptr, len, did, hid); } -/*! - * \brief Allocate device data array (dptr) and copy given hptr (host) - * data to device array. - */ -template -void allocOpenMPDeviceData(T& dptr, int len, - int did = omp_get_default_device()) -{ - dptr = static_cast( omp_target_alloc( - len * sizeof(typename std::remove_pointer::type), - did) ); -} - -/*! - * \brief Allocate device data array (dptr) and copy given hptr (host) - * data to device array. - */ -template -void allocAndInitOpenMPDeviceData(T& dptr, const T hptr, int len, - int did = omp_get_default_device(), - int hid = omp_get_initial_device()) -{ - allocOpenMPDeviceData(dptr, len, did); - initOpenMPDeviceData(dptr, hptr, len, did, hid); -} - /*! * \brief Copy given device ptr (dptr) data to host ptr (hptr). * @@ -109,20 +89,6 @@ void getOpenMPDeviceData(T& hptr, const T dptr, int len, 0, 0, hid, did ); } -/*! - * \brief Free device data array. - */ -template -void deallocOpenMPDeviceData(T& dptr, - int did = omp_get_default_device()) -{ - omp_target_free( dptr, did ); - dptr = 0; -} - - -} // closing brace for detail namespace - } // closing brace for rajaperf namespace #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index d4dc1d55c..3df7bbaee 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -70,7 +70,7 @@ void DOT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { Real_ptr dprod; - allocAndInitCudaDeviceData(dprod, &m_dot_init, 1); + allocData(DataSpace::CudaDevice, dprod, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -90,7 +90,7 @@ void DOT::runCudaVariantImpl(VariantID vid) } stopTimer(); - deallocCudaDeviceData(dprod); + deallocData(DataSpace::CudaDevice, dprod); } else if ( vid == RAJA_CUDA ) { diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 27f724d1f..0411d2ce0 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -71,7 +71,7 @@ void DOT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { Real_ptr dprod; - allocAndInitHipDeviceData(dprod, &m_dot_init, 1); + allocData(DataSpace::HipDevice, dprod, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,7 +92,7 @@ void DOT::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocHipDeviceData(dprod); + deallocData(DataSpace::HipDevice, dprod); } else if ( vid == RAJA_HIP ) { From 44fc603f5770147c7202b16c836d2a2578aceded Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Mar 2023 10:17:08 -0700 Subject: [PATCH 094/137] Check error of cuda/hip copy to signal --- src/apps/FIR-Cuda.cpp | 2 +- src/apps/FIR-Hip.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 32e6227e5..6cfb91f36 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -30,7 +30,7 @@ namespace apps __constant__ Real_type coeff[FIR_COEFFLEN]; #define FIR_DATA_SETUP_CUDA \ - cudaMemcpyToSymbol(coeff, coeff_array, FIR_COEFFLEN * sizeof(Real_type)); + cudaErrchk(cudaMemcpyToSymbol(coeff, coeff_array, FIR_COEFFLEN * sizeof(Real_type))); #define FIR_DATA_TEARDOWN_CUDA diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 5147e3f16..e6d23cb13 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -30,7 +30,7 @@ namespace apps __constant__ Real_type coeff[FIR_COEFFLEN]; #define 
FIR_DATA_SETUP_HIP \ - hipMemcpyToSymbol(HIP_SYMBOL(coeff), coeff_array, FIR_COEFFLEN * sizeof(Real_type), 0, hipMemcpyHostToDevice); + hipErrchk( hipMemcpyToSymbol(HIP_SYMBOL(coeff), coeff_array, FIR_COEFFLEN * sizeof(Real_type), 0, hipMemcpyHostToDevice)); #define FIR_DATA_TEARDOWN_HIP From b0f3c6f8cb459af5ff940025f748c73d7e4920c4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Mar 2023 11:12:39 -0700 Subject: [PATCH 095/137] Use setup on host Use new allocSetupData calls to get accessible memory and then move it as necessary --- src/apps/DEL_DOT_VEC_2D.cpp | 4 +-- src/apps/HALOEXCHANGE.cpp | 6 ++-- src/apps/HALOEXCHANGE_FUSED.cpp | 6 ++-- src/apps/VOL3D.cpp | 6 ++-- src/basic/REDUCE_STRUCT.cpp | 8 ++--- src/common/DataUtils.hpp | 59 +++++++++++++++++++++++++++++++-- src/common/KernelBase.hpp | 46 +++++++++++++++++++++++++ src/lcals/FIRST_MIN.cpp | 2 +- 8 files changed, 119 insertions(+), 18 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 3c1298670..3ad46d719 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -71,8 +71,8 @@ DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_x, m_array_length, 0.0, vid); - allocAndInitDataConst(m_y, m_array_length, 0.0, vid); + auto finalize_x = allocAndInitSetupDataConst(m_x, m_array_length, 0.0, vid); + auto finalize_y = allocAndInitSetupDataConst(m_y, m_array_length, 0.0, vid); Real_type dx = 0.2; Real_type dy = 0.1; diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 6b5304d6c..29a4cced3 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -86,7 +86,7 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); + auto finalize_var = allocAndInitSetupData(m_vars[v], m_var_size, vid); Real_ptr var = m_vars[v]; @@ -260,7 +260,7 @@ void HALOEXCHANGE::create_pack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); + auto finalize_list = allocAndInitSetupData(pack_index_lists[l], pack_index_list_lengths[l], vid); Int_ptr pack_list = pack_index_lists[l]; @@ -405,7 +405,7 @@ void HALOEXCHANGE::create_unpack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + auto finalize_list = allocAndInitSetupData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); Int_ptr unpack_list = unpack_index_lists[l]; diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 25385ace8..5e26e40a2 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -86,7 +86,7 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); + auto finalize_var = allocAndInitSetupData(m_vars[v], m_var_size, vid); Real_ptr var = m_vars[v]; @@ -260,7 +260,7 @@ void HALOEXCHANGE_FUSED::create_pack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); + auto finalize_list = allocAndInitSetupData(pack_index_lists[l], pack_index_list_lengths[l], vid); 
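      // `finalize_list` above is an AutoDataMover, the RAII helper added in this
      // commit (defined in DataUtils.hpp below). allocAndInitSetupData allocates
      // and initializes the array in a host-accessible DataSpace so the host loop
      // that follows can fill it; when the AutoDataMover is destroyed, or its
      // finalize() is called, the data is moved to the DataSpace the variant
      // actually runs in. A minimal sketch of the idea, with `fill_on_host` as a
      // purely illustrative stand-in for the host-side initialization:
      //
      //   {
      //     auto mover = allocAndInitSetupData(ptr, len, vid); // host accessible here
      //     fill_on_host(ptr, len);
      //   } // destructor moves ptr to getDataSpace(vid)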
Int_ptr pack_list = pack_index_lists[l]; @@ -405,7 +405,7 @@ void HALOEXCHANGE_FUSED::create_unpack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + auto finalize_list = allocAndInitSetupData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); Int_ptr unpack_list = unpack_index_lists[l]; diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 94957f135..a0ce4ae52 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -73,9 +73,9 @@ VOL3D::~VOL3D() void VOL3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_x, m_array_length, 0.0, vid); - allocAndInitDataConst(m_y, m_array_length, 0.0, vid); - allocAndInitDataConst(m_z, m_array_length, 0.0, vid); + auto finalize_x = allocAndInitSetupDataConst(m_x, m_array_length, 0.0, vid); + auto finalize_y = allocAndInitSetupDataConst(m_y, m_array_length, 0.0, vid); + auto finalize_z = allocAndInitSetupDataConst(m_z, m_array_length, 0.0, vid); Real_type dx = 0.3; Real_type dy = 0.2; diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 5edaadede..d17220e7e 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -67,13 +67,13 @@ void REDUCE_STRUCT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) m_init_sum = 0.0; m_init_min = std::numeric_limits::max(); m_init_max = std::numeric_limits::lowest(); - allocAndInitData(m_x, getActualProblemSize(), vid); - allocAndInitData(m_y, getActualProblemSize(), vid); + auto finalize_x = allocAndInitSetupData(m_x, getActualProblemSize(), vid); + auto finalize_y = allocAndInitSetupData(m_y, getActualProblemSize(), vid); Real_type dx = Lx/(Real_type)(getActualProblemSize()); Real_type dy = Ly/(Real_type)(getActualProblemSize()); for (int i=0;i::max() - i; + ptr[i] = T{}; }; #else (void)ptr; (void)len; @@ -505,6 +505,61 @@ inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, } } + +template +struct AutoDataMover +{ + AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace, + T*& ptr, int len, int align) + : m_ptr(&ptr) + , m_new_dataSpace(new_dataSpace) + , m_old_dataSpace(old_dataSpace) + , m_len(len) + , m_align(align) + { } + + AutoDataMover(AutoDataMover const&) = delete; + AutoDataMover& operator=(AutoDataMover const&) = delete; + + AutoDataMover(AutoDataMover&& rhs) + : m_ptr(std::exchange(rhs.m_ptr, nullptr)) + , m_new_dataSpace(rhs.m_new_dataSpace) + , m_old_dataSpace(rhs.m_old_dataSpace) + , m_len(rhs.m_len) + , m_align(rhs.m_align) + { } + AutoDataMover& operator=(AutoDataMover&& rhs) + { + m_ptr = std::exchange(rhs.m_ptr, nullptr); + m_new_dataSpace = rhs.m_new_dataSpace; + m_old_dataSpace = rhs.m_old_dataSpace; + m_len = rhs.m_len; + m_align = rhs.m_align; + return *this; + } + + void finalize() + { + if (m_ptr) { + moveData(m_new_dataSpace, m_old_dataSpace, + *m_ptr, m_len, m_align); + m_ptr = nullptr; + } + } + + ~AutoDataMover() + { + finalize(); + } + +private: + T** m_ptr; + DataSpace m_new_dataSpace; + DataSpace m_old_dataSpace; + int m_len; + int m_align; +}; + /*! * \brief Allocate and initialize data array. 
*/ diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 3291bd172..c48e779e4 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -32,6 +32,7 @@ #include #include #include +#include namespace rajaperf { @@ -251,6 +252,11 @@ class KernelBase } } + DataSpace getHostAccessibleDataSpace(VariantID vid) const + { + return hostAccessibleDataSpace(getDataSpace(vid)); + } + template void allocData(DataSpace dataSpace, T& ptr, int len) { @@ -298,6 +304,46 @@ class KernelBase ptr, len, getDataAlignment()); } + template + rajaperf::AutoDataMover allocSetupData(T*& ptr, int len, VariantID vid) + { + rajaperf::allocData(getHostAccessibleDataSpace(vid), + ptr, len, getDataAlignment()); + return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; + } + + template + rajaperf::AutoDataMover allocAndInitSetupData(T*& ptr, int len, VariantID vid) + { + rajaperf::allocAndInitData(getHostAccessibleDataSpace(vid), + ptr, len, getDataAlignment()); + return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; + } + + template + rajaperf::AutoDataMover allocAndInitSetupDataConst(T*& ptr, int len, T val, VariantID vid) + { + rajaperf::allocAndInitDataConst(getHostAccessibleDataSpace(vid), + ptr, len, getDataAlignment(), val); + return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; + } + + template + rajaperf::AutoDataMover allocAndInitSetupDataRandSign(T*& ptr, int len, VariantID vid) + { + rajaperf::allocAndInitDataRandSign(getHostAccessibleDataSpace(vid), + ptr, len, getDataAlignment()); + return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; + } + + template + rajaperf::AutoDataMover allocAndInitSetupDataRandValue(T*& ptr, int len, VariantID vid) + { + rajaperf::allocAndInitDataRandValue(getHostAccessibleDataSpace(vid), + ptr, len, getDataAlignment()); + return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; + } + template void deallocData(T*& ptr, VariantID vid) { diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 427cbe0a6..f87535ed4 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -67,7 +67,7 @@ FIRST_MIN::~FIRST_MIN() void FIRST_MIN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_x, m_N, 0.0, vid); + auto finalize_x = allocAndInitSetupDataConst(m_x, m_N, 0.0, vid); m_x[ m_N / 2 ] = -1.0e+10; m_xmin_init = m_x[0]; m_initloc = 0; From f76d16b8a9e77e99d8b002e7912da1207c1a9773 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Mar 2023 15:07:54 -0700 Subject: [PATCH 096/137] Fix NESTED_INIT::tearDown --- src/basic/NESTED_INIT.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index f9fe82e44..fc64f5a0d 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -83,8 +83,7 @@ void NESTED_INIT::updateChecksum(VariantID vid, size_t tune_idx) void NESTED_INIT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - RAJA::free_aligned(m_array); - m_array = 0; + deallocData(m_array, vid); } } // end namespace basic From dfcbe363651fa1827046ac29154c78f639fa9627 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Mar 2023 15:35:56 -0700 Subject: [PATCH 097/137] Fix PI_ATOMIC final value --- src/basic/PI_ATOMIC-Cuda.cpp | 12 ++++++------ src/basic/PI_ATOMIC-Hip.cpp | 12 ++++++------ src/basic/PI_ATOMIC-OMP.cpp 
| 6 +++--- src/basic/PI_ATOMIC-OMPTarget.cpp | 14 +++++++------- src/basic/PI_ATOMIC-Seq.cpp | 6 +++--- src/basic/PI_ATOMIC.cpp | 3 ++- src/basic/PI_ATOMIC.hpp | 1 + src/common/CudaDataUtils.hpp | 12 ++++-------- src/common/HipDataUtils.hpp | 12 ++++-------- src/common/OpenMPTargetDataUtils.hpp | 10 ++++------ 10 files changed, 40 insertions(+), 48 deletions(-) diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index abb93ece2..037399bfd 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -56,8 +56,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) pi_atomic<<>>( pi, dx, iend ); cudaErrchk( cudaGetLastError() ); - getCudaDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; + getCudaDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); @@ -77,8 +77,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) }); cudaErrchk( cudaGetLastError() ); - getCudaDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; + getCudaDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); @@ -96,8 +96,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - getCudaDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; + getCudaDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 361e3b01d..c58709bec 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -56,8 +56,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) hipLaunchKernelGGL((atomic_pi),grid_size, block_size, 0, 0, pi, dx, iend ); hipErrchk( hipGetLastError() ); - getHipDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; + getHipDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); @@ -79,8 +79,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) grid_size, block_size, 0, 0, ibegin, iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); - getHipDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; + getHipDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); @@ -98,8 +98,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - getHipDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; + getHipDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index 7fd29bc28..c031dcf32 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -42,7 +42,7 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #pragma omp atomic *pi += dx / (1.0 + x * x); } - *pi *= 4.0; + m_pi_final = *pi * 4.0; } stopTimer(); @@ -66,7 +66,7 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type i = ibegin; i < iend; ++i ) { piatomic_base_lam(i); } - *pi *= 4.0; + m_pi_final = *pi * 4.0; } stopTimer(); @@ -85,7 +85,7 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - *pi *= 4.0; + m_pi_final = *pi * 4.0; } stopTimer(); diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 30b57144b..9d4f2649f 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -40,9 +40,9 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); + initOpenMPDeviceData(pi, 
&m_pi_init, 1); - #pragma omp target is_device_ptr(pi) device( did ) + #pragma omp target is_device_ptr(pi) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { double x = (double(i) + 0.5) * dx; @@ -50,8 +50,8 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG *pi += dx / (1.0 + x * x); } - getOpenMPDeviceData(m_pi, pi, 1, hid, did); - *m_pi *= 4.0; + getOpenMPDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); @@ -61,7 +61,7 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); + initOpenMPDeviceData(pi, &m_pi_init, 1); RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -69,8 +69,8 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - getOpenMPDeviceData(m_pi, pi, 1, hid, did); - *m_pi *= 4.0; + getOpenMPDeviceData(&m_pi_final, pi, 1); + m_pi_final *= 4.0; } stopTimer(); diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index da13269a4..7e233ffd9 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -38,7 +38,7 @@ void PI_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx double x = (double(i) + 0.5) * dx; *pi += dx / (1.0 + x * x); } - *pi *= 4.0; + m_pi_final = *pi * 4.0; } stopTimer(); @@ -61,7 +61,7 @@ void PI_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx for (Index_type i = ibegin; i < iend; ++i ) { piatomic_base_lam(i); } - *pi *= 4.0; + m_pi_final = *pi * 4.0; } stopTimer(); @@ -80,7 +80,7 @@ void PI_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - *pi *= 4.0; + m_pi_final = *pi * 4.0; } stopTimer(); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index e1f93dd2f..af33d01fc 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -66,11 +66,12 @@ void PI_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) m_dx = 1.0 / double(getActualProblemSize()); allocAndInitDataConst(m_pi, 1, 0.0, vid); m_pi_init = 0.0; + m_pi_final = -static_cast(vid); } void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += Checksum_type(*m_pi); + checksum[vid][tune_idx] += static_cast(m_pi_final); } void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 803d6202f..fe26d9beb 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -70,6 +70,7 @@ class PI_ATOMIC : public KernelBase Real_type m_dx; Real_ptr m_pi; Real_type m_pi_init; + Real_type m_pi_final; }; } // end namespace basic diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 413503d53..e45ea3873 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -190,11 +190,9 @@ inline void deallocCudaPinnedData(void* pptr) * and of propoer size for copy operation to succeed. 
*/ template -void initCudaDeviceData(T& dptr, const T hptr, int len) +void initCudaDeviceData(T* dptr, const T* hptr, int len) { - cudaErrchk( cudaMemcpy( dptr, hptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyHostToDevice ) ); + cudaErrchk( cudaMemcpy( dptr, hptr, len * sizeof(T), cudaMemcpyHostToDevice ) ); } /*! @@ -204,11 +202,9 @@ void initCudaDeviceData(T& dptr, const T hptr, int len) * and of propoer size for copy operation to succeed. */ template -void getCudaDeviceData(T& hptr, const T dptr, int len) +void getCudaDeviceData(T* hptr, const T* dptr, int len) { - cudaErrchk( cudaMemcpy( hptr, dptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyDeviceToHost ) ); + cudaErrchk( cudaMemcpy( hptr, dptr, len * sizeof(T), cudaMemcpyDeviceToHost ) ); } } // closing brace for rajaperf namespace diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 9bd9b5da0..df1e34191 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -240,11 +240,9 @@ inline void deallocHipPinnedData(void* pptr) * and of propoer size for copy operation to succeed. */ template -void initHipDeviceData(T& dptr, const T hptr, int len) +void initHipDeviceData(T* dptr, const T* hptr, int len) { - hipErrchk( hipMemcpy( dptr, hptr, - len * sizeof(typename std::remove_pointer::type), - hipMemcpyHostToDevice ) ); + hipErrchk( hipMemcpy( dptr, hptr, len * sizeof(T), hipMemcpyHostToDevice ) ); } /*! * \brief Copy given dptr (HIP device) data to host (hptr). @@ -253,11 +251,9 @@ void initHipDeviceData(T& dptr, const T hptr, int len) * and of propoer size for copy operation to succeed. */ template -void getHipDeviceData(T& hptr, const T dptr, int len) +void getHipDeviceData(T* hptr, const T* dptr, int len) { - hipErrchk( hipMemcpy( hptr, dptr, - len * sizeof(typename std::remove_pointer::type), - hipMemcpyDeviceToHost ) ); + hipErrchk( hipMemcpy( hptr, dptr, len * sizeof(T), hipMemcpyDeviceToHost ) ); } } // closing brace for rajaperf namespace diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index e27c695d2..3328732b8 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -66,11 +66,11 @@ inline void deallocOpenMPDeviceData(void* dptr, * and of propoer size for copy operation to succeed. */ template -void initOpenMPDeviceData(T& dptr, const T hptr, int len, +void initOpenMPDeviceData(T* dptr, const T* hptr, int len, int did = omp_get_default_device(), int hid = omp_get_initial_device()) { - copyOpenMPDeviceData(dptr, hptr, len, did, hid); + omp_target_memcpy( dptr, hptr, len * sizeof(T), 0, 0, did, hid); } /*! @@ -80,13 +80,11 @@ void initOpenMPDeviceData(T& dptr, const T hptr, int len, * and of propoer size for copy operation to succeed. 
*/ template -void getOpenMPDeviceData(T& hptr, const T dptr, int len, +void getOpenMPDeviceData(T* hptr, const T* dptr, int len, int hid = omp_get_initial_device(), int did = omp_get_default_device()) { - omp_target_memcpy( hptr, dptr, - len * sizeof(typename std::remove_pointer::type), - 0, 0, hid, did ); + omp_target_memcpy( hptr, dptr, len * sizeof(T), 0, 0, hid, did ); } } // closing brace for rajaperf namespace From cf2ca2a8f0470d6d3e99e39f3ffe1fe10021d062 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 10:14:16 -0700 Subject: [PATCH 098/137] Use scopedMoveData instead of SetupData constructs --- src/apps/DEL_DOT_VEC_2D.cpp | 15 +++++++++----- src/apps/HALOEXCHANGE.cpp | 9 ++++++--- src/apps/HALOEXCHANGE_FUSED.cpp | 9 ++++++--- src/apps/VOL3D.cpp | 22 ++++++++++++-------- src/basic/REDUCE_STRUCT.cpp | 22 ++++++++++++-------- src/common/DataUtils.hpp | 16 ++++----------- src/common/KernelBase.hpp | 36 ++------------------------------- src/lcals/FIRST_MIN.cpp | 12 ++++++++--- 8 files changed, 65 insertions(+), 76 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 3ad46d719..917c49abc 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -71,12 +71,17 @@ DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - auto finalize_x = allocAndInitSetupDataConst(m_x, m_array_length, 0.0, vid); - auto finalize_y = allocAndInitSetupDataConst(m_y, m_array_length, 0.0, vid); + allocAndInitDataConst(m_x, m_array_length, 0.0, vid); + allocAndInitDataConst(m_y, m_array_length, 0.0, vid); - Real_type dx = 0.2; - Real_type dy = 0.1; - setMeshPositions_2d(m_x, dx, m_y, dy, *m_domain); + { + auto reset_x = scopedMoveData(m_x, m_array_length, vid); + auto reset_y = scopedMoveData(m_y, m_array_length, vid); + + Real_type dx = 0.2; + Real_type dy = 0.1; + setMeshPositions_2d(m_x, dx, m_y, dy, *m_domain); + } allocAndInitData(m_xdot, m_array_length, vid); allocAndInitData(m_ydot, m_array_length, vid); diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 29a4cced3..58534da21 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -86,7 +86,8 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { - auto finalize_var = allocAndInitSetupData(m_vars[v], m_var_size, vid); + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); Real_ptr var = m_vars[v]; @@ -260,7 +261,8 @@ void HALOEXCHANGE::create_pack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - auto finalize_list = allocAndInitSetupData(pack_index_lists[l], pack_index_list_lengths[l], vid); + allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); Int_ptr pack_list = pack_index_lists[l]; @@ -405,7 +407,8 @@ void HALOEXCHANGE::create_unpack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - auto finalize_list = allocAndInitSetupData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); Int_ptr unpack_list = unpack_index_lists[l]; diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp 
b/src/apps/HALOEXCHANGE_FUSED.cpp index 5e26e40a2..74dd5b0d5 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -86,7 +86,8 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { - auto finalize_var = allocAndInitSetupData(m_vars[v], m_var_size, vid); + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); Real_ptr var = m_vars[v]; @@ -260,7 +261,8 @@ void HALOEXCHANGE_FUSED::create_pack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - auto finalize_list = allocAndInitSetupData(pack_index_lists[l], pack_index_list_lengths[l], vid); + allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); Int_ptr pack_list = pack_index_lists[l]; @@ -405,7 +407,8 @@ void HALOEXCHANGE_FUSED::create_unpack_lists( (extent.j_max - extent.j_min) * (extent.k_max - extent.k_min) ; - auto finalize_list = allocAndInitSetupData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); Int_ptr unpack_list = unpack_index_lists[l]; diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index a0ce4ae52..11051adc5 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -73,14 +73,20 @@ VOL3D::~VOL3D() void VOL3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - auto finalize_x = allocAndInitSetupDataConst(m_x, m_array_length, 0.0, vid); - auto finalize_y = allocAndInitSetupDataConst(m_y, m_array_length, 0.0, vid); - auto finalize_z = allocAndInitSetupDataConst(m_z, m_array_length, 0.0, vid); - - Real_type dx = 0.3; - Real_type dy = 0.2; - Real_type dz = 0.1; - setMeshPositions_3d(m_x, dx, m_y, dy, m_z, dz, *m_domain); + allocAndInitDataConst(m_x, m_array_length, 0.0, vid); + allocAndInitDataConst(m_y, m_array_length, 0.0, vid); + allocAndInitDataConst(m_z, m_array_length, 0.0, vid); + + { + auto reset_x = scopedMoveData(m_x, m_array_length, vid); + auto reset_y = scopedMoveData(m_y, m_array_length, vid); + auto reset_z = scopedMoveData(m_z, m_array_length, vid); + + Real_type dx = 0.3; + Real_type dy = 0.2; + Real_type dz = 0.1; + setMeshPositions_3d(m_x, dx, m_y, dy, m_z, dz, *m_domain); + } allocAndInitDataConst(m_vol, m_array_length, 0.0, vid); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index d17220e7e..f18319eb2 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -67,14 +67,20 @@ void REDUCE_STRUCT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) m_init_sum = 0.0; m_init_min = std::numeric_limits::max(); m_init_max = std::numeric_limits::lowest(); - auto finalize_x = allocAndInitSetupData(m_x, getActualProblemSize(), vid); - auto finalize_y = allocAndInitSetupData(m_y, getActualProblemSize(), vid); - Real_type dx = Lx/(Real_type)(getActualProblemSize()); - Real_type dy = Ly/(Real_type)(getActualProblemSize()); - for (int i=0;i - rajaperf::AutoDataMover allocSetupData(T*& ptr, int len, VariantID vid) + rajaperf::AutoDataMover scopedMoveData(T*& ptr, int len, VariantID vid) { - rajaperf::allocData(getHostAccessibleDataSpace(vid), - ptr, len, getDataAlignment()); - return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, 
getDataAlignment()}; - } - - template - rajaperf::AutoDataMover allocAndInitSetupData(T*& ptr, int len, VariantID vid) - { - rajaperf::allocAndInitData(getHostAccessibleDataSpace(vid), - ptr, len, getDataAlignment()); - return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; - } - - template - rajaperf::AutoDataMover allocAndInitSetupDataConst(T*& ptr, int len, T val, VariantID vid) - { - rajaperf::allocAndInitDataConst(getHostAccessibleDataSpace(vid), - ptr, len, getDataAlignment(), val); - return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; - } - - template - rajaperf::AutoDataMover allocAndInitSetupDataRandSign(T*& ptr, int len, VariantID vid) - { - rajaperf::allocAndInitDataRandSign(getHostAccessibleDataSpace(vid), - ptr, len, getDataAlignment()); - return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; - } - - template - rajaperf::AutoDataMover allocAndInitSetupDataRandValue(T*& ptr, int len, VariantID vid) - { - rajaperf::allocAndInitDataRandValue(getHostAccessibleDataSpace(vid), + rajaperf::moveData(getHostAccessibleDataSpace(vid), getDataSpace(vid), ptr, len, getDataAlignment()); return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; } diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index f87535ed4..875932958 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -67,9 +67,15 @@ FIRST_MIN::~FIRST_MIN() void FIRST_MIN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - auto finalize_x = allocAndInitSetupDataConst(m_x, m_N, 0.0, vid); - m_x[ m_N / 2 ] = -1.0e+10; - m_xmin_init = m_x[0]; + allocAndInitDataConst(m_x, m_N, 0.0, vid); + + { + auto reset_x = scopedMoveData(m_x, m_N, vid); + + m_x[ m_N / 2 ] = -1.0e+10; + m_xmin_init = m_x[0]; + } + m_initloc = 0; m_minloc = -1; } From 5706027302c71d04d14831069fd35b56451e468e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 10:14:35 -0700 Subject: [PATCH 099/137] Use scopedMoveData in INT_PREDICT::updateChecksum --- src/lcals/INT_PREDICT.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 24e357f50..eb56b5725 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -77,8 +77,12 @@ void INT_PREDICT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void INT_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) { - for (Index_type i = 0; i < getActualProblemSize(); ++i) { - m_px[i] -= m_px_initval; + { + auto reset_px = scopedMoveData(m_px, m_array_length, vid); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_px[i] -= m_px_initval; + } } checksum[vid][tune_idx] += calcChecksum(m_px, getActualProblemSize(), vid); From db83827ab062a72bbca8f97d33ca90e7d8a5609b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 11:21:08 -0700 Subject: [PATCH 100/137] Move real_zones out of ADomain So that it can be allocated in the proper place. 
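The effect is visible in the DEL_DOT_VEC_2D changes below: the zone index list
becomes an ordinary kernel member that follows the DataSpace-aware allocation
path, and the new setRealZones_2d/_3d routines fill it while it is host
accessible. A minimal sketch of the resulting setup pattern, condensed from the
diff that follows rather than copied verbatim:

    allocAndInitDataConst(m_real_zones, m_domain->n_real_zones,
                          static_cast<Index_type>(-1), vid);
    {
      auto reset_rz = scopedMoveData(m_real_zones, m_domain->n_real_zones, vid);
      setRealZones_2d(m_real_zones, *m_domain);  // host-side index setup
    } // m_real_zones moves back to the variant's DataSpace here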
--- src/apps/AppsData.cpp | 66 ++++++++++++++++++++ src/apps/AppsData.hpp | 50 ++++----------- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 20 +----- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 20 +----- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 10 +-- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 15 +---- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 10 +-- src/apps/DEL_DOT_VEC_2D.cpp | 4 ++ src/apps/DEL_DOT_VEC_2D.hpp | 8 ++- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 9 +-- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 9 +-- src/apps/NODAL_ACCUMULATION_3D-OMP.cpp | 7 +-- src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp | 9 +-- src/apps/NODAL_ACCUMULATION_3D-Seq.cpp | 7 +-- src/apps/NODAL_ACCUMULATION_3D.cpp | 8 +++ src/apps/NODAL_ACCUMULATION_3D.hpp | 5 +- src/common/DataUtils.cpp | 12 ++++ src/common/DataUtils.hpp | 16 +++-- 18 files changed, 145 insertions(+), 140 deletions(-) diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index 9bb06b261..29a553b20 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -16,6 +16,72 @@ namespace rajaperf namespace apps { +// +// Set zone indices for 2d mesh. +// +void setRealZones_2d(Index_type* real_zones, + const ADomain& domain) +{ + if (domain.ndims != 2) { + getCout() << "\n******* ERROR!!! domain is not 2d *******" << std::endl; + return; + } + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + + Index_type jp = domain.jp; + + Index_type j_stride = (imax - imin); + + for (Index_type j = jmin; j < jmax; j++) { + for (Index_type i = imin; i < imax; i++) { + Index_type ip = i + j*jp ; + + Index_type id = i + j*j_stride ; + real_zones[id] = ip; + } + } +} + +// +// Set zone indices for 3d mesh. +// +void setRealZones_3d(Index_type* real_zones, + const ADomain& domain) +{ + if (domain.ndims != 3) { + getCout() << "\n******* ERROR!!! domain is not 3d *******" << std::endl; + return; + } + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + Index_type kmin = domain.kmin; + Index_type kmax = domain.kmax; + + Index_type jp = domain.jp; + Index_type kp = domain.kp; + + Index_type j_stride = (imax - imin); + Index_type k_stride = j_stride * (jmax - jmin); + + for (Index_type k = kmin; k < kmax; k++) { + for (Index_type j = jmin; j < jmax; j++) { + for (Index_type i = imin; i < imax; i++) { + Index_type ip = i + j*jp + kp*k ; + + Index_type id = i + j*j_stride + k_stride*k ; + real_zones[id] = ip; + } + } + } +} + // // Set mesh positions for 2d mesh. 
// diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index afbfe98e5..31d47727c 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -55,17 +55,20 @@ class ADomain imax = rzmax + NPNR; jmax = rzmax + NPNR; jp = imax - imin + 1 + NPNL + NPNR; + n_real_zones = (imax - imin); if ( ndims == 2 ) { kmin = 0; kmax = 0; kp = 0; nnalls = jp * (jmax - jmin + 1 + NPNL + NPNR) ; + n_real_zones *= (jmax - jmin); } else if ( ndims == 3 ) { kmin = NPNL; kmax = rzmax + NPNR; kp = jp * (jmax - jmin + 1 + NPNL + NPNR); nnalls = kp * (kmax - kmin + 1 + NPNL + NPNR) ; + n_real_zones *= (jmax - jmin) * (kmax - kmin); } fpn = 0; @@ -75,45 +78,10 @@ class ADomain fpz = frn - jp - kp - 1; lpz = lrn; - - real_zones = new Index_type[nnalls]; - for (Index_type i = 0; i < nnalls; ++i) real_zones[i] = -1; - - n_real_zones = 0; - - if ( ndims == 2 ) { - - for (Index_type j = jmin; j < jmax; j++) { - for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp ; - - Index_type id = n_real_zones; - real_zones[id] = ip; - n_real_zones++; - } - } - - } else if ( ndims == 3 ) { - - for (Index_type k = kmin; k < kmax; k++) { - for (Index_type j = jmin; j < jmax; j++) { - for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp + kp*k ; - - Index_type id = n_real_zones; - real_zones[id] = ip; - n_real_zones++; - } - } - } - - } - } - ~ADomain() + ~ADomain() { - if (real_zones) delete [] real_zones; } Index_type ndims; @@ -139,10 +107,18 @@ class ADomain Index_type fpz; Index_type lpz; - Index_type* real_zones; Index_type n_real_zones; }; +// +// Routines for initializing real zone indices for 2d/3d domains. +// +void setRealZones_2d(Index_type* real_zones, + const ADomain& domain); + +void setRealZones_3d(Index_type* real_zones, + const ADomain& domain); + // // Routines for initializing mesh positions for 2d/3d domains. 
// diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 5fc3e5a23..c560e1638 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -58,11 +58,6 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -83,11 +78,6 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) } else if ( vid == Lambda_CUDA ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -107,15 +97,9 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) } else if ( vid == RAJA_CUDA ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Cuda::get_default()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 1cf0a3aab..a61bc4fc0 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -58,11 +58,6 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -83,11 +78,6 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) } else if ( vid == Lambda_HIP ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -109,15 +99,9 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) } else if ( vid == RAJA_HIP ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Hip::get_default()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index fd2977595..1fc9b5775 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -32,11 +32,6 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( DEL_DOT_VEC_2D_DATA_SETUP; - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - switch ( vid ) { case Base_OpenMP : { @@ 
-80,9 +75,8 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( case RAJA_OpenMP : { camp::resources::Resource working_res{camp::resources::Host::get_default()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); auto deldotvec2d_lam = [=](Index_type i) { DEL_DOT_VEC_2D_BODY; diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index 617e0076b..8dfa12e6c 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -41,11 +41,6 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE if ( vid == Base_OpenMPTarget ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -63,15 +58,9 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSE } else if ( vid == RAJA_OpenMPTarget ) { - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Omp::get_default()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index 5a3080635..af4ae95b4 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -30,11 +30,6 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun DEL_DOT_VEC_2D_DATA_SETUP; - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - switch ( vid ) { case Base_Seq : { @@ -77,9 +72,8 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun case RAJA_Seq : { camp::resources::Resource working_res{camp::resources::Host::get_default()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); auto deldotvec2d_lam = [=](Index_type i) { DEL_DOT_VEC_2D_BODY; diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 917c49abc..c28fd4b55 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -73,14 +73,18 @@ void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); + allocAndInitDataConst(m_real_zones, m_domain->n_real_zones, + static_cast(-1), vid); { auto reset_x = scopedMoveData(m_x, m_array_length, vid); auto reset_y = scopedMoveData(m_y, m_array_length, vid); + auto reset_rz = scopedMoveData(m_real_zones, m_domain->n_real_zones, vid); Real_type dx = 0.2; Real_type dy = 0.1; setMeshPositions_2d(m_x, dx, m_y, dy, *m_domain); + setRealZones_2d(m_real_zones, *m_domain); } allocAndInitData(m_xdot, m_array_length, vid); diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp 
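//
// Illustrative sketch only (not part of the patch): the setUp code above follows
// an RAII pattern -- allocAndInitDataConst places m_real_zones in the data space
// of the variant being run, and scopedMoveData appears to move it to a
// host-accessible space for the duration of the enclosing block so that
// setRealZones_2d can fill it on the host, restoring it when the block ends.
// Template arguments are spelled out here as an assumption.
//
//   allocAndInitDataConst(m_real_zones, m_domain->n_real_zones,
//                         static_cast<Index_type>(-1), vid);
//   {
//     auto reset_rz = scopedMoveData(m_real_zones, m_domain->n_real_zones, vid);
//     setRealZones_2d(m_real_zones, *m_domain);   // runs on the host
//   } // reset_rz leaves scope; the data returns to the variant's space
//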
index 0e22bb399..d82efc12f 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -55,7 +55,12 @@ Real_ptr fx1,fx2,fx3,fx4 ; \ Real_ptr fy1,fy2,fy3,fy4 ; \ \ - Index_ptr real_zones = m_domain->real_zones; + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; \ + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; \ + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; \ + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; \ +\ + Index_ptr real_zones = m_real_zones; #define DEL_DOT_VEC_2D_BODY_INDEX \ Index_type i = real_zones[ii]; @@ -135,6 +140,7 @@ class DEL_DOT_VEC_2D : public KernelBase Real_type m_half; ADomain* m_domain; + Index_type* m_real_zones; Index_type m_array_length; }; diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index 027f911be..27b16b301 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -53,8 +53,6 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -71,12 +69,9 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) } else if ( vid == RAJA_CUDA ) { - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - camp::resources::Resource working_res{camp::resources::Cuda()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index 59e2b5b96..d44c3cd9d 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -53,8 +53,6 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -71,12 +69,9 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) } else if ( vid == RAJA_HIP ) { - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - camp::resources::Resource working_res{camp::resources::Hip()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp index 57fe4b861..0a5ba031b 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -30,8 +30,6 @@ void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUS NODAL_ACCUMULATION_3D_DATA_SETUP; - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - switch ( vid ) { @@ -112,9 +110,8 @@ void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUS case RAJA_OpenMP : { camp::resources::Resource working_res{camp::resources::Host()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); auto nodal_accumulation_3d_lam = [=](Index_type i) { NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::omp_atomic); diff --git 
a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp index c8840cdae..cea05a423 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -39,8 +39,6 @@ void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPER if ( vid == Base_OpenMPTarget ) { - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -75,12 +73,9 @@ void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPER } else if ( vid == RAJA_OpenMPTarget ) { - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - camp::resources::Resource working_res{camp::resources::Omp()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp index 9dc2e6b13..e3ff3cdc2 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -28,8 +28,6 @@ void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ NODAL_ACCUMULATION_3D_DATA_SETUP; - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - switch ( vid ) { case Base_Seq : { @@ -72,9 +70,8 @@ void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ case RAJA_Seq : { camp::resources::Resource working_res{camp::resources::Host()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); auto nodal_accumulation_3d_lam = [=](Index_type i) { NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::seq_atomic); diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 389848ebe..51ef8cac0 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -78,6 +78,14 @@ void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune { allocAndInitDataConst(m_x, m_nodal_array_length, 0.0, vid); allocAndInitDataConst(m_vol, m_zonal_array_length, 1.0, vid); + allocAndInitDataConst(m_real_zones, m_domain->n_real_zones, + static_cast(-1), vid); + + { + auto reset_rz = scopedMoveData(m_real_zones, m_domain->n_real_zones, vid); + + setRealZones_3d(m_real_zones, *m_domain); + } } void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tune_idx) diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index 5b0ce0d77..a8d194387 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -37,7 +37,9 @@ \ Real_ptr x0,x1,x2,x3,x4,x5,x6,x7; \ \ - Index_ptr real_zones = m_domain->real_zones; + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; \ + \ + Index_ptr real_zones = m_real_zones; #define NODAL_ACCUMULATION_3D_BODY_INDEX \ Index_type i = real_zones[ii]; @@ -111,6 +113,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase Real_ptr m_vol; ADomain* m_domain; + Index_type* m_real_zones; Index_type m_nodal_array_length; Index_type m_zonal_array_length; }; diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 317555060..8fe585df7 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -125,6 +125,18 @@ void 
initDataConst(Real_ptr& ptr, int len, Real_type val) incDataInitCount(); } +/* + * Initialize Index_type data array to constant values. + */ +void initDataConst(Index_type*& ptr, int len, Index_type val) +{ + for (int i = 0; i < len; ++i) { + ptr[i] = val; + }; + + incDataInitCount(); +} + /* * Initialize Real_type data array with random sign. */ diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index a5dffdcfc..b2a8b3291 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -102,6 +102,13 @@ void initData(Real_ptr& ptr, int len); */ void initDataConst(Real_ptr& ptr, int len, Real_type val); +/*! + * \brief Initialize Index_type data array. + * + * Array entries are set to given constant value. + */ +void initDataConst(Index_type*& ptr, int len, Index_type val); + /*! * \brief Initialize Real_type data array with random sign. * @@ -579,12 +586,11 @@ inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align) /*! * \brief Allocate and initialize aligned Real_type data array. * - * Array entries are initialized using the method - * initDataConst(Real_ptr& ptr...) below. + * Array entries are initialized using the method initDataConst. */ template inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int align, - Real_type val) + T val) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -598,7 +604,7 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int ali /*! * \brief Allocate and initialize aligned Real_type data array with random sign. * - * Array is initialized using method initDataRandSign(Real_ptr& ptr...) below. + * Array is initialized using method initDataRandSign. */ template inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int align) @@ -616,7 +622,7 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int * \brief Allocate and initialize aligned Real_type data array with random * values. * - * Array is initialized using method initDataRandValue(Real_ptr& ptr...) below. + * Array is initialized using method initDataRandValue. 
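//
// Note (illustrative, not part of the patch): making the constant-value
// parameter a template type lets the same helper fill integer index arrays as
// well as Real arrays, via the initDataConst(Index_type*&, ...) overload added
// above. Hypothetical usage, with `len` and `align` standing in for real sizes:
//
//   Index_type* zones = nullptr;
//   allocAndInitDataConst(DataSpace::Host, zones, len, align, Index_type(-1));
//
//   Real_ptr x = nullptr;
//   allocAndInitDataConst(DataSpace::Host, x, len, align, Real_type(0.0));
//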
*/ template inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int align) From ed677cf42aa8fc692d3c364946d043fd3e07375f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 11:29:43 -0700 Subject: [PATCH 101/137] Add n_real_nodes to simplify some calculations --- src/apps/AppsData.hpp | 4 ++++ src/apps/DEL_DOT_VEC_2D.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index 31d47727c..a4b566c6b 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -56,6 +56,7 @@ class ADomain jmax = rzmax + NPNR; jp = imax - imin + 1 + NPNL + NPNR; n_real_zones = (imax - imin); + n_real_nodes = (imax+1 - imin); if ( ndims == 2 ) { kmin = 0; @@ -63,12 +64,14 @@ class ADomain kp = 0; nnalls = jp * (jmax - jmin + 1 + NPNL + NPNR) ; n_real_zones *= (jmax - jmin); + n_real_nodes *= (jmax+1 - jmin); } else if ( ndims == 3 ) { kmin = NPNL; kmax = rzmax + NPNR; kp = jp * (jmax - jmin + 1 + NPNL + NPNR); nnalls = kp * (kmax - kmin + 1 + NPNL + NPNR) ; n_real_zones *= (jmax - jmin) * (kmax - kmin); + n_real_nodes *= (jmax+1 - jmin) * (kmax+1 - kmin); } fpn = 0; @@ -108,6 +111,7 @@ class ADomain Index_type lpz; Index_type n_real_zones; + Index_type n_real_nodes; }; // diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index c28fd4b55..7baba4a84 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -39,7 +39,7 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setKernelsPerRep(1); setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + (1*sizeof(Real_type) + 0*sizeof(Real_type) ) * getItsPerRep() + - (0*sizeof(Real_type) + 4*sizeof(Real_type) ) * (m_domain->imax+1-m_domain->imin)*(m_domain->jmax+1-m_domain->jmin) ) ; // touched data size, not actual number of stores and loads + (0*sizeof(Real_type) + 4*sizeof(Real_type) ) * m_domain->n_real_nodes ) ; // touched data size, not actual number of stores and loads setFLOPsPerRep(54 * m_domain->n_real_zones); setUsesFeature(Forall); diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 51ef8cac0..76fc37e59 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -41,7 +41,7 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) // touched data size, not actual number of stores and loads setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * (m_domain->imax+1 - m_domain->imin)*(m_domain->jmax+1 - m_domain->jmin)*(m_domain->kmax+1 - m_domain->kmin)); + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_domain->n_real_nodes); setFLOPsPerRep(9 * getItsPerRep()); checksum_scale_factor = 0.001 * From 6050034c653e042323dcc3a331a0c91ecc6f974e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 12:16:36 -0700 Subject: [PATCH 102/137] fixup setRealZones calculations --- src/apps/AppsData.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index 29a553b20..bade73b59 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -40,7 +40,7 @@ void setRealZones_2d(Index_type* real_zones, for (Index_type i = imin; i < imax; i++) { Index_type ip = i + j*jp ; - Index_type id = i + j*j_stride ; + Index_type id = (i-imin) + (j-jmin)*j_stride ; 
real_zones[id] = ip; } } @@ -73,9 +73,9 @@ void setRealZones_3d(Index_type* real_zones, for (Index_type k = kmin; k < kmax; k++) { for (Index_type j = jmin; j < jmax; j++) { for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp + kp*k ; + Index_type ip = i + j*jp + k*kp ; - Index_type id = i + j*j_stride + k_stride*k ; + Index_type id = (i-imin) + (j-jmin)*j_stride + (k-kmin)*k_stride ; real_zones[id] = ip; } } @@ -160,7 +160,7 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, for (Index_type k = kmin - npnl; k < kmax + npnr; k++) { for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp + kp*k ; + Index_type iz = i + j*jp + k*kp ; x0[iz] = x2[iz] = x4[iz] = x6[iz] = i*dx; x1[iz] = x3[iz] = x5[iz] = x7[iz] = (i+1)*dx; From 04b395b7e72930e9bdd9c0bb8388f6385a819898 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 12:16:51 -0700 Subject: [PATCH 103/137] fixup AutoDataMover::operator= --- src/common/DataUtils.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index b2a8b3291..adbf86481 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -537,6 +537,7 @@ struct AutoDataMover { } AutoDataMover& operator=(AutoDataMover&& rhs) { + finalize(); m_ptr = std::exchange(rhs.m_ptr, nullptr); m_new_dataSpace = rhs.m_new_dataSpace; m_old_dataSpace = rhs.m_old_dataSpace; From c3f85ddfd702bfd5bca056324de7c126065ca4cd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 12:53:33 -0700 Subject: [PATCH 104/137] Change how data is reset in HEAT_3D, JACOBI_1D, JACOBI_2D --- src/common/KernelBase.hpp | 6 ++++++ src/polybench/POLYBENCH_HEAT_3D-OMP.cpp | 6 ------ src/polybench/POLYBENCH_HEAT_3D-Seq.cpp | 6 ------ src/polybench/POLYBENCH_HEAT_3D.cpp | 4 ++-- src/polybench/POLYBENCH_HEAT_3D.hpp | 14 ++++++-------- src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp | 6 ------ src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 6 ------ src/polybench/POLYBENCH_JACOBI_1D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_1D.hpp | 14 ++++++-------- src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp | 6 ------ src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp | 6 ------ src/polybench/POLYBENCH_JACOBI_2D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_2D.hpp | 15 ++++++--------- 13 files changed, 30 insertions(+), 67 deletions(-) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 3b81e5a9b..2f6b0f620 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -312,6 +312,12 @@ class KernelBase return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; } + template + void copyData(T* dst_ptr, const T* src_ptr, int len, VariantID vid) + { + rajaperf::copyData(getDataSpace(vid), dst_ptr, getDataSpace(vid), src_ptr, len); + } + template void deallocData(T*& ptr, VariantID vid) { diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index 1586866c5..c72dab1ad 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -59,8 +59,6 @@ void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - POLYBENCH_HEAT_3D_DATA_RESET; - break; } @@ -103,8 +101,6 @@ void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - POLYBENCH_HEAT_3D_DATA_RESET; - break; } @@ -148,8 +144,6 @@ void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t 
RAJAPERF_UNUSED_A } stopTimer(); - POLYBENCH_HEAT_3D_DATA_RESET; - break; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index 7222e5934..791010f55 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -55,8 +55,6 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } stopTimer(); - POLYBENCH_HEAT_3D_DATA_RESET; - break; } @@ -98,8 +96,6 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } stopTimer(); - POLYBENCH_HEAT_3D_DATA_RESET; - break; } @@ -148,8 +144,6 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } stopTimer(); - POLYBENCH_HEAT_3D_DATA_RESET; - break; } #endif // RUN_RAJA_SEQ diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index e3f0371e6..4f14b54f9 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -81,8 +81,8 @@ void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx (void) vid; allocAndInitData(m_Ainit, m_N*m_N*m_N, vid); allocAndInitData(m_Binit, m_N*m_N*m_N, vid); - allocAndInitDataConst(m_A, m_N*m_N*m_N, 0.0, vid); - allocAndInitDataConst(m_B, m_N*m_N*m_N, 0.0, vid); + allocData(m_A, m_N*m_N*m_N, vid); + allocData(m_B, m_N*m_N*m_N, vid); } void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tune_idx) diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 8d7eff93c..de5bd081d 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -41,17 +41,15 @@ #define RAJAPerf_POLYBENCH_HEAT_3D_HPP #define POLYBENCH_HEAT_3D_DATA_SETUP \ - Real_ptr A = m_Ainit; \ - Real_ptr B = m_Binit; \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + \ + copyData(A, m_Ainit, m_N*m_N*m_N, vid); \ + copyData(B, m_Binit, m_N*m_N*m_N, vid); \ + \ const Index_type N = m_N; \ const Index_type tsteps = m_tsteps; -#define POLYBENCH_HEAT_3D_DATA_RESET \ - m_Ainit = m_A; \ - m_Binit = m_B; \ - m_A = A; \ - m_B = B; - #define POLYBENCH_HEAT_3D_BODY1 \ B[k + N*(j + N*i)] = \ diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index e002b02fb..42ae4a0d5 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -57,8 +57,6 @@ void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_JACOBI_1D_DATA_RESET; - break; } @@ -83,8 +81,6 @@ void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_JACOBI_1D_DATA_RESET; - break; } @@ -108,8 +104,6 @@ void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_JACOBI_1D_DATA_RESET; - break; } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index a7e81fe87..895507afc 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -53,8 +53,6 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - POLYBENCH_JACOBI_1D_DATA_RESET; - break; } @@ -79,8 +77,6 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - POLYBENCH_JACOBI_1D_DATA_RESET; - break; } @@ -104,8 +100,6 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - 
POLYBENCH_JACOBI_1D_DATA_RESET; - break; } #endif // RUN_RAJA_SEQ diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 9648571f1..b2beb0dfd 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -78,8 +78,8 @@ void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i (void) vid; allocAndInitData(m_Ainit, m_N, vid); allocAndInitData(m_Binit, m_N, vid); - allocAndInitDataConst(m_A, m_N, 0.0, vid); - allocAndInitDataConst(m_B, m_N, 0.0, vid); + allocData(m_A, m_N, vid); + allocData(m_B, m_N, vid); } void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tune_idx) diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 035096f89..aa774aba8 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -24,17 +24,15 @@ #define RAJAPerf_POLYBENCH_JACOBI_1D_HPP #define POLYBENCH_JACOBI_1D_DATA_SETUP \ - Real_ptr A = m_Ainit; \ - Real_ptr B = m_Binit; \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + \ + copyData(A, m_Ainit, m_N, vid); \ + copyData(B, m_Binit, m_N, vid); \ + \ const Index_type N = m_N; \ const Index_type tsteps = m_tsteps; -#define POLYBENCH_JACOBI_1D_DATA_RESET \ - m_Ainit = m_A; \ - m_Binit = m_B; \ - m_A = A; \ - m_B = B; - #define POLYBENCH_JACOBI_1D_BODY1 \ B[i] = 0.33333 * (A[i-1] + A[i] + A[i + 1]); diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 7561f7ed8..8ea60b32a 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -55,8 +55,6 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_JACOBI_2D_DATA_RESET; - break; } @@ -93,8 +91,6 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_JACOBI_2D_DATA_RESET; - break; } @@ -140,8 +136,6 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - POLYBENCH_JACOBI_2D_DATA_RESET; - break; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index 25f875d4b..bd7f752c6 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -50,8 +50,6 @@ void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - POLYBENCH_JACOBI_2D_DATA_RESET; - break; } @@ -88,8 +86,6 @@ void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - POLYBENCH_JACOBI_2D_DATA_RESET; - break; } @@ -135,8 +131,6 @@ void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } stopTimer(); - POLYBENCH_JACOBI_2D_DATA_RESET; - break; } #endif // RUN_RAJA_SEQ diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 1ea1620ea..9fe51e5c1 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -80,8 +80,8 @@ void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i (void) vid; allocAndInitData(m_Ainit, m_N*m_N, vid); allocAndInitData(m_Binit, m_N*m_N, vid); - allocAndInitDataConst(m_A, m_N*m_N, 0.0, vid); - allocAndInitDataConst(m_B, m_N*m_N, 0.0, vid); + allocData(m_A, m_N*m_N, vid); + allocData(m_B, m_N*m_N, vid); } void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tune_idx) diff --git 
a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 49ab2cd40..37e70407b 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -28,18 +28,15 @@ #define RAJAPerf_POLYBENCH_JACOBI_2D_HPP #define POLYBENCH_JACOBI_2D_DATA_SETUP \ - Real_ptr A = m_Ainit; \ - Real_ptr B = m_Binit; \ -\ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + \ + copyData(A, m_Ainit, m_N*m_N, vid); \ + copyData(B, m_Binit, m_N*m_N, vid); \ + \ const Index_type N = m_N; \ const Index_type tsteps = m_tsteps; -#define POLYBENCH_JACOBI_2D_DATA_RESET \ - m_Ainit = m_A; \ - m_Binit = m_B; \ - m_A = A; \ - m_B = B; - #define POLYBENCH_JACOBI_2D_BODY1 \ B[j + i*N] = 0.2 * (A[j + i*N] + A[j-1 + i*N] + A[j+1 + i*N] + A[j + (i+1)*N] + A[j + (i-1)*N]); From a872135211fdeddedee72d97cf888b2873d2711d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:09:43 -0700 Subject: [PATCH 105/137] fixup some omp target stuff --- src/apps/FIR-OMPTarget.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMPTarget.cpp | 2 -- src/basic/PI_REDUCE-OMPTarget.cpp | 2 -- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index d5eeaddee..43c0b2d60 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace apps #define FIR_DATA_TEARDOWN_OMP_TARGET \ - deallocData(DataSpace::OmpTarget, coeff, did); + deallocData(DataSpace::OmpTarget, coeff); void FIR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp index e30c7a712..86ecbc3ea 100644 --- a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -26,8 +26,6 @@ namespace basic const size_t threads_per_team = 256; #define INDEXLIST_3LOOP_DATA_SETUP_OMP_TARGET \ - int did = omp_get_default_device(); \ - \ Index_type* counts = nullptr; \ allocData(DataSpace::OmpTarget, counts, iend+1); diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 5dc133661..f4c20a665 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -37,8 +37,6 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG if ( vid == Base_OpenMPTarget ) { - int did = omp_get_default_device(); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From 6089166583fa9f6b05c5917800b2889ee8cc3e1b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:18:59 -0700 Subject: [PATCH 106/137] fixup omp include --- src/common/OpenMPTargetDataUtils.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 3328732b8..86f2243b6 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -19,6 +19,8 @@ #if defined(RAJA_ENABLE_TARGET_OPENMP) +#include + namespace rajaperf { From f8e44974a9644ffa2a487d47330bcba0b1701703 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:19:07 -0700 Subject: [PATCH 107/137] fixup DIFF_PREDICT --- src/lcals/DIFF_PREDICT-OMPTarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index 79322aff8..e04b1e07d 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -24,7 +24,7 @@ namespace lcals // // Define 
threads per team for target execution // - const size_t threads_per_team = 256 + const size_t threads_per_team = 256; void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) From 93a9907a129e4174cc6acfa6c581b3255d335e99 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:19:31 -0700 Subject: [PATCH 108/137] work around omp_target_memcpy signature issue --- src/common/OpenMPTargetDataUtils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 86f2243b6..592f543fa 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -35,7 +35,7 @@ namespace detail inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, int dst_did, int src_did) { - omp_target_memcpy( dst_ptr, src_ptr, len, + omp_target_memcpy( dst_ptr, const_cast(src_ptr), len, 0, 0, dst_did, src_did ); } @@ -72,7 +72,7 @@ void initOpenMPDeviceData(T* dptr, const T* hptr, int len, int did = omp_get_default_device(), int hid = omp_get_initial_device()) { - omp_target_memcpy( dptr, hptr, len * sizeof(T), 0, 0, did, hid); + omp_target_memcpy( dptr, const_cast(hptr), len * sizeof(T), 0, 0, did, hid); } /*! @@ -86,7 +86,7 @@ void getOpenMPDeviceData(T* hptr, const T* dptr, int len, int hid = omp_get_initial_device(), int did = omp_get_default_device()) { - omp_target_memcpy( hptr, dptr, len * sizeof(T), 0, 0, hid, did ); + omp_target_memcpy( hptr, const_cast(dptr), len * sizeof(T), 0, 0, hid, did ); } } // closing brace for rajaperf namespace From d3dd1ec5a6d43055efee000a1ffb728b16ec6a57 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:28:01 -0700 Subject: [PATCH 109/137] Add wrappers for getting OpenMPTarget devices --- src/common/DataUtils.hpp | 8 ++++---- src/common/OpenMPTargetDataUtils.hpp | 29 +++++++++++++++++++++------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index adbf86481..8c223ffa7 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -463,10 +463,10 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, #if defined(RAJA_ENABLE_TARGET_OPENMP) else if (isOpenMPTargetDataSpace(dst_dataSpace) || isOpenMPTargetDataSpace(src_dataSpace)) { - auto dst_did = isOpenMPTargetDataSpace(dst_dataSpace) ? omp_get_default_device() - : omp_get_initial_device(); - auto src_did = isOpenMPTargetDataSpace(src_dataSpace) ? omp_get_default_device() - : omp_get_initial_device(); + auto dst_did = isOpenMPTargetDataSpace(dst_dataSpace) ? getOpenMPTargetDevice() + : getOpenMPTargetHost(); + auto src_did = isOpenMPTargetDataSpace(src_dataSpace) ? getOpenMPTargetDevice() + : getOpenMPTargetHost(); detail::copyOpenMPTargetData(dst_ptr, src_ptr, nbytes, dst_did, src_did); } diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 592f543fa..b5c98cb97 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -25,9 +25,24 @@ namespace rajaperf { -namespace detail +/*! + * \brief Get OpenMPTarget compute device id. + */ +inline int getOpenMPTargetDevice() { + return omp_get_default_device(); +} +/*! + * \brief Get OpenMPTarget host device id. + */ +inline int getOpenMPTargetHost() +{ + return omp_get_initial_device(); +} + +namespace detail +{ /* * Copy memory len bytes from src to dst. 
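//
// Note (illustrative, not part of the patch): the const_cast added to
// copyOpenMPTargetData and the init/get helpers appears to work around OpenMP
// runtime headers that declare omp_target_memcpy's source parameter as a
// non-const void*. A minimal sketch of the call, assuming the template
// argument is the plain pointer type:
//
//   omp_target_memcpy(dst_ptr, const_cast<void*>(src_ptr), len,
//                     0, 0, dst_did, src_did);
//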
@@ -44,7 +59,7 @@ inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, * data to device array. */ inline void* allocOpenMPDeviceData(size_t len, - int did = omp_get_default_device()) + int did = getOpenMPTargetDevice()) { return omp_target_alloc( len, did); } @@ -53,7 +68,7 @@ inline void* allocOpenMPDeviceData(size_t len, * \brief Free device data array. */ inline void deallocOpenMPDeviceData(void* dptr, - int did = omp_get_default_device()) + int did = getOpenMPTargetDevice()) { omp_target_free( dptr, did ); } @@ -69,8 +84,8 @@ inline void deallocOpenMPDeviceData(void* dptr, */ template void initOpenMPDeviceData(T* dptr, const T* hptr, int len, - int did = omp_get_default_device(), - int hid = omp_get_initial_device()) + int did = getOpenMPTargetDevice(), + int hid = getOpenMPTargetHost()) { omp_target_memcpy( dptr, const_cast(hptr), len * sizeof(T), 0, 0, did, hid); } @@ -83,8 +98,8 @@ void initOpenMPDeviceData(T* dptr, const T* hptr, int len, */ template void getOpenMPDeviceData(T* hptr, const T* dptr, int len, - int hid = omp_get_initial_device(), - int did = omp_get_default_device()) + int hid = getOpenMPTargetHost(), + int did = getOpenMPTargetDevice()) { omp_target_memcpy( hptr, const_cast(dptr), len * sizeof(T), 0, 0, hid, did ); } From 58849d5bb69e44ac65075fb2bae6de37234be121 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:28:20 -0700 Subject: [PATCH 110/137] Get OpenMPTarget did in KernelBase --- src/common/KernelBase.cpp | 7 +++++-- src/common/KernelBase.hpp | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index c620c4880..c398f6d0a 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -15,8 +15,11 @@ namespace rajaperf { -KernelBase::KernelBase(KernelID kid, const RunParams& params) : - run_params(params) +KernelBase::KernelBase(KernelID kid, const RunParams& params) + : run_params(params) +#if defined(RAJA_ENABLE_TARGET_OPENMP) + , did(getOpenMPTargetDevice()) +#endif { kernel_id = kid; name = getFullKernelName(kernel_id); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 2f6b0f620..78874462d 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -409,6 +409,10 @@ class KernelBase std::vector checksum[NumVariants]; Checksum_type checksum_scale_factor; +#if defined(RAJA_ENABLE_TARGET_OPENMP) + int did; +#endif + private: KernelBase() = delete; From 467c36fd4939c6e7ea704bc822e7b346572eb929 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:33:17 -0700 Subject: [PATCH 111/137] fixup REDUCE_STRUCT::runOpenMPTargetVariant --- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index 58a4a8ee5..cfbcba44a 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -34,6 +34,8 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + REDUCE_STRUCT_DATA_SETUP; + if ( vid == Base_OpenMPTarget ) { Real_ptr xa = points.x; From 6336a2e2016ea4189c2ded6ce6357b3575584a26 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:40:31 -0700 Subject: [PATCH 112/137] Fixup FIR_DATA_SETUP macros --- src/apps/FIR-Hip.cpp | 2 +- src/apps/FIR-OMPTarget.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/FIR-Hip.cpp 
b/src/apps/FIR-Hip.cpp index e6d23cb13..3d85da83a 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -53,7 +53,7 @@ __global__ void fir(Real_ptr out, Real_ptr in, Real_ptr coeff; \ \ Real_ptr tcoeff = &coeff_array[0]; \ - allocAndInitData(DataSpace::HipDevice, coeff, FIR_COEFFLEN); \ + allocData(DataSpace::HipDevice, coeff, FIR_COEFFLEN); \ copyData(DataSpace::HipDevice, coeff, DataSpace::Host, tcoeff, FIR_COEFFLEN); diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 43c0b2d60..5715f884a 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -31,7 +31,7 @@ namespace apps Real_ptr coeff; \ \ Real_ptr tcoeff = &coeff_array[0]; \ - allocAndInitData(DataSpace::OmpTarget, coeff, FIR_COEFFLEN); \ + allocData(DataSpace::OmpTarget, coeff, FIR_COEFFLEN); \ copyData(DataSpace::OmpTarget, coeff, DataSpace::Host, tcoeff, FIR_COEFFLEN); From a0a82d598ed1b6a51c36b1a506af1240d556dfc5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 23 Mar 2023 13:51:05 -0700 Subject: [PATCH 113/137] Use DataSpace specific copyData --- src/common/KernelBase.hpp | 15 +++++++++------ src/polybench/POLYBENCH_HEAT_3D.hpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_1D.hpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_2D.hpp | 4 ++-- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 78874462d..9249e1f65 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -263,6 +263,15 @@ class KernelBase rajaperf::allocData(dataSpace, ptr, len, getDataAlignment()); } + + template + void copyData(DataSpace dst_dataSpace, T* dst_ptr, + DataSpace src_dataSpace, const T* src_ptr, + int len) + { + rajaperf::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, len); + } + template void deallocData(DataSpace dataSpace, T& ptr) { @@ -312,12 +321,6 @@ class KernelBase return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; } - template - void copyData(T* dst_ptr, const T* src_ptr, int len, VariantID vid) - { - rajaperf::copyData(getDataSpace(vid), dst_ptr, getDataSpace(vid), src_ptr, len); - } - template void deallocData(T*& ptr, VariantID vid) { diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index de5bd081d..03150a267 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -44,8 +44,8 @@ Real_ptr A = m_A; \ Real_ptr B = m_B; \ \ - copyData(A, m_Ainit, m_N*m_N*m_N, vid); \ - copyData(B, m_Binit, m_N*m_N*m_N, vid); \ + copyData(getDataSpace(vid), A, getDataSpace(vid), m_Ainit, m_N*m_N*m_N); \ + copyData(getDataSpace(vid), B, getDataSpace(vid), m_Binit, m_N*m_N*m_N); \ \ const Index_type N = m_N; \ const Index_type tsteps = m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index aa774aba8..5c84e0682 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -27,8 +27,8 @@ Real_ptr A = m_A; \ Real_ptr B = m_B; \ \ - copyData(A, m_Ainit, m_N, vid); \ - copyData(B, m_Binit, m_N, vid); \ + copyData(getDataSpace(vid), A, getDataSpace(vid), m_Ainit, m_N); \ + copyData(getDataSpace(vid), B, getDataSpace(vid), m_Binit, m_N); \ \ const Index_type N = m_N; \ const Index_type tsteps = m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 37e70407b..fe77836cb 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -31,8 
+31,8 @@ Real_ptr A = m_A; \ Real_ptr B = m_B; \ \ - copyData(A, m_Ainit, m_N*m_N, vid); \ - copyData(B, m_Binit, m_N*m_N, vid); \ + copyData(getDataSpace(vid), A, getDataSpace(vid), m_Ainit, m_N*m_N); \ + copyData(getDataSpace(vid), B, getDataSpace(vid), m_Binit, m_N*m_N); \ \ const Index_type N = m_N; \ const Index_type tsteps = m_tsteps; From a6ce45e4cfd59c4c3db093a4ac2ef197c7bd4cef Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 09:38:38 -0700 Subject: [PATCH 114/137] Move some memory utils into source files --- src/common/DataUtils.cpp | 322 ++++++++++++++++++++++++++++++++++++ src/common/DataUtils.hpp | 339 ++++---------------------------------- src/common/KernelBase.cpp | 46 ++++++ src/common/KernelBase.hpp | 51 +----- 4 files changed, 402 insertions(+), 356 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 8fe585df7..22686956b 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -23,6 +23,83 @@ namespace rajaperf namespace detail { +/*! + * \brief Get if the data space is a host DataSpace. + */ +bool isHostDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::Host: + return true; + default: + return false; + } +} + +/*! + * \brief Get if the data space is a omp DataSpace. + */ +bool isOpenMPDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::Omp: + return true; + default: + return false; + } +} + +/*! + * \brief Get if the data space is a omp target DataSpace. + */ +bool isOpenMPTargetDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::OmpTarget: + return true; + default: + return false; + } +} + +/*! + * \brief Get if the data space is a cuda DataSpace. + */ +bool isCudaDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::CudaPinned: + case DataSpace::CudaManaged: + case DataSpace::CudaDevice: + return true; + default: + return false; + } +} + +/*! + * \brief Get if the data space is a hip DataSpace. + */ +bool isHipDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::HipHostAdviseFine: + case DataSpace::HipHostAdviseCoarse: + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseCoarse: + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + return true; + default: + return false; + } +} + + static int data_init_count = 0; /* @@ -71,6 +148,211 @@ void deallocHostData(void* ptr) } +/* + * Allocate data arrays of given dataSpace. 
+ */ +void* allocData(DataSpace dataSpace, int nbytes, int align) +{ + void* ptr = nullptr; + + switch (dataSpace) { + case DataSpace::Host: + { + ptr = detail::allocHostData(nbytes, align); + } break; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + case DataSpace::Omp: + { + ptr = detail::allocHostData(nbytes, align); + } break; +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case DataSpace::OmpTarget: + { + ptr = detail::allocOpenMPDeviceData(nbytes); + } break; +#endif + +#if defined(RAJA_ENABLE_CUDA) + case DataSpace::CudaPinned: + { + ptr = detail::allocCudaPinnedData(nbytes); + } break; + case DataSpace::CudaManaged: + { + ptr = detail::allocCudaManagedData(nbytes); + } break; + case DataSpace::CudaDevice: + { + ptr = detail::allocCudaDeviceData(nbytes); + } break; +#endif + +#if defined(RAJA_ENABLE_HIP) + case DataSpace::HipHostAdviseFine: + { + ptr = detail::allocHostData(nbytes, align); + detail::adviseHipFineData(ptr, nbytes); + } break; + case DataSpace::HipHostAdviseCoarse: + { + ptr = detail::allocHostData(nbytes, align); + detail::adviseHipCoarseData(ptr, nbytes); + } break; + case DataSpace::HipPinned: + { + ptr = detail::allocHipPinnedData(nbytes); + } break; + case DataSpace::HipPinnedFine: + { + ptr = detail::allocHipPinnedFineData(nbytes); + } break; + case DataSpace::HipPinnedCoarse: + { + ptr = detail::allocHipPinnedCoarseData(nbytes); + } break; + case DataSpace::HipManaged: + { + ptr = detail::allocHipManagedData(nbytes); + } break; + case DataSpace::HipManagedAdviseFine: + { + ptr = detail::allocHipManagedData(nbytes); + detail::adviseHipFineData(ptr, nbytes); + } break; + case DataSpace::HipManagedAdviseCoarse: + { + ptr = detail::allocHipManagedData(nbytes); + detail::adviseHipCoarseData(ptr, nbytes); + } break; + case DataSpace::HipDevice: + { + ptr = detail::allocHipDeviceData(nbytes); + } break; + case DataSpace::HipDeviceFine: + { + ptr = detail::allocHipDeviceFineData(nbytes); + } break; +#endif + + default: + { + throw std::invalid_argument("allocData : Unknown data space"); + } break; + } + + return ptr; +} + +/*! + * \brief Copy data from one dataSpace to another. + */ +void copyData(DataSpace dst_dataSpace, void* dst_ptr, + DataSpace src_dataSpace, const void* src_ptr, + size_t nbytes) +{ + if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && + hostAccessibleDataSpace(src_dataSpace) == src_dataSpace) { + detail::copyHostData(dst_ptr, src_ptr, nbytes); + } + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + else if (isOpenMPTargetDataSpace(dst_dataSpace) || + isOpenMPTargetDataSpace(src_dataSpace)) { + auto dst_did = isOpenMPTargetDataSpace(dst_dataSpace) ? getOpenMPTargetDevice() + : getOpenMPTargetHost(); + auto src_did = isOpenMPTargetDataSpace(src_dataSpace) ? getOpenMPTargetDevice() + : getOpenMPTargetHost(); + detail::copyOpenMPTargetData(dst_ptr, src_ptr, nbytes, + dst_did, src_did); + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + else if (isCudaDataSpace(dst_dataSpace) || + isCudaDataSpace(src_dataSpace)) { + detail::copyCudaData(dst_ptr, src_ptr, nbytes); + } +#endif + +#if defined(RAJA_ENABLE_HIP) + else if (isHipDataSpace(dst_dataSpace) || + isHipDataSpace(src_dataSpace)) { + detail::copyHipData(dst_ptr, src_ptr, nbytes); + } +#endif + + else { + throw std::invalid_argument("copyData : Unknown data space"); + } +} + +/*! + * \brief Deallocate data array (ptr). 
+ */ +void deallocData(DataSpace dataSpace, void* ptr) +{ + switch (dataSpace) { + case DataSpace::Host: + case DataSpace::Omp: + case DataSpace::HipHostAdviseFine: + case DataSpace::HipHostAdviseCoarse: + { + detail::deallocHostData(ptr); + } break; + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case DataSpace::OmpTarget: + { + detail::deallocOpenMPDeviceData(ptr); + } break; +#endif + +#if defined(RAJA_ENABLE_CUDA) + case DataSpace::CudaPinned: + { + detail::deallocCudaPinnedData(ptr); + } break; + case DataSpace::CudaManaged: + { + detail::deallocCudaManagedData(ptr); + } break; + case DataSpace::CudaDevice: + { + detail::deallocCudaDeviceData(ptr); + } break; +#endif + +#if defined(RAJA_ENABLE_HIP) + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: + { + detail::deallocHipPinnedData(ptr); + } break; + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseCoarse: + { + detail::deallocHipManagedData(ptr); + } break; + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + { + detail::deallocHipDeviceData(ptr); + } break; +#endif + + default: + { + throw std::invalid_argument("deallocData : Unknown data space"); + } break; + } +} + + /* * \brief Initialize Int_type data array to * randomly signed positive and negative values. @@ -266,4 +548,44 @@ long double calcChecksum(Complex_ptr ptr, int len, } // closing brace for detail namespace + +/*! + * \brief Get an host accessible data space for this dataSpace. + */ +DataSpace hostAccessibleDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::Host: + case DataSpace::Omp: + case DataSpace::CudaPinned: + case DataSpace::HipHostAdviseFine: + case DataSpace::HipHostAdviseCoarse: + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: + return dataSpace; + + case DataSpace::OmpTarget: + return DataSpace::Host; + + case DataSpace::CudaManaged: + case DataSpace::CudaDevice: + return DataSpace::CudaPinned; + + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseCoarse: + return dataSpace; + + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + return DataSpace::HipPinned; + + default: + { + throw std::invalid_argument("hostAccessibleDataSpace : Unknown data space"); + } break; + } +} + } // closing brace for rajaperf namespace diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 8c223ffa7..b6295d771 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -60,22 +60,23 @@ void* allocHostData(size_t len, size_t align); void deallocHostData(void* ptr); -/* - * \brief Touch data array with omp threads. +/*! + * \brief Allocate data array in dataSpace. */ -template < typename T > -void touchOmpData(T* ptr, int len) -{ -// First touch... -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - ptr[i] = T{}; - }; -#else - (void)ptr; (void)len; -#endif -} +void* allocData(DataSpace dataSpace, int nbytes, int align); + +/*! + * \brief Copy data from one dataSpace to another. + */ +void copyData(DataSpace dst_dataSpace, void* dst_ptr, + DataSpace src_dataSpace, const void* src_ptr, + size_t nbytes); + +/*! + * \brief Free data arrays in dataSpace. + */ +void deallocData(DataSpace dataSpace, void* ptr); + /*! * \brief Initialize Int_type data array. 
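//
// Illustrative sketch only (not part of the patch): hostAccessibleDataSpace maps
// a possibly device-only data space to one the host can touch, which the
// allocAndInit* helpers use as a staging space when the target space cannot be
// written directly from the host. Per the switch above:
//
//   hostAccessibleDataSpace(DataSpace::CudaDevice);  // -> DataSpace::CudaPinned
//   hostAccessibleDataSpace(DataSpace::HipDevice);   // -> DataSpace::HipPinned
//   hostAccessibleDataSpace(DataSpace::OmpTarget);   // -> DataSpace::Host
//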
@@ -167,281 +168,37 @@ long double calcChecksum(Complex_ptr d, int len, * Intended to be a space that is quick to copy to from the given space if * the given space is not accessible on the Host. */ -inline DataSpace hostAccessibleDataSpace(DataSpace dataSpace) -{ - switch (dataSpace) { - case DataSpace::Host: - case DataSpace::Omp: - case DataSpace::CudaPinned: - case DataSpace::HipHostAdviseFine: - case DataSpace::HipHostAdviseCoarse: - case DataSpace::HipPinned: - case DataSpace::HipPinnedFine: - case DataSpace::HipPinnedCoarse: - return dataSpace; - - case DataSpace::OmpTarget: - return DataSpace::Host; - - case DataSpace::CudaManaged: - case DataSpace::CudaDevice: - return DataSpace::CudaPinned; - - case DataSpace::HipManaged: - case DataSpace::HipManagedAdviseFine: - case DataSpace::HipManagedAdviseCoarse: - return dataSpace; - - case DataSpace::HipDevice: - case DataSpace::HipDeviceFine: - return DataSpace::HipPinned; - - default: - { - throw std::invalid_argument("hostAccessibleDataSpace : Unknown data space"); - } break; - } -} - -/*! - * \brief Get if the data space is a host DataSpace. - */ -inline bool isHostDataSpace(DataSpace dataSpace) -{ - switch (dataSpace) { - case DataSpace::Host: - return true; - default: - return false; - } -} - -/*! - * \brief Get if the data space is a omp DataSpace. - */ -inline bool isOpenMPDataSpace(DataSpace dataSpace) -{ - switch (dataSpace) { - case DataSpace::Omp: - return true; - default: - return false; - } -} - -/*! - * \brief Get if the data space is a omp target DataSpace. - */ -inline bool isOpenMPTargetDataSpace(DataSpace dataSpace) -{ - switch (dataSpace) { - case DataSpace::OmpTarget: - return true; - default: - return false; - } -} - -/*! - * \brief Get if the data space is a cuda DataSpace. - */ -inline bool isCudaDataSpace(DataSpace dataSpace) -{ - switch (dataSpace) { - case DataSpace::CudaPinned: - case DataSpace::CudaManaged: - case DataSpace::CudaDevice: - return true; - default: - return false; - } -} - -/*! - * \brief Get if the data space is a hip DataSpace. - */ -inline bool isHipDataSpace(DataSpace dataSpace) -{ - switch (dataSpace) { - case DataSpace::HipHostAdviseFine: - case DataSpace::HipHostAdviseCoarse: - case DataSpace::HipPinned: - case DataSpace::HipPinnedFine: - case DataSpace::HipPinnedCoarse: - case DataSpace::HipManaged: - case DataSpace::HipManagedAdviseFine: - case DataSpace::HipManagedAdviseCoarse: - case DataSpace::HipDevice: - case DataSpace::HipDeviceFine: - return true; - default: - return false; - } -} - +DataSpace hostAccessibleDataSpace(DataSpace dataSpace); /*! * \brief Allocate data array (ptr). 
*/ template -inline void allocData(DataSpace dataSpace, T& ptr_ref, int len, int align) +inline void allocData(DataSpace dataSpace, T*& ptr_ref, int len, int align) { - void* ptr = nullptr; - size_t nbytes = len*sizeof(std::remove_pointer_t); - - switch (dataSpace) { - case DataSpace::Host: - { - ptr = detail::allocHostData(nbytes, align); - } break; + size_t nbytes = len*sizeof(T); + T* ptr = static_cast(detail::allocData(dataSpace, nbytes, align)); #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - case DataSpace::Omp: - { - ptr = detail::allocHostData(nbytes, align); - detail::touchOmpData(static_cast(ptr), len); - } break; -#endif - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - case DataSpace::OmpTarget: - { - ptr = detail::allocOpenMPDeviceData(nbytes); - } break; -#endif - -#if defined(RAJA_ENABLE_CUDA) - case DataSpace::CudaPinned: - { - ptr = detail::allocCudaPinnedData(nbytes); - } break; - case DataSpace::CudaManaged: - { - ptr = detail::allocCudaManagedData(nbytes); - } break; - case DataSpace::CudaDevice: - { - ptr = detail::allocCudaDeviceData(nbytes); - } break; -#endif - -#if defined(RAJA_ENABLE_HIP) - case DataSpace::HipHostAdviseFine: - { - ptr = detail::allocHostData(nbytes, align); - detail::adviseHipFineData(ptr, nbytes); - } break; - case DataSpace::HipHostAdviseCoarse: - { - ptr = detail::allocHostData(nbytes, align); - detail::adviseHipCoarseData(ptr, nbytes); - } break; - case DataSpace::HipPinned: - { - ptr = detail::allocHipPinnedData(nbytes); - } break; - case DataSpace::HipPinnedFine: - { - ptr = detail::allocHipPinnedFineData(nbytes); - } break; - case DataSpace::HipPinnedCoarse: - { - ptr = detail::allocHipPinnedCoarseData(nbytes); - } break; - case DataSpace::HipManaged: - { - ptr = detail::allocHipManagedData(nbytes); - } break; - case DataSpace::HipManagedAdviseFine: - { - ptr = detail::allocHipManagedData(nbytes); - detail::adviseHipFineData(ptr, nbytes); - } break; - case DataSpace::HipManagedAdviseCoarse: - { - ptr = detail::allocHipManagedData(nbytes); - detail::adviseHipCoarseData(ptr, nbytes); - } break; - case DataSpace::HipDevice: - { - ptr = detail::allocHipDeviceData(nbytes); - } break; - case DataSpace::HipDeviceFine: - { - ptr = detail::allocHipDeviceFineData(nbytes); - } break; + if (dataSpace == DataSpace::Omp) { + // perform first touch on Omp Data + #pragma omp parallel for + for (int i = 0; i < len; ++i) { + ptr[i] = T{}; + }; + } #endif - default: - { - throw std::invalid_argument("allocData : Unknown data space"); - } break; - } - ptr_ref = static_cast(ptr); + ptr_ref = ptr; } /*! * \brief Deallocate data array (ptr). 
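//
// Note (illustrative, not part of the patch): the DataSpace::Omp branch above
// relies on first-touch page placement -- initializing the array from an OpenMP
// parallel loop places each page near the thread that first writes it, which
// matters on NUMA systems. A kernel then simply requests, e.g.:
//
//   Real_ptr a = nullptr;
//   allocData(DataSpace::Omp, a, len, align);   // pages distributed across threads
//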
*/ template -inline void deallocData(DataSpace dataSpace, T& ptr) +inline void deallocData(DataSpace dataSpace, T*& ptr) { - switch (dataSpace) { - case DataSpace::Host: - case DataSpace::Omp: - case DataSpace::HipHostAdviseFine: - case DataSpace::HipHostAdviseCoarse: - { - detail::deallocHostData(ptr); - } break; - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - case DataSpace::OmpTarget: - { - detail::deallocOpenMPDeviceData(ptr); - } break; -#endif - -#if defined(RAJA_ENABLE_CUDA) - case DataSpace::CudaPinned: - { - detail::deallocCudaPinnedData(ptr); - } break; - case DataSpace::CudaManaged: - { - detail::deallocCudaManagedData(ptr); - } break; - case DataSpace::CudaDevice: - { - detail::deallocCudaDeviceData(ptr); - } break; -#endif - -#if defined(RAJA_ENABLE_HIP) - case DataSpace::HipPinned: - case DataSpace::HipPinnedFine: - case DataSpace::HipPinnedCoarse: - { - detail::deallocHipPinnedData(ptr); - } break; - case DataSpace::HipManaged: - case DataSpace::HipManagedAdviseFine: - case DataSpace::HipManagedAdviseCoarse: - { - detail::deallocHipManagedData(ptr); - } break; - case DataSpace::HipDevice: - case DataSpace::HipDeviceFine: - { - detail::deallocHipDeviceData(ptr); - } break; -#endif - - default: - { - throw std::invalid_argument("deallocData : Unknown data space"); - } break; - } + detail::deallocData(dataSpace, ptr); ptr = nullptr; } @@ -454,41 +211,7 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, int len) { size_t nbytes = len*sizeof(T); - - if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && - hostAccessibleDataSpace(src_dataSpace) == src_dataSpace) { - detail::copyHostData(dst_ptr, src_ptr, nbytes); - } - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - else if (isOpenMPTargetDataSpace(dst_dataSpace) || - isOpenMPTargetDataSpace(src_dataSpace)) { - auto dst_did = isOpenMPTargetDataSpace(dst_dataSpace) ? getOpenMPTargetDevice() - : getOpenMPTargetHost(); - auto src_did = isOpenMPTargetDataSpace(src_dataSpace) ? getOpenMPTargetDevice() - : getOpenMPTargetHost(); - detail::copyOpenMPTargetData(dst_ptr, src_ptr, nbytes, - dst_did, src_did); - } -#endif - -#if defined(RAJA_ENABLE_CUDA) - else if (isCudaDataSpace(dst_dataSpace) || - isCudaDataSpace(src_dataSpace)) { - detail::copyCudaData(dst_ptr, src_ptr, nbytes); - } -#endif - -#if defined(RAJA_ENABLE_HIP) - else if (isHipDataSpace(dst_dataSpace) || - isHipDataSpace(src_dataSpace)) { - detail::copyHipData(dst_ptr, src_ptr, nbytes); - } -#endif - - else { - throw std::invalid_argument("copyData : Unknown data space"); - } + detail::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, nbytes); } /*! 
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index c398f6d0a..bd4a39b15 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -156,6 +156,52 @@ void KernelBase::setVariantDefined(VariantID vid) tot_time[vid].resize(variant_tuning_names[vid].size(), 0.0); } +int KernelBase::getDataAlignment() const +{ + return run_params.getDataAlignment(); +} + +DataSpace KernelBase::getDataSpace(VariantID vid) const +{ + switch ( vid ) { + + case Base_Seq : + case Lambda_Seq : + case RAJA_Seq : + return run_params.getSeqDataSpace(); + + case Base_OpenMP : + case Lambda_OpenMP : + case RAJA_OpenMP : + return run_params.getOmpDataSpace(); + + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + return run_params.getOmpTargetDataSpace(); + + case Base_CUDA : + case Lambda_CUDA : + case RAJA_CUDA : + return run_params.getCudaDataSpace(); + + case Base_HIP : + case Lambda_HIP : + case RAJA_HIP : + return run_params.getHipDataSpace(); + + case Kokkos_Lambda : + return run_params.getKokkosDataSpace(); + + default: + throw std::invalid_argument("getDataSpace : Unknown variant id"); + } +} + +DataSpace KernelBase::getHostAccessibleDataSpace(VariantID vid) const +{ + return hostAccessibleDataSpace(getDataSpace(vid)); +} + void KernelBase::execute(VariantID vid, size_t tune_idx) { running_variant = vid; diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 9249e1f65..dac9b4b6a 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -207,55 +207,10 @@ class KernelBase #endif } - int getDataAlignment() const { return run_params.getDataAlignment(); } + int getDataAlignment() const; - DataSpace getSeqDataSpace() const { return run_params.getSeqDataSpace(); } - DataSpace getOmpDataSpace() const { return run_params.getOmpDataSpace(); } - DataSpace getOmpTargetDataSpace() const { return run_params.getOmpTargetDataSpace(); } - DataSpace getCudaDataSpace() const { return run_params.getCudaDataSpace(); } - DataSpace getHipDataSpace() const { return run_params.getHipDataSpace(); } - DataSpace getKokkosDataSpace() const { return run_params.getKokkosDataSpace(); } - - DataSpace getDataSpace(VariantID vid) const - { - switch ( vid ) { - - case Base_Seq : - case Lambda_Seq : - case RAJA_Seq : - return getSeqDataSpace(); - - case Base_OpenMP : - case Lambda_OpenMP : - case RAJA_OpenMP : - return getOmpDataSpace(); - - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : - return getOmpTargetDataSpace(); - - case Base_CUDA : - case Lambda_CUDA : - case RAJA_CUDA : - return getCudaDataSpace(); - - case Base_HIP : - case Lambda_HIP : - case RAJA_HIP : - return getHipDataSpace(); - - case Kokkos_Lambda : - return getKokkosDataSpace(); - - default: - throw std::invalid_argument("getDataSpace : Unknown variant id"); - } - } - - DataSpace getHostAccessibleDataSpace(VariantID vid) const - { - return hostAccessibleDataSpace(getDataSpace(vid)); - } + DataSpace getDataSpace(VariantID vid) const; + DataSpace getHostAccessibleDataSpace(VariantID vid) const; template void allocData(DataSpace dataSpace, T& ptr, int len) From 93789cb88f789ef287cb5dd186a4c9c09f35761f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 09:49:20 -0700 Subject: [PATCH 115/137] Move some backend dependencies to src files --- src/common/DataUtils.cpp | 1 + src/common/DataUtils.hpp | 3 --- src/common/KernelBase.cpp | 1 + src/common/KernelBase.hpp | 2 -- 4 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 
22686956b..4d7b40f99 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -9,6 +9,7 @@ #include "DataUtils.hpp" #include "CudaDataUtils.hpp" #include "HipDataUtils.hpp" +#include "OpenMPTargetDataUtils.hpp" #include "RAJA/internal/MemUtils_CPU.hpp" diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index b6295d771..1b233e574 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -16,9 +16,6 @@ #include "RAJAPerfSuite.hpp" #include "RPTypes.hpp" -#include "common/OpenMPTargetDataUtils.hpp" -#include "common/CudaDataUtils.hpp" -#include "common/HipDataUtils.hpp" #include #include diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index bd4a39b15..e6cb5d20d 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -9,6 +9,7 @@ #include "KernelBase.hpp" #include "RunParams.hpp" +#include "OpenMPTargetDataUtils.hpp" #include #include diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index dac9b4b6a..e48c96c62 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -14,8 +14,6 @@ #include "common/DataUtils.hpp" #include "common/RunParams.hpp" #include "common/GPUUtils.hpp" -#include "common/CudaDataUtils.hpp" -#include "common/HipDataUtils.hpp" #include "RAJA/util/Timer.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) From 330d30aebba6b02371d072be1c1cf71637a068da Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 24 Mar 2023 13:13:07 -0700 Subject: [PATCH 116/137] Make unit test env variable name consistent with other configuration variable names --- test/test-raja-perf-suite.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 8b4552cde..26ebcbda5 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -40,14 +40,14 @@ TEST(ShortSuiteTest, Basic) sargv.emplace_back(std::string("FIRST_MIN")); #endif - char *unit_test = getenv("RAJA_PERF_UNIT_TEST"); + char *unit_test = getenv("RAJA_PERFSUITE_UNIT_TEST"); if (unit_test != NULL) { sargv.emplace_back(std::string("-k")); sargv.emplace_back(std::string(unit_test)); } char** argv = new char* [sargv.size()]; - for (int is = 0; is < sargv.size(); ++is) { + for (size_t is = 0; is < sargv.size(); ++is) { argv[is] = const_cast(sargv[is].c_str()); } From 489bc32999ac66b95b43fcaa062e462561776862 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 13:26:12 -0700 Subject: [PATCH 117/137] Add include for exceptions --- src/common/DataUtils.cpp | 1 + src/common/KernelBase.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 4d7b40f99..44cdd19d0 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -16,6 +16,7 @@ #include #include +#include #include namespace rajaperf diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index e6cb5d20d..cbc2083dd 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace rajaperf { From 97fb9784ef908247ddb12c435866354ec8e78ace Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 14:12:45 -0700 Subject: [PATCH 118/137] Fix parsing of data space args --- src/common/RunParams.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 7ee741e0d..d85873425 100644 --- a/src/common/RunParams.cpp +++ 
b/src/common/RunParams.cpp @@ -485,6 +485,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) bool got_someting = false; i++; if ( i < argc ) { + auto opt_name = std::move(opt); opt = std::string(argv[i]); if ( opt.at(0) == '-' ) { i--; @@ -492,23 +493,23 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) for (int ds = 0; ds < static_cast(DataSpace::NumSpaces); ++ds) { if (getDataSpaceName(static_cast(ds)) == opt) { got_someting = true; - if ( opt == std::string("--seq-data-space") || - opt == std::string("-sds") ) { + if ( opt_name == std::string("--seq-data-space") || + opt_name == std::string("-sds") ) { seqDataSpace = static_cast(ds); - } else if ( opt == std::string("--omp-data-space") || - opt == std::string("-ods") ) { + } else if ( opt_name == std::string("--omp-data-space") || + opt_name == std::string("-ods") ) { ompDataSpace = static_cast(ds); - } else if ( opt == std::string("--omptarget-data-space") || - opt == std::string("-otds") ) { + } else if ( opt_name == std::string("--omptarget-data-space") || + opt_name == std::string("-otds") ) { ompTargetDataSpace = static_cast(ds); - } else if ( opt == std::string("--cuda-data-space") || - opt == std::string("-cds") ) { + } else if ( opt_name == std::string("--cuda-data-space") || + opt_name == std::string("-cds") ) { cudaDataSpace = static_cast(ds); - } else if ( opt == std::string("--hip-data-space") || - opt == std::string("-hds") ) { + } else if ( opt_name == std::string("--hip-data-space") || + opt_name == std::string("-hds") ) { hipDataSpace = static_cast(ds); - } else if ( opt == std::string("--kokkos-data-space") || - opt == std::string("-kds") ) { + } else if ( opt_name == std::string("--kokkos-data-space") || + opt_name == std::string("-kds") ) { kokkosDataSpace = static_cast(ds); } else { got_someting = false; @@ -519,7 +520,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } if (!got_someting) { getCout() << "\nBad input:" - << " must give " << opt << " a valid data space" + << " must give " << opt_name << " a valid data space" << std::endl; input_state = BadInput; } From ff4b91d073230cd38aebd73edfb6a0b81383c8b5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 14:13:03 -0700 Subject: [PATCH 119/137] Print data spaces used --- src/common/Executor.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index a41e3fd00..fcdb7ee93 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -715,6 +715,16 @@ void Executor::reportRunSummary(ostream& str) const str << "\nThe following kernels and variants (when available for a kernel) will be run:" << endl; + str << "\nData Spaces" + << "\n--------"; + str << "\nSeq - " << getDataSpaceName(run_params.getSeqDataSpace()); + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpDataSpace()); + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetDataSpace()); + str << "\nCuda - " << getDataSpaceName(run_params.getCudaDataSpace()); + str << "\nHip - " << getDataSpaceName(run_params.getHipDataSpace()); + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosDataSpace()); + str << endl; + str << "\nVariants and Tunings" << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { From c7672928174e2ccd1bc30be25fe2e4519510e505 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 14:25:36 -0700 Subject: [PATCH 120/137] improve ifdeffing around Data Spaces --- src/common/DataUtils.cpp | 10 
++++++++++ src/common/HipDataUtils.hpp | 2 ++ 2 files changed, 12 insertions(+) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 44cdd19d0..f1831cc1f 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -198,11 +198,13 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) ptr = detail::allocHostData(nbytes, align); detail::adviseHipFineData(ptr, nbytes); } break; +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) case DataSpace::HipHostAdviseCoarse: { ptr = detail::allocHostData(nbytes, align); detail::adviseHipCoarseData(ptr, nbytes); } break; +#endif case DataSpace::HipPinned: { ptr = detail::allocHipPinnedData(nbytes); @@ -224,11 +226,13 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) ptr = detail::allocHipManagedData(nbytes); detail::adviseHipFineData(ptr, nbytes); } break; +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) case DataSpace::HipManagedAdviseCoarse: { ptr = detail::allocHipManagedData(nbytes); detail::adviseHipCoarseData(ptr, nbytes); } break; +#endif case DataSpace::HipDevice: { ptr = detail::allocHipDeviceData(nbytes); @@ -299,8 +303,12 @@ void deallocData(DataSpace dataSpace, void* ptr) switch (dataSpace) { case DataSpace::Host: case DataSpace::Omp: +#if defined(RAJA_ENABLE_HIP) case DataSpace::HipHostAdviseFine: +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) case DataSpace::HipHostAdviseCoarse: +#endif +#endif { detail::deallocHostData(ptr); } break; @@ -336,7 +344,9 @@ void deallocData(DataSpace dataSpace, void* ptr) } break; case DataSpace::HipManaged: case DataSpace::HipManagedAdviseFine: +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) case DataSpace::HipManagedAdviseCoarse: +#endif { detail::deallocHipManagedData(ptr); } break; diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index df1e34191..17f66d318 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -195,10 +195,12 @@ inline void adviseHipData(void* ptr, int len, hipMemoryAdvise advice, int device hipErrchk( hipMemAdvise( ptr, len, advice, device ) ); } +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) inline void adviseHipCoarseData(void* ptr, size_t len) { adviseHipData(ptr, len, hipMemAdviseSetCoarseGrain, getHipDevice()); } +#endif inline void adviseHipFineData(void* ptr, size_t len) { From 16cf3144e08077c0a8676e4909535a9549833180 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 14:37:31 -0700 Subject: [PATCH 121/137] Add checks for available data spaces based on the config and reject the use of an unavailable data space --- src/common/DataUtils.cpp | 49 ++++++++++++++++++++++++++++++++++++++++ src/common/DataUtils.hpp | 5 ++++ src/common/RunParams.cpp | 39 ++++++++++++++++++++++++-------- 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index f1831cc1f..efd5cdb7b 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -561,6 +561,55 @@ long double calcChecksum(Complex_ptr ptr, int len, } // closing brace for detail namespace +/*! + * \brief Get if data space is available in this config. 
+ */ +bool isAvailableDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::Host: + return true; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + case DataSpace::Omp: + return true; +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case DataSpace::OmpTarget: + return true; +#endif + +#if defined(RAJA_ENABLE_CUDA) + case DataSpace::CudaPinned: + case DataSpace::CudaManaged: + case DataSpace::CudaDevice: + return true; +#endif + +#if defined(RAJA_ENABLE_HIP) + case DataSpace::HipHostAdviseFine: +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) + case DataSpace::HipHostAdviseCoarse: +#endif + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) + case DataSpace::HipManagedAdviseCoarse: +#endif + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + return true; +#endif + + default: + return false; + } +} + /*! * \brief Get an host accessible data space for this dataSpace. */ diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 1b233e574..91c569fc6 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -159,6 +159,11 @@ long double calcChecksum(Complex_ptr d, int len, } // closing brace for detail namespace +/*! + * \brief Get if data space is available in this config. + */ +bool isAvailableDataSpace(DataSpace dataSpace); + /*! * \brief Get an host accessible data space for this dataSpace. * diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index d85873425..b64ed69f3 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -483,6 +483,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("-kds") ) { bool got_someting = false; + bool got_something_available = false; i++; if ( i < argc ) { auto opt_name = std::move(opt); @@ -490,27 +491,29 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt.at(0) == '-' ) { i--; } else { - for (int ds = 0; ds < static_cast(DataSpace::NumSpaces); ++ds) { - if (getDataSpaceName(static_cast(ds)) == opt) { + for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { + DataSpace ds = static_cast(ids); + if (getDataSpaceName(ds) == opt) { got_someting = true; + got_something_available = isAvailableDataSpace(ds); if ( opt_name == std::string("--seq-data-space") || opt_name == std::string("-sds") ) { - seqDataSpace = static_cast(ds); + seqDataSpace = ds; } else if ( opt_name == std::string("--omp-data-space") || opt_name == std::string("-ods") ) { - ompDataSpace = static_cast(ds); + ompDataSpace = ds; } else if ( opt_name == std::string("--omptarget-data-space") || opt_name == std::string("-otds") ) { - ompTargetDataSpace = static_cast(ds); + ompTargetDataSpace = ds; } else if ( opt_name == std::string("--cuda-data-space") || opt_name == std::string("-cds") ) { - cudaDataSpace = static_cast(ds); + cudaDataSpace = ds; } else if ( opt_name == std::string("--hip-data-space") || opt_name == std::string("-hds") ) { - hipDataSpace = static_cast(ds); + hipDataSpace = ds; } else if ( opt_name == std::string("--kokkos-data-space") || opt_name == std::string("-kds") ) { - kokkosDataSpace = static_cast(ds); + kokkosDataSpace = ds; } else { got_someting = false; } @@ -523,6 +526,11 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) << " must give " << opt_name << " a valid data space" << std::endl; input_state = BadInput; + } else if 
(!got_something_available) { + getCout() << "\nBad input:" + << " must give " << opt_name << " a data space this is available in this config" + << std::endl; + input_state = BadInput; } } } @@ -865,8 +873,19 @@ void RunParams::printDataSpaceNames(std::ostream& str) const { str << "\nAvailable data spaces:"; str << "\n-------------------\n"; - for (int ds = 0; ds < static_cast(DataSpace::NumSpaces); ++ds) { - str << getDataSpaceName(static_cast(ds)) << std::endl; + for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { + DataSpace ds = static_cast(ids); + if (isAvailableDataSpace(ds)) { + str << getDataSpaceName(ds) << std::endl; + } + } + str << "\nUnavailable data spaces:"; + str << "\n-------------------\n"; + for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { + DataSpace ds = static_cast(ids); + if (!isAvailableDataSpace(ds)) { + str << getDataSpaceName(ds) << std::endl; + } } str.flush(); } From 577b81a492215c33daa405723ac9c0323d7f3a22 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 15:02:32 -0700 Subject: [PATCH 122/137] Move DataSpace availability function into RajaPerfSuite --- src/common/DataUtils.cpp | 49 ------------------------------- src/common/DataUtils.hpp | 5 ---- src/common/RAJAPerfSuite.cpp | 57 ++++++++++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.hpp | 9 ++++++ src/common/RunParams.cpp | 6 ++-- 5 files changed, 69 insertions(+), 57 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index efd5cdb7b..f1831cc1f 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -561,55 +561,6 @@ long double calcChecksum(Complex_ptr ptr, int len, } // closing brace for detail namespace -/*! - * \brief Get if data space is available in this config. - */ -bool isAvailableDataSpace(DataSpace dataSpace) -{ - switch (dataSpace) { - case DataSpace::Host: - return true; - -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - case DataSpace::Omp: - return true; -#endif - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - case DataSpace::OmpTarget: - return true; -#endif - -#if defined(RAJA_ENABLE_CUDA) - case DataSpace::CudaPinned: - case DataSpace::CudaManaged: - case DataSpace::CudaDevice: - return true; -#endif - -#if defined(RAJA_ENABLE_HIP) - case DataSpace::HipHostAdviseFine: -#if defined(RAJAPERF_USE_MEMADVISE_COARSE) - case DataSpace::HipHostAdviseCoarse: -#endif - case DataSpace::HipPinned: - case DataSpace::HipPinnedFine: - case DataSpace::HipPinnedCoarse: - case DataSpace::HipManaged: - case DataSpace::HipManagedAdviseFine: -#if defined(RAJAPERF_USE_MEMADVISE_COARSE) - case DataSpace::HipManagedAdviseCoarse: -#endif - case DataSpace::HipDevice: - case DataSpace::HipDeviceFine: - return true; -#endif - - default: - return false; - } -} - /*! * \brief Get an host accessible data space for this dataSpace. */ diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 91c569fc6..1b233e574 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -159,11 +159,6 @@ long double calcChecksum(Complex_ptr d, int len, } // closing brace for detail namespace -/*! - * \brief Get if data space is available in this config. - */ -bool isAvailableDataSpace(DataSpace dataSpace); - /*! * \brief Get an host accessible data space for this dataSpace. 
* diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 59c1e11cf..3b7fd8327 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -559,6 +559,63 @@ const std::string& getDataSpaceName(DataSpace ds) return DataSpaceNames[static_cast(ds)]; } +/*! + ******************************************************************************* + * + * Return true if the allocate associated with DataSpace enum value is available. + * + ******************************************************************************* + */ +bool isDataSpaceAvailable(DataSpace dataSpace) +{ + bool ret_val = false; + + switch (dataSpace) { + case DataSpace::Host: + ret_val = true; break; + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + case DataSpace::Omp: + ret_val = true; break; +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case DataSpace::OmpTarget: + ret_val = true; break; +#endif + +#if defined(RAJA_ENABLE_CUDA) + case DataSpace::CudaPinned: + case DataSpace::CudaManaged: + case DataSpace::CudaDevice: + ret_val = true; break; +#endif + +#if defined(RAJA_ENABLE_HIP) + case DataSpace::HipHostAdviseFine: +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) + case DataSpace::HipHostAdviseCoarse: +#endif + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: +#if defined(RAJAPERF_USE_MEMADVISE_COARSE) + case DataSpace::HipManagedAdviseCoarse: +#endif + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + ret_val = true; break; +#endif + + default: + ret_val = false; break; + } + + return ret_val; +} + /* ******************************************************************************* diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 3df084351..b709c669b 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -352,6 +352,15 @@ const std::string& getFeatureName(FeatureID vid); */ const std::string& getDataSpaceName(DataSpace cd); +/*! + ******************************************************************************* + * + * Return true if the allocate associated with DataSpace enum value is available. + * + ******************************************************************************* + */ +bool isDataSpaceAvailable(DataSpace dataSpace); + /*! 
******************************************************************************* * diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b64ed69f3..7bc8f58c6 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -495,7 +495,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) DataSpace ds = static_cast(ids); if (getDataSpaceName(ds) == opt) { got_someting = true; - got_something_available = isAvailableDataSpace(ds); + got_something_available = isDataSpaceAvailable(ds); if ( opt_name == std::string("--seq-data-space") || opt_name == std::string("-sds") ) { seqDataSpace = ds; @@ -875,7 +875,7 @@ void RunParams::printDataSpaceNames(std::ostream& str) const str << "\n-------------------\n"; for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast(ids); - if (isAvailableDataSpace(ds)) { + if (isDataSpaceAvailable(ds)) { str << getDataSpaceName(ds) << std::endl; } } @@ -883,7 +883,7 @@ void RunParams::printDataSpaceNames(std::ostream& str) const str << "\n-------------------\n"; for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast(ids); - if (!isAvailableDataSpace(ds)) { + if (!isDataSpaceAvailable(ds)) { str << getDataSpaceName(ds) << std::endl; } } From ed75ae81643ae0fef5021628f7e0755796deddee Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 24 Mar 2023 15:02:56 -0700 Subject: [PATCH 123/137] Only print Data Spaces for used backends --- src/common/Executor.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index fcdb7ee93..538fc1df7 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -718,11 +718,21 @@ void Executor::reportRunSummary(ostream& str) const str << "\nData Spaces" << "\n--------"; str << "\nSeq - " << getDataSpaceName(run_params.getSeqDataSpace()); - str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpDataSpace()); - str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetDataSpace()); - str << "\nCuda - " << getDataSpaceName(run_params.getCudaDataSpace()); - str << "\nHip - " << getDataSpaceName(run_params.getHipDataSpace()); - str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosDataSpace()); + if (isVariantAvailable(VariantID::Base_OpenMP)) { + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpDataSpace()); + } + if (isVariantAvailable(VariantID::Base_OpenMPTarget)) { + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetDataSpace()); + } + if (isVariantAvailable(VariantID::Base_CUDA)) { + str << "\nCuda - " << getDataSpaceName(run_params.getCudaDataSpace()); + } + if (isVariantAvailable(VariantID::Base_HIP)) { + str << "\nHip - " << getDataSpaceName(run_params.getHipDataSpace()); + } + if (isVariantAvailable(VariantID::Kokkos_Lambda)) { + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosDataSpace()); + } str << endl; str << "\nVariants and Tunings" From 89831cb41ca561ce3ff4cec979396ae57b6371c0 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Mar 2023 09:40:13 -0700 Subject: [PATCH 124/137] Deallocate real_zones --- src/apps/DEL_DOT_VEC_2D.cpp | 1 + src/apps/NODAL_ACCUMULATION_3D.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 7baba4a84..ffe5edeb2 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -107,6 +107,7 @@ void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_x, vid); deallocData(m_y, vid); + deallocData(m_real_zones, vid); deallocData(m_xdot, vid); deallocData(m_ydot, vid); deallocData(m_div, vid); diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 76fc37e59..ed1bd2078 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -99,6 +99,7 @@ void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t deallocData(m_x, vid); deallocData(m_vol, vid); + deallocData(m_real_zones, vid); } } // end namespace apps From 26b95613027e62178a69866d2056c292717f303a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 27 Mar 2023 10:54:23 -0700 Subject: [PATCH 125/137] empty commit to kickstart GitLab CI From 347ccc0149ef7c8f4f6cb4ace4788a91d416127a Mon Sep 17 00:00:00 2001 From: holger Date: Mon, 27 Mar 2023 12:42:53 -0700 Subject: [PATCH 126/137] removed Apps_COUPLE encumberance wrt embedded code to exclude this kernel --- TODO/README.md | 17 +++++++++++++ {src/apps => TODO}/WIP-COUPLE.cpp | 0 {src/apps => TODO}/WIP-COUPLE.hpp | 0 src/CMakeLists.txt | 1 - src/apps/CMakeLists.txt | 1 - src/common/Executor.cpp | 6 +---- src/common/RAJAPerfSuite.cpp | 7 +----- src/common/RAJAPerfSuite.hpp | 1 - src/common/RunParams.cpp | 42 +++++++++++-------------------- tpl/RAJA | 2 +- 10 files changed, 35 insertions(+), 42 deletions(-) create mode 100644 TODO/README.md rename {src/apps => TODO}/WIP-COUPLE.cpp (100%) rename {src/apps => TODO}/WIP-COUPLE.hpp (100%) diff --git a/TODO/README.md b/TODO/README.md new file mode 100644 index 000000000..e51592b5c --- /dev/null +++ b/TODO/README.md @@ -0,0 +1,17 @@ +[comment]: # (#################################################################) +[comment]: # (Copyright 2017-23, Lawrence Livermore National Security, LLC) +[comment]: # (and RAJA Performance Suite project contributors.) +[comment]: # (See the RAJAPerf/LICENSE file for details.) +[comment]: # +[comment]: # (# SPDX-License-Identifier: BSD-3-Clause) +[comment]: # (#################################################################) + +# RAJA + +RAJA Performance Suite TODO +=========================== + +This directory holds kernel implementations that are on hold or WIP + +## Kernels +1. 
WIP-couple: aka apps/COUPLE -- requires lightweight device side complex math \ No newline at end of file diff --git a/src/apps/WIP-COUPLE.cpp b/TODO/WIP-COUPLE.cpp similarity index 100% rename from src/apps/WIP-COUPLE.cpp rename to TODO/WIP-COUPLE.cpp diff --git a/src/apps/WIP-COUPLE.hpp b/TODO/WIP-COUPLE.hpp similarity index 100% rename from src/apps/WIP-COUPLE.hpp rename to TODO/WIP-COUPLE.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5f93d967b..52d54cb67 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -78,7 +78,6 @@ blt_add_executable( apps/VOL3D.cpp apps/VOL3D-Seq.cpp apps/VOL3D-OMPTarget.cpp - apps/WIP-COUPLE.cpp basic/DAXPY.cpp basic/DAXPY-Seq.cpp basic/DAXPY-OMPTarget.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 6d521d1df..7c1973810 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -87,6 +87,5 @@ blt_add_library( VOL3D-Cuda.cpp VOL3D-OMP.cpp VOL3D-OMPTarget.cpp - WIP-COUPLE.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index a41e3fd00..df789fd6d 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -594,11 +594,7 @@ void Executor::setupSuite() for (KIDset::iterator kid = run_kern.begin(); kid != run_kern.end(); ++kid) { -/// RDH DISABLE COUPLE KERNEL until we find a reasonable way to do -/// complex numbers in GPU code - if ( *kid != Apps_COUPLE ) { - kernels.push_back( getKernelObject(*kid, run_params) ); - } + kernels.push_back( getKernelObject(*kid, run_params) ); } if ( !(run_params.getInvalidVariantInput().empty()) || diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 27650cf56..88e58ffc6 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -79,7 +79,6 @@ // Apps kernels... // #include "apps/CONVECTION3DPA.hpp" -#include "apps/WIP-COUPLE.hpp" #include "apps/DEL_DOT_VEC_2D.hpp" #include "apps/DIFFUSION3DPA.hpp" #include "apps/ENERGY.hpp" @@ -215,7 +214,6 @@ static const std::string KernelNames [] = // Apps kernels... // std::string("Apps_CONVECTION3DPA"), - std::string("Apps_COUPLE"), std::string("Apps_DEL_DOT_VEC_2D"), std::string("Apps_DIFFUSION3DPA"), std::string("Apps_ENERGY"), @@ -723,10 +721,7 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::CONVECTION3DPA(run_params); break; } - case Apps_COUPLE : { - kernel = new apps::COUPLE(run_params); - break; - } + case Apps_DEL_DOT_VEC_2D : { kernel = new apps::DEL_DOT_VEC_2D(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index e73bd9888..cf70b31b6 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -132,7 +132,6 @@ enum KernelID { // Apps kernels... 
// Apps_CONVECTION3DPA, - Apps_COUPLE, Apps_DEL_DOT_VEC_2D, Apps_DIFFUSION3DPA, Apps_ENERGY, diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b4af80a20..d7f31cd80 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -718,10 +718,7 @@ void RunParams::printKernelNames(std::ostream& str) const str << "\nAvailable kernels:"; str << "\n------------------\n"; for (int kid = 0; kid < NumKernels; ++kid) { -/// RDH DISABLE COUPLE KERNEL - if (static_cast(kid) != Apps_COUPLE) { - str << getKernelName(static_cast(kid)) << std::endl; - } + str << getKernelName(static_cast(kid)) << std::endl; } str.flush(); } @@ -732,10 +729,7 @@ void RunParams::printFullKernelNames(std::ostream& str) const str << "\nAvailable kernels (_):"; str << "\n-----------------------------------------\n"; for (int kid = 0; kid < NumKernels; ++kid) { -/// RDH DISABLE COUPLE KERNEL - if (static_cast(kid) != Apps_COUPLE) { - str << getFullKernelName(static_cast(kid)) << std::endl; - } + str << getFullKernelName(static_cast(kid)) << std::endl; } str.flush(); } @@ -781,14 +775,11 @@ void RunParams::printFeatureKernels(std::ostream& str) const str << getFeatureName(tfid) << std::endl; for (int kid = 0; kid < NumKernels; ++kid) { KernelID tkid = static_cast(kid); -/// RDH DISABLE COUPLE KERNEL - if (tkid != Apps_COUPLE) { - KernelBase* kern = getKernelObject(tkid, *this); - if ( kern->usesFeature(tfid) ) { - str << "\t" << getFullKernelName(tkid) << std::endl; - } - delete kern; + KernelBase* kern = getKernelObject(tkid, *this); + if ( kern->usesFeature(tfid) ) { + str << "\t" << getFullKernelName(tkid) << std::endl; } + delete kern; } // loop over kernels str << std::endl; } // loop over features @@ -801,18 +792,15 @@ void RunParams::printKernelFeatures(std::ostream& str) const str << "\n-----------------------------------------\n"; for (int kid = 0; kid < NumKernels; ++kid) { KernelID tkid = static_cast(kid); -/// RDH DISABLE COUPLE KERNEL - if (tkid != Apps_COUPLE) { - str << getFullKernelName(tkid) << std::endl; - KernelBase* kern = getKernelObject(tkid, *this); - for (int fid = 0; fid < NumFeatures; ++fid) { - FeatureID tfid = static_cast(fid); - if ( kern->usesFeature(tfid) ) { - str << "\t" << getFeatureName(tfid) << std::endl; - } - } // loop over features - delete kern; - } + str << getFullKernelName(tkid) << std::endl; + KernelBase* kern = getKernelObject(tkid, *this); + for (int fid = 0; fid < NumFeatures; ++fid) { + FeatureID tfid = static_cast(fid); + if ( kern->usesFeature(tfid) ) { + str << "\t" << getFeatureName(tfid) << std::endl; + } + } // loop over features + delete kern; } // loop over kernels str.flush(); } diff --git a/tpl/RAJA b/tpl/RAJA index 6e1b9afbe..6250fbdcb 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 6e1b9afbebd3d536de447d36733b473cc8dea790 +Subproject commit 6250fbdcb3b74ae4d86ab1755aa73d70759bdcb2 From ce8260252e099008e30b3ff425a8840399e924cd Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 29 Mar 2023 12:39:45 -0700 Subject: [PATCH 127/137] Added short description about running the test --- docs/sphinx/user_guide/run.rst | 37 ++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index c4f53c522..595c5adcc 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -12,11 +12,40 @@ Running the RAJA Performance Suite ********************************************* -This section describes how to run the 
Suite and which execution options are -available. +This section describes how to run the Suite, after the Suite code is compiled +following the instructions provided in :ref:`build-label`. -After the Suite code is compiled, following the instructions provided in -:ref:`build-label`, the executable will reside in the ``bin`` subdirectory +.. _run_test-label: + +================== +Running a test +================== + +After compilation, a test executable will reside in the ``test`` subdirectory +of the build space. We use this test for our continuous integration testing +to make sure everything works when changes are made to the code. +To run the test, type the test executable name:: + + $ ./test/test-raja-perf-suite.exe + +This will run a few iterations of each kernel and variant that was built +based on the CMake options specified to configure the build. + +You can also run an individual kernel by setting an environment variable +to the name of the kernel you want to run. For example:: + + $ setenv RAJA_PERFSUITE_UNIT_TEST DAXPY + $ ./test/test-raja-perf-suite.exe + +will run all compiled variants of the 'DAXPY' kernel. + +.. _run_suite-label: + +================== +Running the Suite +================== + +After compilation, the main executable will reside in the ``bin`` subdirectory of the build space. The executable will be able to run all kernels and variants that have been built depending on which CMake options were specified to configure the build. From e2a2b47f4fde41261bfad0ab06a9f6700d39bd27 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 29 Mar 2023 12:47:16 -0700 Subject: [PATCH 128/137] Fix format --- docs/sphinx/user_guide/run.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index 595c5adcc..012c32d6a 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -78,7 +78,7 @@ option to the executable:: $ ./bin/raja-perf.exe --help -or +or:: $ ./bin/raja-perf.exe -h From dfbfeefa14cb755203e0258541fbf40995ba05d8 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 29 Mar 2023 13:25:11 -0700 Subject: [PATCH 129/137] minor change. --- docs/sphinx/user_guide/run.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index 012c32d6a..3d8473221 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -32,12 +32,18 @@ This will run a few iterations of each kernel and variant that was built based on the CMake options specified to configure the build. You can also run an individual kernel by setting an environment variable -to the name of the kernel you want to run. For example:: +to the name of the kernel you want to run. For example, +if you use a csh/tcsh shell:: $ setenv RAJA_PERFSUITE_UNIT_TEST DAXPY $ ./test/test-raja-perf-suite.exe -will run all compiled variants of the 'DAXPY' kernel. +or, if you use a bash shell:: + + $ RAJA_PERFSUITE_UNIT_TEST=DAXPY ./test/test-raja-perf-suite.exe + +In either case, the test will run all compiled variants of the 'DAXPY' +kernel. .. 
_run_suite-label: From 96c524c77ecb2b054471fd804651baa9e6d83084 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 31 Mar 2023 13:25:00 -0700 Subject: [PATCH 130/137] Replace #ifdef usage with #if defined() for consistency --- src/RAJAPerfSuiteDriver.cpp | 12 ++++++------ src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 2 +- src/common/Executor.cpp | 22 +++++++++++----------- src/common/KernelBase.hpp | 4 ++-- src/common/OutputUtils.cpp | 6 +++--- src/common/RAJAPerfSuite.cpp | 4 ++-- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index 7b3317f65..3ce688d29 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifdef RUN_KOKKOS +#if defined(RUN_KOKKOS) #include #endif @@ -14,21 +14,21 @@ #include -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) #include #endif //------------------------------------------------------------------------------ int main( int argc, char** argv ) { -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) MPI_Init(&argc, &argv); int num_ranks; MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); rajaperf::getCout() << "\n\nRunning with " << num_ranks << " MPI ranks..." << std::endl; #endif -#ifdef RUN_KOKKOS +#if defined(RUN_KOKKOS) Kokkos::initialize(argc, argv); #endif @@ -50,10 +50,10 @@ int main( int argc, char** argv ) rajaperf::getCout() << "\n\nDONE!!!...." << std::endl; -#ifdef RUN_KOKKOS +#if defined(RUN_KOKKOS) Kokkos::finalize(); #endif -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) MPI_Finalize(); #endif diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 6fe04a8c0..f5771fe2d 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -195,7 +195,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::hip_work_async, -#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL +#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, #else RAJA::ordered, diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index a41e3fd00..c59fe7435 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -12,7 +12,7 @@ #include "common/KernelBase.hpp" #include "common/OutputUtils.hpp" -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) #include #endif @@ -44,7 +44,7 @@ using namespace std; namespace { -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) void Allreduce(const Checksum_type* send, Checksum_type* recv, int count, MPI_Op op, MPI_Comm comm) @@ -737,7 +737,7 @@ void Executor::reportRunSummary(ostream& str) const void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const { if ( to_file ) { -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) int num_ranks; MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); str << "Kernels run on " << num_ranks << " MPI ranks" << endl; @@ -979,7 +979,7 @@ void Executor::outputRunData() unique_ptr Executor::openOutputFile(const string& filename) const { int rank = 0; -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) MPI_Comm_rank(MPI_COMM_WORLD, &rank); #endif if (rank == 0) { @@ -1376,7 +1376,7 @@ void Executor::writeChecksumReport(ostream& file) { if ( 
file ) { -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) int num_ranks; MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); #endif @@ -1411,7 +1411,7 @@ void Executor::writeChecksumReport(ostream& file) // file << equal_line << endl; file << "Checksum Report "; -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) file << "for " << num_ranks << " MPI ranks "; #endif file << endl; @@ -1423,7 +1423,7 @@ void Executor::writeChecksumReport(ostream& file) file <> checksums_sum(variant_ids.size()); @@ -1573,7 +1573,7 @@ void Executor::writeChecksumReport(ostream& file) if ( kern->wasVariantTuningRun(vid, tune_idx) ) { file < #endif @@ -46,7 +46,7 @@ std::string recursiveMkdir(const std::string& in_path) if ( path.empty() ) return std::string(); -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) int rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &rank); @@ -139,7 +139,7 @@ std::string recursiveMkdir(const std::string& in_path) delete[] path_buf; -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) // Rank 0 lets the other processes know it made the directories if (rank == 0) { MPI_Barrier(MPI_COMM_WORLD); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index da6683907..5a7efeaad 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -10,7 +10,7 @@ #include "RunParams.hpp" -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) #include #endif @@ -839,7 +839,7 @@ std::ostream& getNullStream() std::ostream& getCout() { int rank = 0; -#ifdef RAJA_PERFSUITE_ENABLE_MPI +#if defined(RAJA_PERFSUITE_ENABLE_MPI) MPI_Comm_rank(MPI_COMM_WORLD, &rank); #endif if (rank == 0) { From d92621780cab0fe0e889571055c1c80048b971b0 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 19 Apr 2023 13:13:38 -0700 Subject: [PATCH 131/137] Restructure unimplemented variant files following conventions used in other variant files. Also, squash unused variable warnings. 
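For reference, the convention these unimplemented OpenMP target variant files now follow is sketched below. This is an illustrative outline only (KERNEL stands in for the kernel class name, e.g. MASS3DPA); the per-kernel changes in this patch show the actual code:

    void KERNEL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
    {
      if ( vid == Base_OpenMPTarget ) {
        // no OpenMP target implementation yet
      } else if ( vid == RAJA_OpenMPTarget ) {
        // no RAJA OpenMP target implementation yet
      } else {
        getCout() << "\n KERNEL : Unknown OpenMPTarget variant id = " << vid << std::endl;
      }
    }

Dropping the unused run_reps/switch boilerplate from these stubs is what removes the unused-variable warnings.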
--- src/algorithm/SCAN-OMPTarget.cpp | 2 ++ src/apps/CONVECTION3DPA-OMPTarget.cpp | 15 ++++++--------- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 15 ++++++--------- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 4 ++-- src/apps/MASS3DPA-OMPTarget.cpp | 15 ++++++--------- src/basic/MAT_MAT_SHARED-OMPTarget.cpp | 17 +++++++---------- 6 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/algorithm/SCAN-OMPTarget.cpp b/src/algorithm/SCAN-OMPTarget.cpp index 05347e7f9..762a3909c 100644 --- a/src/algorithm/SCAN-OMPTarget.cpp +++ b/src/algorithm/SCAN-OMPTarget.cpp @@ -25,7 +25,9 @@ namespace algorithm // // Define threads per team for target execution // +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) const size_t threads_per_team = 256; +#endif void SCAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/CONVECTION3DPA-OMPTarget.cpp b/src/apps/CONVECTION3DPA-OMPTarget.cpp index 92ee3d987..e0317c930 100644 --- a/src/apps/CONVECTION3DPA-OMPTarget.cpp +++ b/src/apps/CONVECTION3DPA-OMPTarget.cpp @@ -19,17 +19,14 @@ namespace rajaperf { namespace apps { -void CONVECTION3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - const Index_type run_reps = getRunReps(); +void CONVECTION3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + if ( vid == Base_OpenMPTarget ) { - switch (vid) { + } else if ( vid == RAJA_OpenMPTarget ) { - default: { - - getCout() << "\n CONVECTION3DPA : Unknown OpenMPTarget variant id = " << vid - << std::endl; - break; - } + } else { + getCout() << "\n CONVECTION3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; } } diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index befedda9c..03a1811a3 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -19,17 +19,14 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - const Index_type run_reps = getRunReps(); +void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + if ( vid == Base_OpenMPTarget ) { - switch (vid) { + } else if ( vid == RAJA_OpenMPTarget ) { - default: { - - getCout() << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid - << std::endl; - break; - } + } else { + getCout() << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 914f4d452..4dd2dad31 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -22,9 +22,9 @@ namespace apps { // - // Define threads per team for target execution + // Define threads per team for target execution (unused) // - const size_t threads_per_team = 256; +//const size_t threads_per_team = 256; #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index 62fb1def4..e4cc02a4f 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -19,17 +19,14 @@ namespace rajaperf { namespace apps { +void MASS3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + if ( vid == Base_OpenMPTarget ) { -void MASS3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - 
const Index_type run_reps = getRunReps(); + } else if ( vid == RAJA_OpenMPTarget ) { - switch (vid) { - - default: { - - getCout() << "\n MASS3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; - break; - } + } else { + getCout() << "\n MASS3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; } } diff --git a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp index 6dac3ee94..806d220f4 100644 --- a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp +++ b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp @@ -19,19 +19,16 @@ namespace rajaperf { namespace basic { +void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + if ( vid == Base_OpenMPTarget ) { - void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - const Index_type run_reps = getRunReps(); + } else if ( vid == RAJA_OpenMPTarget ) { - switch (vid) { - - default: { - - getCout() << "\n MAT_MAT_SHARED : Unknown OpenMPTarget variant id = " << vid << std::endl; - break; - } - } + } else { + getCout() << "\n MAT_MAT_SHARED : Unknown OpenMPTarget variant id = " << vid << std::endl; } +} } // end namespace basic } // end namespace rajaperf From 9229d48bf381c2987484b50baceaf8ba7c94bf9e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 19 Apr 2023 13:14:56 -0700 Subject: [PATCH 132/137] Guard lambda expressions to prevent warnings in OpenMP target builds --- src/apps/ENERGY-Seq.cpp | 2 ++ src/apps/FIR-Seq.cpp | 2 ++ src/apps/LTIMES_NOVIEW-Seq.cpp | 2 ++ src/apps/PRESSURE-Seq.cpp | 2 ++ src/apps/VOL3D-Seq.cpp | 2 ++ src/basic/DAXPY-Seq.cpp | 2 ++ src/basic/IF_QUAD-Seq.cpp | 2 ++ src/basic/INIT3-Seq.cpp | 2 ++ src/basic/MULADDSUB-Seq.cpp | 2 ++ src/basic/NESTED_INIT-Seq.cpp | 2 ++ src/lcals/DIFF_PREDICT-Seq.cpp | 2 ++ src/lcals/EOS-Seq.cpp | 2 ++ src/lcals/FIRST_DIFF-Seq.cpp | 2 ++ src/lcals/FIRST_SUM-Seq.cpp | 2 ++ src/lcals/GEN_LIN_RECUR-Seq.cpp | 2 ++ src/lcals/HYDRO_1D-Seq.cpp | 2 ++ src/lcals/INT_PREDICT-Seq.cpp | 2 ++ src/lcals/PLANCKIAN-Seq.cpp | 2 ++ src/lcals/TRIDIAG_ELIM-Seq.cpp | 2 ++ src/stream/ADD-Seq.cpp | 2 ++ src/stream/COPY-Seq.cpp | 2 ++ src/stream/MUL-Seq.cpp | 2 ++ src/stream/TRIAD-Seq.cpp | 2 ++ 23 files changed, 46 insertions(+) diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index 68eb1b2dc..fd27c0cfb 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -26,6 +26,7 @@ void ENERGY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) ENERGY_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto energy_lam1 = [=](Index_type i) { ENERGY_BODY1; }; @@ -44,6 +45,7 @@ void ENERGY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) auto energy_lam6 = [=](Index_type i) { ENERGY_BODY6; }; +#endif switch ( vid ) { diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index d5dba3ade..bb07006c6 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -32,9 +32,11 @@ void FIR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) Real_type coeff[FIR_COEFFLEN]; std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff)); +#if defined(RUN_RAJA_SEQ) auto fir_lam = [=](Index_type i) { FIR_BODY; }; +#endif switch ( vid ) { diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index f025f8dc4..4ed0cd883 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -24,10 +24,12 @@ void LTIMES_NOVIEW::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune 
LTIMES_NOVIEW_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto ltimesnoview_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; }; +#endif switch ( vid ) { diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 22731d5c2..aba74c408 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -26,12 +26,14 @@ void PRESSURE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) PRESSURE_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto pressure_lam1 = [=](Index_type i) { PRESSURE_BODY1; }; auto pressure_lam2 = [=](Index_type i) { PRESSURE_BODY2; }; +#endif switch ( vid ) { diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index fd3e7dcd1..d952066c3 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -28,9 +28,11 @@ void VOL3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) VOL3D_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto vol3d_lam = [=](Index_type i) { VOL3D_BODY; }; +#endif switch ( vid ) { diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 13e7d230f..7309705c0 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -26,9 +26,11 @@ void DAXPY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) DAXPY_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto daxpy_lam = [=](Index_type i) { DAXPY_BODY; }; +#endif switch ( vid ) { diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index 09227c748..4fef21d39 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -26,9 +26,11 @@ void IF_QUAD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) IF_QUAD_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto ifquad_lam = [=](Index_type i) { IF_QUAD_BODY; }; +#endif switch ( vid ) { diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index a757497f1..a99ba7ece 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -26,9 +26,11 @@ void INIT3::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) INIT3_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto init3_lam = [=](Index_type i) { INIT3_BODY; }; +#endif switch ( vid ) { diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index 5c23bc9b0..bb07b8520 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -26,9 +26,11 @@ void MULADDSUB::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx MULADDSUB_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto mas_lam = [=](Index_type i) { MULADDSUB_BODY; }; +#endif switch ( vid ) { diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index 0a9c81ff6..f9d3034a3 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -24,9 +24,11 @@ void NESTED_INIT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i NESTED_INIT_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto nestedinit_lam = [=](Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index 81f35a4fb..af94c9e0c 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -26,9 +26,11 @@ void DIFF_PREDICT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ DIFF_PREDICT_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto diffpredict_lam = [=](Index_type i) { DIFF_PREDICT_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index 52637ff3b..c694819b2 100644 --- 
a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -26,9 +26,11 @@ void EOS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) EOS_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto eos_lam = [=](Index_type i) { EOS_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 13513b2a3..6ea1ae7e8 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -26,9 +26,11 @@ void FIRST_DIFF::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id FIRST_DIFF_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto firstdiff_lam = [=](Index_type i) { FIRST_DIFF_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index fb1be5ac2..cbf74f6d9 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -26,9 +26,11 @@ void FIRST_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx FIRST_SUM_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto firstsum_lam = [=](Index_type i) { FIRST_SUM_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index f4f77bac6..2ade85fe7 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -24,12 +24,14 @@ void GEN_LIN_RECUR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune GEN_LIN_RECUR_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto genlinrecur_lam1 = [=](Index_type k) { GEN_LIN_RECUR_BODY1; }; auto genlinrecur_lam2 = [=](Index_type i) { GEN_LIN_RECUR_BODY2; }; +#endif switch ( vid ) { diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index 168d02656..9a99a5cc6 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -26,9 +26,11 @@ void HYDRO_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) HYDRO_1D_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto hydro1d_lam = [=](Index_type i) { HYDRO_1D_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp index ce202a596..b4005d979 100644 --- a/src/lcals/INT_PREDICT-Seq.cpp +++ b/src/lcals/INT_PREDICT-Seq.cpp @@ -26,9 +26,11 @@ void INT_PREDICT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i INT_PREDICT_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto intpredict_lam = [=](Index_type i) { INT_PREDICT_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp index 04d3e4536..3349c3595 100644 --- a/src/lcals/PLANCKIAN-Seq.cpp +++ b/src/lcals/PLANCKIAN-Seq.cpp @@ -27,9 +27,11 @@ void PLANCKIAN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx PLANCKIAN_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto planckian_lam = [=](Index_type i) { PLANCKIAN_BODY; }; +#endif switch ( vid ) { diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp index 97d0a7cb0..54319a912 100644 --- a/src/lcals/TRIDIAG_ELIM-Seq.cpp +++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp @@ -26,9 +26,11 @@ void TRIDIAG_ELIM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ TRIDIAG_ELIM_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto tridiag_elim_lam = [=](Index_type i) { TRIDIAG_ELIM_BODY; }; +#endif switch ( vid ) { diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index eb47fe3f0..cd2fec8bb 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -26,9 +26,11 @@ void ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) ADD_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto add_lam = 
[=](Index_type i) {
                  ADD_BODY;
                };
+#endif

  switch ( vid ) {

diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp
index 0ce95ae72..d1d05d6be 100644
--- a/src/stream/COPY-Seq.cpp
+++ b/src/stream/COPY-Seq.cpp
@@ -26,9 +26,11 @@ void COPY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))

  COPY_DATA_SETUP;

+#if defined(RUN_RAJA_SEQ)
  auto copy_lam = [=](Index_type i) {
                    COPY_BODY;
                  };
+#endif

  switch ( vid ) {

diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp
index dfd078b64..9d31fcf37 100644
--- a/src/stream/MUL-Seq.cpp
+++ b/src/stream/MUL-Seq.cpp
@@ -26,9 +26,11 @@ void MUL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))

  MUL_DATA_SETUP;

+#if defined(RUN_RAJA_SEQ)
  auto mul_lam = [=](Index_type i) {
                   MUL_BODY;
                 };
+#endif

  switch ( vid ) {

diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp
index f3e68603c..6e9f497fa 100644
--- a/src/stream/TRIAD-Seq.cpp
+++ b/src/stream/TRIAD-Seq.cpp
@@ -26,9 +26,11 @@ void TRIAD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))

  TRIAD_DATA_SETUP;

+#if defined(RUN_RAJA_SEQ)
  auto triad_lam = [=](Index_type i) {
                     TRIAD_BODY;
                   };
+#endif

  switch ( vid ) {

From a77b333c00905ab1f50e01832413c7204f7a3d63 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Wed, 19 Apr 2023 14:00:03 -0700
Subject: [PATCH 133/137] Change to loop_exec policy to prevent compiler warning

---
 src/basic/DAXPY_ATOMIC-Seq.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp
index 7d47cbebe..84711dbd9 100644
--- a/src/basic/DAXPY_ATOMIC-Seq.cpp
+++ b/src/basic/DAXPY_ATOMIC-Seq.cpp
@@ -68,7 +68,7 @@ void DAXPY_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_

      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        RAJA::forall<RAJA::simd_exec>(
+        RAJA::forall<RAJA::loop_exec>(
          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
          DAXPY_ATOMIC_RAJA_BODY(RAJA::seq_atomic);

From a401bd080f13b3e16930ceb9ca95a36f5e760963 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Wed, 19 Apr 2023 15:38:52 -0700
Subject: [PATCH 134/137] squash compiler warning for openmp target build.

---
 src/lcals/HYDRO_2D-Seq.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp
index 1f67b2aa7..95a19602c 100644
--- a/src/lcals/HYDRO_2D-Seq.cpp
+++ b/src/lcals/HYDRO_2D-Seq.cpp
@@ -27,6 +27,9 @@ void HYDRO_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)
   const Index_type jend = m_jn - 1;

  HYDRO_2D_DATA_SETUP;
+#if !defined(RUN_RAJA_SEQ)
+  RAJA_UNUSED_VAR(kn); // prevents a compiler warning in the OpenMP target offload build case
+#endif

  switch ( vid ) {

From f4e68fb5b653249293b44c55e1c56aafc6abceae Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Tue, 9 May 2023 10:13:06 -0700
Subject: [PATCH 135/137] Fix RTD build issue

---
 .readthedocs.yml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index c796cf536..be19340f4 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,6 +5,12 @@
 # Required
 version: 2

+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
 ##
 ## Code for debugging directory structure in readthedocs
 ## to fix broken links, etc.
@@ -27,8 +33,7 @@ sphinx: # Optionally build docs in add'l formats such as PDF and ePub #formats: all -# Optionally set the version of Python and requirements to build the docs +# Set requirements needed to build the docs python: - version: 3.7 install: - requirements: docs/requirements.txt From 882c8ab0950bf6336fd23ddb1c712f03dccf3a46 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 9 May 2023 10:45:42 -0700 Subject: [PATCH 136/137] Fix configuration file for RTD build --- docs/conf.py | 68 ++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 3e1e9e534..6673fa10f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,23 +22,23 @@ #sys.path.insert(0, os.path.abspath('_exts')) # Call doxygen in ReadtheDocs -#read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' -#if read_the_docs_build: -# # Generate an RST file for Doxygen index, this is replaced by the real -# # index.html by hooking into the Sphinx build-finished event at the bottom of -# # this file -# cwd=os.getcwd() -# fpath=os.path.join(cwd,"doxygen/html") -# if (os.path.isdir(fpath) == 0): -# os.makedirs(fpath) -# with open(os.path.join(fpath,"index.rst"), 'w') as f: -# print("Writing file {}", f) -# f.write(".. _doxygen:\n") -# f.write("\n") -# f.write("*******\n") -# f.write("Doxygen\n") -# f.write("*******\n") -# +read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' +if read_the_docs_build: + # Generate an RST file for Doxygen index, this is replaced by the real + # index.html by hooking into the Sphinx build-finished event at the bottom of + # this file + cwd=os.getcwd() + fpath=os.path.join(cwd,"doxygen/html") + if (os.path.isdir(fpath) == 0): + os.makedirs(fpath) + with open(os.path.join(fpath,"index.rst"), 'w') as f: + print("Writing file {}", f) + f.write(".. 
_doxygen:\n") + f.write("\n") + f.write("*******\n") + f.write("Doxygen\n") + f.write("*******\n") + # Get current directory conf_directory = os.path.dirname(os.path.realpath(__file__)) @@ -332,20 +332,20 @@ # Generate Doxygen, and overwrite the index.rst in doxygen/html # Only do this on readthedocs -#def gendoxy(app, exception): -# if read_the_docs_build: -# buildpath=os.path.join(conf_directory,"_build/html/doxygen/html") -# if (os.path.isdir(buildpath) == 0): -# os.makedirs(buildpath) -# -# if (os.path.exists(os.path.join(buildpath, 'index.html"'))): -# print("Removing existing index.html") -# os.remove(os.path.join(buildpath, "index.html")) -# -# # Call doxygen -# from subprocess import call -# call(['doxygen', "./doxygen/Doxyfile"]) -# -# -#def setup(app): -# app.connect('build-finished', gendoxy) +def gendoxy(app, exception): + if read_the_docs_build: + buildpath=os.path.join(conf_directory,"_build/html/doxygen/html") + if (os.path.isdir(buildpath) == 0): + os.makedirs(buildpath) + + if (os.path.exists(os.path.join(buildpath, 'index.html"'))): + print("Removing existing index.html") + os.remove(os.path.join(buildpath, "index.html")) + + # Call doxygen + from subprocess import call + call(['doxygen', "./doxygen/Doxyfile"]) + + +def setup(app): + app.connect('build-finished', gendoxy) From 982041d87cb954f853fccff46f18aaaf8fecfddf Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 9 May 2023 14:07:27 -0700 Subject: [PATCH 137/137] Try newer version of docutils --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 93120e66c..6b1d35172 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1 @@ -docutils<0.18 +docutils<0.20
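
Note on the guard pattern in PATCH 132: each kernel's lambdas feed only the Lambda_Seq and RAJA_Seq variants, so in builds where RUN_RAJA_SEQ is not defined (e.g., OpenMP target offload builds) the definitions have no users and draw unused-variable warnings. Wrapping the definitions in the same #if defined(RUN_RAJA_SEQ) guard that already wraps their uses removes the warnings without changing any variant's behavior. A minimal standalone sketch of the idiom follows; the kernel, function, and variable names are hypothetical and are not taken from any RAJAPerf source file.

// Standalone sketch of the RUN_RAJA_SEQ guard idiom (hypothetical names).
// Remove the #define to mimic a build where the lambda-based variants are
// disabled: the lambda definition then disappears along with its only users,
// so no unused-variable warning is emitted.
#include <cstddef>
#include <iostream>
#include <vector>

#define RUN_RAJA_SEQ  // assumption: normally supplied by the build system

using Index_type = std::ptrdiff_t;

void runSeqVariantSketch(int vid, std::vector<double>& y,
                         const std::vector<double>& x, double a)
{
  const Index_type len = static_cast<Index_type>(y.size());

#if defined(RUN_RAJA_SEQ)
  // Defined only when the variants that capture it are compiled in.
  auto daxpy_lam = [&](Index_type i) { y[i] += a * x[i]; };
#endif

  switch (vid) {
    case 0: {  // base variant: raw loop, no lambda needed
      for (Index_type i = 0; i < len; ++i) { y[i] += a * x[i]; }
      break;
    }
#if defined(RUN_RAJA_SEQ)
    case 1: {  // lambda variant: the only user of daxpy_lam
      for (Index_type i = 0; i < len; ++i) { daxpy_lam(i); }
      break;
    }
#endif
    default: {
      std::cout << "\n Unknown variant id = " << vid << std::endl;
    }
  }
}

int main()
{
  std::vector<double> x(4, 1.0), y(4, 1.0);
  runSeqVariantSketch(1, y, x, 2.0);
  std::cout << y[0] << std::endl;  // prints 3
}

Guarding the definition with the same macro as the uses keeps the two in lockstep: either both compile or both vanish, so the fix cannot silently drop a variant.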
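Note on the two warning fixes that follow PATCH 132: PATCH 133 swaps the execution policy used by the RAJA_Seq variant rather than marking anything unused, while PATCH 134 keeps the setup variable kn and "uses" it through RAJA_UNUSED_VAR in builds where its real consumers are compiled out. A hedged, RAJA-free sketch of that second idiom is below; the unused_var helper is a stand-in written for illustration, not RAJA's actual RAJA_UNUSED_VAR implementation.

// Stand-in for the unused-variable idiom (hypothetical helper, not RAJA's
// implementation): a no-op variadic function "uses" its arguments, so the
// compiler stops warning about variables whose only real consumers are
// compiled out.
#include <iostream>

template <typename... Ts>
inline void unused_var(Ts&&...) noexcept {}

// Assumption: leaving this undefined mimics an OpenMP-target-only build.
// #define RUN_RAJA_SEQ

int main()
{
  const int kn = 42;  // setup value needed only by the compiled-out variants

#if defined(RUN_RAJA_SEQ)
  std::cout << "kn = " << kn << std::endl;
#else
  unused_var(kn);  // silences the warning without changing behavior
#endif
  return 0;
}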