From b153d6f6648cd188c1a2698c730a76300b3e62c2 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 19 Sep 2024 10:25:40 -0700 Subject: [PATCH 1/8] attempt at fixing build errors --- benchmark/CMakeLists.txt | 2 + benchmark/ltimes.cpp | 220 +++++++++++++-------------- scripts/lc-builds/blueos_nvcc_gcc.sh | 1 + scripts/lc-builds/toss4_amdclang.sh | 1 + 4 files changed, 114 insertions(+), 110 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 8ddeba454d..15d919c566 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -11,9 +11,11 @@ if (RAJA_ENABLE_CUDA) SOURCES host-device-lambda-benchmark.cpp) endif() +if (RAJA_ENABLE_DESUL) raja_add_benchmark( NAME benchmark-atomic SOURCES benchmark-atomic.cpp) +endif() raja_add_benchmark( NAME ltimes diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index b2fa413b8e..c85826eff8 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -88,18 +88,18 @@ extern "C" { * RAJA 'statement' concepts * * Note that calls to the checkResult() method after each variant is run - * are turned off so the example code runs much faster. If you want + * are turned off so the example code runs much faster. If you want * to verify the results are correct, define the 'DEBUG_LTIMES' macro * below or turn on checking for individual variants. */ - using namespace RAJA; +using namespace RAJA::expt; // -// Index value types for strongly-typed indices must be defined outside +// Index value types for strongly-typed indices must be defined outside // function scope for RAJA CUDA variants to work. 
// // These types provide strongly-typed index values so if something is wrong @@ -116,7 +116,7 @@ RAJA_INDEX_VALUE_T(IZ, int, "IZ"); // template void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, - const int num_m, + const int num_m, const int num_d, const int num_g, const int num_z); @@ -231,14 +231,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{1, 0}}; @@ -255,7 +255,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::Timer timer; - timer.start(); + timer.start(); for (int iter = 0;iter < num_iter;++ iter) for (IG g(0); g < num_g; ++g) { @@ -268,7 +268,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } - timer.stop(); + timer.stop(); double t = timer.elapsed(); double gflop_rate = total_flops / t / 1.0e9; std::cout << " C-version of LTimes run time (with Views) (sec.): " @@ -291,14 +291,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{1, 0}}; @@ -313,11 +313,11 @@ int main(int 
RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using EXECPOL = + using EXECPOL = RAJA::KernelPolicy< - statement::For<2, loop_exec, // g - statement::For<3, loop_exec, // z - statement::For<0, loop_exec, // m + statement::For<2, seq_exec, // g + statement::For<3, seq_exec, // z + statement::For<0, seq_exec, // m statement::For<1, simd_exec, // d statement::Lambda<0> > @@ -388,9 +388,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXECPOL = RAJA::KernelPolicy< - statement::For<2, loop_exec, // g - statement::For<3, loop_exec, // z - statement::For<0, loop_exec, // m + statement::For<2, seq_exec, // g + statement::For<3, seq_exec, // z + statement::For<0, seq_exec, // m statement::For<1, simd_exec, // d statement::Lambda<0, Segs<0, 1, 2, 3>> > @@ -462,10 +462,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using pol_launch = RAJA::LaunchPolicy; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; @@ -535,14 +535,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using vector_t = RAJA::VectorRegister; - using VecIZ = RAJA::VectorIndex; + using vector_t = RAJA::expt::VectorRegister; + using VecIZ = RAJA::expt::VectorIndex; using EXECPOL = RAJA::KernelPolicy< - statement::For<2, loop_exec, // g - statement::For<0, loop_exec, // m - statement::For<1, loop_exec, // d + statement::For<2, seq_exec, // g + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d statement::Lambda<0> > @@ -552,7 +552,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #ifdef 
RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::resetVectorStats(); + RAJA::expt::tensor_stats::resetVectorStats(); #endif @@ -622,9 +622,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::SquareMatrixRegister; - //using matrix_t = RAJA::SquareMatrixRegister; -// using matrix_t = RAJA::RectMatrixRegister; + using matrix_t = RAJA::expt::SquareMatrixRegister; + //using matrix_t = RAJA::expt::SquareMatrixRegister; +// using matrix_t = RAJA::expt::RectMatrixRegister; @@ -633,9 +633,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - using RowM = RAJA::RowIndex; - using ColD = RAJA::ColIndex; - using ColZ = RAJA::ColIndex; + using RowM = RAJA::expt::RowIndex; + using ColD = RAJA::expt::ColIndex; + using ColZ = RAJA::expt::ColIndex; #ifdef RAJA_ENABLE_VECTOR_STATS @@ -648,7 +648,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int iter = 0;iter < num_iter;++ iter){ - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), [=](IG g) { @@ -718,19 +718,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::SquareMatrixRegister; + using matrix_t = RAJA::expt::SquareMatrixRegister; std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" << matrix_t::s_dim_elem(1) << std::endl; - using RowM = RAJA::RowIndex; - using ColD = RAJA::ColIndex; - using ColZ = RAJA::ColIndex; + using RowM = RAJA::expt::RowIndex; + using ColD = RAJA::expt::ColIndex; + using ColZ = RAJA::expt::ColIndex; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::resetVectorStats(); + RAJA::expt::tensor_stats::resetVectorStats(); #endif RAJA::Timer timer; @@ -738,7 +738,7 @@ int 
main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int iter = 0;iter < num_iter;++ iter){ - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), [=](IG g) { @@ -820,13 +820,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) statement::InitLocalMem, // Tile outer m,d loops - statement::Tile<0, tile_fixed, loop_exec, // m - statement::Tile<1, tile_fixed, loop_exec, // d + statement::Tile<0, tile_fixed, seq_exec, // m + statement::Tile<1, tile_fixed, seq_exec, // d // Load L(m,d) for m,d tile into shmem - statement::For<0, loop_exec, // m - statement::For<1, loop_exec, // d + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d statement::Lambda<0, Segs<0, 1>, Params<0>, Offsets<0, 1>> @@ -834,13 +834,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >, // Run inner g, z loops with z loop tiled - statement::For<2, loop_exec, // g - statement::Tile<3, tile_fixed, loop_exec, // z + statement::For<2, seq_exec, // g + statement::Tile<3, tile_fixed, seq_exec, // z // Load psi into shmem - statement::For<1, loop_exec, // d - statement::For<3, loop_exec, // z + statement::For<1, seq_exec, // d + statement::For<3, seq_exec, // z statement::Lambda<1, Segs<1, 2, 3>, Params<1>, Offsets<1, 2, 3>> @@ -848,25 +848,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >, // Compute phi - statement::For<0, loop_exec, // m + statement::For<0, seq_exec, // m // Load phi into shmem - statement::For<3, loop_exec, // z + statement::For<3, seq_exec, // z statement::Lambda<2, Segs<0, 2, 3>, Params<2>, Offsets<0, 2, 3>> >, // Compute phi in shmem - statement::For<1, loop_exec, // d - statement::For<3, loop_exec, // z + statement::For<1, seq_exec, // d + statement::For<3, seq_exec, // z statement::Lambda<3, Params<0, 1, 2>, Offsets<0, 1, 2, 3>> > >, // Store phi - statement:: For<3, loop_exec, // z + statement:: For<3, seq_exec, // z 
statement::Lambda<4, Segs<0, 2, 3>, Params<2>, Offsets<0, 2, 3>> @@ -992,14 +992,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{0, 1}}; @@ -1019,9 +1019,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXECPOL = RAJA::KernelPolicy< statement::For<0, omp_parallel_for_exec, // m - statement::For<1, loop_exec, // d - statement::For<2, loop_exec, // g - statement::For<3, simd_exec, // z + statement::For<1, seq_exec, // d + statement::For<2, seq_exec, // g + statement::For<3, simd_exec, // z statement::Lambda<0> > > @@ -1037,7 +1037,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< statement::Collapse, // m, g, z - statement::For<1, loop_exec, // d + statement::For<1, seq_exec, // d statement::Lambda<0> > > @@ -1096,14 +1096,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{0, 1}}; @@ -1120,7 +1120,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXECPOL = 
RAJA::KernelPolicy< - statement::CudaKernelAsync< + statement::CudaKernelAsync< statement::For<0, cuda_block_x_loop, // m statement::For<2, cuda_block_y_loop, // g statement::For<3, cuda_thread_x_loop, // z @@ -1130,9 +1130,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > > > - > - >; - + > + >; + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), RAJA::TypedRangeSegment(0, num_g), @@ -1165,9 +1165,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaErrchk( cudaFree( dphi_data ) ); // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) checkResult(phi, L, psi, num_m, num_d, num_g, num_z); @@ -1201,10 +1201,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; // @@ -1325,22 +1325,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using matrix_layout = RowMajorLayout; - using L_matrix_host_t = RAJA::SquareMatrixRegister; - using L_matrix_device_t = RAJA::RectMatrixRegister; + using L_matrix_host_t = RAJA::expt::SquareMatrixRegister; + using L_matrix_device_t = RAJA::expt::RectMatrixRegister; using L_matrix_hd_t = RAJA::LaunchPolicy; - using phi_matrix_host_t = RAJA::SquareMatrixRegister; - using phi_matrix_device_t = RAJA::RectMatrixRegister; + using phi_matrix_host_t = RAJA::expt::SquareMatrixRegister; + using phi_matrix_device_t = RAJA::expt::RectMatrixRegister; using phi_matrix_hd_t = RAJA::LaunchPolicy; - using psi_matrix_host_t = RAJA::SquareMatrixRegister; - 
using psi_matrix_device_t = RAJA::RectMatrixRegister; + using psi_matrix_host_t = RAJA::expt::SquareMatrixRegister; + using psi_matrix_device_t = RAJA::expt::RectMatrixRegister; using psi_matrix_hd_t = RAJA::LaunchPolicy; using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; // @@ -1388,16 +1388,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); - using L_RowM = RAJA::RowIndex; - using L_ColD = RAJA::ColIndex; + using L_RowM = RAJA::expt::RowIndex; + using L_ColD = RAJA::expt::ColIndex; using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); - using psi_RowD = RAJA::RowIndex; - using psi_ColZ = RAJA::ColIndex; + using psi_RowD = RAJA::expt::RowIndex; + using psi_ColZ = RAJA::expt::ColIndex; using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); - using phi_RowM = RAJA::RowIndex; - using phi_ColZ = RAJA::ColIndex; + using phi_RowM = RAJA::expt::RowIndex; + using phi_ColZ = RAJA::expt::ColIndex; RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ @@ -1468,8 +1468,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; // psi(d, g, z) : 2 -> z is stride-1 dimension @@ -1530,11 +1530,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< statement::CudaKernelAsync< statement::InitLocalMem, - // Tile outer m,d loops + // Tile outer m,d loops statement::Tile<0, tile_fixed, seq_exec, // m statement::Tile<1, tile_fixed, seq_exec, // d - // Load L for m,d tile into shmem + // Load L for m,d tile into shmem statement::For<1, cuda_thread_x_loop, // d statement::For<0, cuda_thread_y_direct, // m statement::Lambda<0, Segs<0,1>, 
Params<0>, Offsets<0,1>> @@ -1571,7 +1571,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // d > // m >, // z - + // finish tile over directions statement::CudaSyncThreads, @@ -1582,7 +1582,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >, statement::CudaSyncThreads - + > // Tile z > // g @@ -1594,7 +1594,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >; // KernelPolicy - + RAJA::Timer timer; @@ -1671,9 +1671,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaMemcpyDeviceToHost ) ); // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif @@ -1874,11 +1874,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< statement::HipKernelAsync< statement::InitLocalMem, - // Tile outer m,d loops + // Tile outer m,d loops statement::Tile<0, tile_fixed, seq_exec, // m statement::Tile<1, tile_fixed, seq_exec, // d - // Load L for m,d tile into shmem + // Load L for m,d tile into shmem statement::For<1, hip_thread_x_loop, // d statement::For<0, hip_thread_y_direct, // m statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> @@ -1915,7 +1915,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // d > // m >, // z - + // finish tile over directions statement::HipSyncThreads, @@ -1926,7 +1926,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >, statement::HipSyncThreads - + > // Tile z > // g @@ -1983,7 +1983,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), + [=] RAJA_DEVICE (IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), 
shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, IM tm, ID td, IG tg, IZ tz) { @@ -2040,7 +2040,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // template void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, - const int num_m, + const int num_m, const int num_d, const int num_g, const int num_z) diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 7619c360d4..60e1fadcd0 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -44,6 +44,7 @@ cmake \ -C ../host-configs/lc-builds/blueos/nvcc_gcc_X.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DENABLE_BENCHMARKS=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 9e738fc781..cb5872c0c5 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -73,6 +73,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. From 887d3d3b744fababba3b5e61d185e2d5d1717c9d Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 27 Sep 2024 15:55:07 -0700 Subject: [PATCH 2/8] Add strongly typed index support for TensorIndex. 
--- include/RAJA/pattern/tensor/TensorIndex.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/RAJA/pattern/tensor/TensorIndex.hpp b/include/RAJA/pattern/tensor/TensorIndex.hpp index f992649876..379e13ae41 100644 --- a/include/RAJA/pattern/tensor/TensorIndex.hpp +++ b/include/RAJA/pattern/tensor/TensorIndex.hpp @@ -29,7 +29,7 @@ namespace expt { - template LENGTH_VALUE> + template INDEX_VALUE, strip_index_type_t LENGTH_VALUE> struct StaticTensorIndexInner; template @@ -56,8 +56,8 @@ namespace expt RAJA_HOST_DEVICE static constexpr - StaticTensorIndex> static_all(){ - return StaticTensorIndex>(); + StaticTensorIndex> static_all(){ + return StaticTensorIndex>(); } RAJA_INLINE @@ -103,7 +103,7 @@ namespace expt TensorIndex(TensorIndex const &c) : m_index(*c), m_length(c.size()) {} - template LEN_VAL> + template IDX_VAL, strip_index_type_t LEN_VAL> RAJA_INLINE RAJA_HOST_DEVICE constexpr @@ -156,7 +156,7 @@ namespace expt }; - template LENGTH_VALUE> + template INDEX_VALUE, strip_index_type_t LENGTH_VALUE> struct StaticTensorIndex> { using base_type = TensorIndex; From 5a8ee0650ece6218ad61b10f2e33cf1c07b9e5b8 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 27 Sep 2024 16:18:04 -0700 Subject: [PATCH 3/8] Fix allocating too much to std::vector. 
--- benchmark/ltimes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index c85826eff8..15a3e54eb8 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -142,7 +142,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const long num_z = 32 + (rand()/RAND_MAX); #else const int num_iter = 10 + (rand()/RAND_MAX); - const int num_z = 32*65536 + (rand()/RAND_MAX); + const int num_z = 32*657 + (rand()/RAND_MAX); #endif From 84d9c9f34685c9680affa99db57ad09af169adce Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Oct 2024 11:18:00 -0700 Subject: [PATCH 4/8] optional benchmark --- scripts/lc-builds/blueos_nvcc_gcc.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 60e1fadcd0..40a67205c1 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -9,22 +9,30 @@ if [[ $# -lt 3 ]]; then echo - echo "You must pass 3 arguments to the script (in this order): " + echo "You must pass at least 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)" echo " 3) compiler version number for gcc" + echo " 4) (optional) benchmarks On" echo echo "For example: " - echo " blueos_nvcc_gcc.sh 10.2.89 70 8.3.1" + echo " blueos_nvcc_gcc.sh 10.2.89 70 8.3.1 On" exit fi COMP_NVCC_VER=$1 COMP_ARCH=$2 COMP_GCC_VER=$3 -shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER} +if [[ $# -gt 3 ]]; then + BENCHMARKS=$4 + shift 4 + BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER}-benchmark_${BENCHMARKS} +else + shift 3 + BENCHMARKS=Off + BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER} +fi echo echo "Creating build directory build_${BUILD_SUFFIX} and generating 
configuration in it" @@ -44,7 +52,7 @@ cmake \ -C ../host-configs/lc-builds/blueos/nvcc_gcc_X.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ - -DENABLE_BENCHMARKS=On \ + -DENABLE_BENCHMARKS=${BENCHMARKS} \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ From 4d904622df5ac6b216923d5e3a0ca29452376f4e Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Oct 2024 14:01:00 -0700 Subject: [PATCH 5/8] turn on benchmarks --- scripts/lc-builds/blueos_clang.sh | 1 + scripts/lc-builds/blueos_clang_omptarget.sh | 1 + scripts/lc-builds/blueos_clangcuda.sh | 1 + scripts/lc-builds/blueos_gcc.sh | 1 + scripts/lc-builds/blueos_nvcc_clang.sh | 1 + scripts/lc-builds/blueos_nvcc_gcc.sh | 18 +++++------------- scripts/lc-builds/blueos_nvcc_xl.sh | 1 + scripts/lc-builds/blueos_pgi.sh | 1 + scripts/lc-builds/blueos_xl.sh | 1 + scripts/lc-builds/blueos_xl_omptarget.sh | 1 + scripts/lc-builds/corona_sycl.sh | 1 + scripts/lc-builds/toss4_amdclang_asan.sh | 1 + scripts/lc-builds/toss4_cce_hip.sh | 1 + scripts/lc-builds/toss4_clang.sh | 1 + scripts/lc-builds/toss4_clang_san.sh | 1 + scripts/lc-builds/toss4_gcc.sh | 1 + scripts/lc-builds/toss4_hipcc.sh | 1 + scripts/lc-builds/toss4_icpc-classic.sh | 1 + scripts/lc-builds/toss4_icpc.sh | 1 + scripts/lc-builds/toss4_icpx.sh | 1 + 20 files changed, 24 insertions(+), 13 deletions(-) diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index 7e0b6600d8..b511977e52 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -38,6 +38,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/clang_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index 154a1f764a..2f5ae90e87 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -42,6 +42,7 @@ cmake \ -DRAJA_ENABLE_TARGET_OPENMP=On \ -DBLT_OPENMP_COMPILE_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ -DBLT_OPENMP_LINK_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_clangcuda.sh b/scripts/lc-builds/blueos_clangcuda.sh index 16ac65b825..00e230fb43 100755 --- a/scripts/lc-builds/blueos_clangcuda.sh +++ b/scripts/lc-builds/blueos_clangcuda.sh @@ -50,6 +50,7 @@ cmake \ -DBLT_CLANG_CUDA_ARCH=${CUDA_ARCH} \ -DENABLE_CUDA=On \ -DCUDA_ARCH=${CUDA_ARCH} \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index 65bfb2c307..a87fd91fab 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/gcc_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index faf0c8938a..82f70ba38e 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -48,6 +48,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 40a67205c1..1b84321f1b 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -9,30 +9,22 @@ if [[ $# -lt 3 ]]; then echo - echo "You must pass at least 3 arguments to the script (in this order): " + echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)" echo " 3) compiler version number for gcc" - echo " 4) (optional) benchmarks On" echo echo "For example: " - echo " blueos_nvcc_gcc.sh 10.2.89 70 8.3.1 On" + echo " blueos_nvcc_gcc.sh 10.2.89 70 8.3.1" exit fi COMP_NVCC_VER=$1 COMP_ARCH=$2 COMP_GCC_VER=$3 +shift 3 -if [[ $# -gt 3 ]]; then - BENCHMARKS=$4 - shift 4 - BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER}-benchmark_${BENCHMARKS} -else - shift 3 - BENCHMARKS=Off - BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER} -fi +BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER} echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -52,10 +44,10 @@ cmake \ -C ../host-configs/lc-builds/blueos/nvcc_gcc_X.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ - -DENABLE_BENCHMARKS=${BENCHMARKS} \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index d0507d0eb1..142b3fbd40 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -47,6 +47,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index 16c9816d19..743f7afbb6 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -37,6 +37,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/pgi_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 58a13fc065..08e693023b 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/xl_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 3fd7e22ce8..cd7d9d67c6 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -39,6 +39,7 @@ cmake \ -DRAJA_ENABLE_TARGET_OPENMP=On \ -DBLT_OPENMP_COMPILE_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \ -DBLT_OPENMP_LINK_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 815928e434..cf0997df65 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -55,6 +55,7 @@ cmake \ -DBLT_CXX_STD=c++17 \ -DENABLE_TESTS=On \ -DENABLE_EXAMPLES=On \ + -DENABLE_BENCHMARKS=ON \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh index b22cdce3bd..e7fc4045e5 100755 --- a/scripts/lc-builds/toss4_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -83,6 +83,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh index 80db5f37be..826fadf5fa 100755 --- a/scripts/lc-builds/toss4_cce_hip.sh +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -57,6 +57,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index ab1cb69e81..d031b1ae7e 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/clang_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_clang_san.sh b/scripts/lc-builds/toss4_clang_san.sh index e7501c09cd..7d917f4c96 100755 --- a/scripts/lc-builds/toss4_clang_san.sh +++ b/scripts/lc-builds/toss4_clang_san.sh @@ -45,6 +45,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/clang_X_${SAN_VER}.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index 07fa26cc3f..9629cdd944 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/gcc_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index f7342e474c..0cec1abc19 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -74,6 +74,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh index 447cf35ac8..73481fd794 100755 --- a/scripts/lc-builds/toss4_icpc-classic.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -43,6 +43,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpc-classic_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh index 93d4d62cad..921a8dad54 100755 --- a/scripts/lc-builds/toss4_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -43,6 +43,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpc_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index d6fe0d867b..2f7f768a60 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -45,6 +45,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpx_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. From a8c6c3813e43fa3ae530df2d2e870b83118b421e Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Oct 2024 14:06:09 -0700 Subject: [PATCH 6/8] Use On instead of ON --- scripts/lc-builds/blueos_clang.sh | 2 +- scripts/lc-builds/blueos_clang_omptarget.sh | 2 +- scripts/lc-builds/blueos_clangcuda.sh | 2 +- scripts/lc-builds/blueos_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_clang.sh | 2 +- scripts/lc-builds/blueos_nvcc_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_xl.sh | 2 +- scripts/lc-builds/blueos_pgi.sh | 2 +- scripts/lc-builds/blueos_xl.sh | 2 +- scripts/lc-builds/blueos_xl_omptarget.sh | 2 +- scripts/lc-builds/corona_sycl.sh | 2 +- scripts/lc-builds/toss4_amdclang.sh | 2 +- scripts/lc-builds/toss4_amdclang_asan.sh | 2 +- scripts/lc-builds/toss4_cce_hip.sh | 2 +- scripts/lc-builds/toss4_clang.sh | 2 +- scripts/lc-builds/toss4_gcc.sh | 2 +- scripts/lc-builds/toss4_hipcc.sh | 2 +- scripts/lc-builds/toss4_icpc-classic.sh | 2 +- scripts/lc-builds/toss4_icpc.sh | 2 +- scripts/lc-builds/toss4_icpx.sh | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index b511977e52..99954b3eb8 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -38,7 +38,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/clang_X.cmake \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index 2f5ae90e87..977e0ec4a3 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -42,7 +42,7 @@ cmake \ -DRAJA_ENABLE_TARGET_OPENMP=On \ -DBLT_OPENMP_COMPILE_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ -DBLT_OPENMP_LINK_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_clangcuda.sh b/scripts/lc-builds/blueos_clangcuda.sh index 00e230fb43..98d2619546 100755 --- a/scripts/lc-builds/blueos_clangcuda.sh +++ b/scripts/lc-builds/blueos_clangcuda.sh @@ -50,7 +50,7 @@ cmake \ -DBLT_CLANG_CUDA_ARCH=${CUDA_ARCH} \ -DENABLE_CUDA=On \ -DCUDA_ARCH=${CUDA_ARCH} \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index a87fd91fab..f89ad3e6bf 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -36,7 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/gcc_X.cmake \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 82f70ba38e..ad846fa101 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -48,7 +48,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 1b84321f1b..141936aad8 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -47,7 +47,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 142b3fbd40..ffeea779ca 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -47,7 +47,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index 743f7afbb6..d8a6b904ec 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -37,7 +37,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/pgi_X.cmake \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 08e693023b..a39a0f4c93 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -36,7 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/xl_X.cmake \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index cd7d9d67c6..d331044a99 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -39,7 +39,7 @@ cmake \ -DRAJA_ENABLE_TARGET_OPENMP=On \ -DBLT_OPENMP_COMPILE_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \ -DBLT_OPENMP_LINK_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index cf0997df65..4b636e4da0 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -55,7 +55,7 @@ cmake \ -DBLT_CXX_STD=c++17 \ -DENABLE_TESTS=On \ -DENABLE_EXAMPLES=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index cb5872c0c5..ec7910e148 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -73,7 +73,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh index e7fc4045e5..dd9526a1d2 100755 --- a/scripts/lc-builds/toss4_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -83,7 +83,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh index 826fadf5fa..9d05fddca8 100755 --- a/scripts/lc-builds/toss4_cce_hip.sh +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -57,7 +57,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index d031b1ae7e..a1bf63ff89 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -36,7 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/clang_X.cmake \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index 9629cdd944..c73c5ba6e9 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -36,7 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/gcc_X.cmake \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index 0cec1abc19..5743692ba6 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -74,7 +74,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh index 73481fd794..a892d08ecf 100755 --- a/scripts/lc-builds/toss4_icpc-classic.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -43,7 +43,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpc-classic_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh index 921a8dad54..82dab8e8df 100755 --- a/scripts/lc-builds/toss4_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -43,7 +43,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpc_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index 2f7f768a60..88cc43d824 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -45,7 +45,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpx_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ - -DENABLE_BENCHMARKS=ON \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. From 907d4321a7bb66b173c9e27beab64fc7d6bd52d6 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Wed, 9 Oct 2024 14:45:46 -0700 Subject: [PATCH 7/8] Use value_type where it should be used. 
--- include/RAJA/pattern/tensor/TensorIndex.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/tensor/TensorIndex.hpp b/include/RAJA/pattern/tensor/TensorIndex.hpp index 379e13ae41..8f152d92ce 100644 --- a/include/RAJA/pattern/tensor/TensorIndex.hpp +++ b/include/RAJA/pattern/tensor/TensorIndex.hpp @@ -103,7 +103,7 @@ namespace expt TensorIndex(TensorIndex const &c) : m_index(*c), m_length(c.size()) {} - template<strip_index_type_t<IDX> IDX_VAL, strip_index_type_t<IDX> LEN_VAL> + template<value_type IDX_VAL, value_type LEN_VAL> RAJA_INLINE RAJA_HOST_DEVICE constexpr From 6385af16da3e35485927208d721905791d925775 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Wed, 9 Oct 2024 14:59:41 -0700 Subject: [PATCH 8/8] Turn on ltimes benchmark build. --- benchmark/CMakeLists.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 143d6b5b98..c52e040a76 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -17,10 +17,9 @@ raja_add_benchmark( SOURCES benchmark-atomic.cpp) endif() -#TODO: Fix build issues -#raja_add_benchmark( -# NAME ltimes -# SOURCES ltimes.cpp) +raja_add_benchmark( + NAME ltimes + SOURCES ltimes.cpp) raja_add_benchmark( NAME raja_view_blur