From f5bb9b8bb554bf5636e44a96d1abce5966392327 Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Mon, 2 Sep 2024 08:48:16 -0700
Subject: [PATCH 1/3] artv3/raja-view-slowdown

---
 examples/CMakeLists.txt         |   4 +
 examples/raja_view_slowdown.cpp | 145 ++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 examples/raja_view_slowdown.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 4dfd2fbc10..fd1aed62ba 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,6 +3,10 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
+raja_add_executable(
+  NAME raja_view_slowdown
+  SOURCES raja_view_slowdown.cpp)
+
 raja_add_executable(
   NAME tut_launch_basic
   SOURCES tut_launch_basic.cpp)
diff --git a/examples/raja_view_slowdown.cpp b/examples/raja_view_slowdown.cpp
new file mode 100644
index 0000000000..7b7dc8def3
--- /dev/null
+++ b/examples/raja_view_slowdown.cpp
@@ -0,0 +1,145 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <RAJA/RAJA.hpp>
+#include "RAJA/util/Timer.hpp"
+#include <iostream>
+
+int main() {
+
+  const int N = 10000;
+  const int K = 17;
+
+  auto timer = RAJA::Timer();
+
+  //launch to initialize the stream
+  RAJA::forall<RAJA::cuda_exec<256>>
+    (RAJA::RangeSegment(0,1), [=] __device__ (int i) {
+      printf("launch kernel\n");
+  });
+
+
+  int* array = new int[N * N];
+  int* array_copy = new int[N * N];
+
+  //big array, or image
+  for (int i = 0; i < N * N; ++i) {
+    array[i] = 1;
+    array_copy[i] = 1;
+  }
+
+  //small array that acts as the blur
+  int* kernel = new int[K * K];
+  for (int i = 0; i < K * K; ++i) {
+    kernel[i] = 2;
+  }
+
+  // copying to gpu
+  int* d_array;
+  int* d_array_copy;
+  int* d_kernel;
+  cudaMalloc((void**)&d_array, N * N * sizeof(int));
+  cudaMalloc((void**)&d_array_copy, N * N * sizeof(int));
+  cudaMalloc((void**)&d_kernel, K * K * sizeof(int));
+  cudaMemcpy(d_array, array, N * N * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_array_copy, array_copy, N * N * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel, K * K * sizeof(int), cudaMemcpyHostToDevice);
+
+
+  constexpr int DIM = 2;
+  RAJA::View<int, RAJA::Layout<DIM>> array_view(d_array, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> array_view_copy(d_array_copy, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> kernel_view(d_kernel, K, K);
+
+
+  using EXEC_POL5 = RAJA::KernelPolicy<
+    RAJA::statement::CudaKernelFixed<256,
+      RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
+        RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
+          RAJA::statement::Lambda<0>
+        >
+      >
+    >
+  >;
+
+  RAJA::RangeSegment range_i(0, N);
+  RAJA::RangeSegment range_j(0, N);
+
+
+timer.start();
+
+  RAJA::kernel<EXEC_POL5>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      //looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += kernel_view(m, n) * array_view(x, y);
+          }
+        }
+      }
+
+      array_view(i, j) += sum;
+    }
+  );
+
+timer.stop();
+
+std::cout<<"Elapsed time with RAJA view : "<<timer.elapsed()<<std::endl;
+
+timer.reset();
+timer.start();
+
+  RAJA::kernel<EXEC_POL5>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      // looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += d_kernel[m * K + n] * d_array_copy[x * N + y];
+          }
+        }
+      }
+
+      d_array_copy[i * N + j] += sum;
+    }
+  );
+
+timer.stop();
+std::cout<<"Elapsed time with NO RAJA view : "<<timer.elapsed()<<std::endl;
+
+cudaFree(d_array);
+cudaFree(d_array_copy);
+cudaFree(d_kernel);
+
+delete[] array;
+delete[] array_copy;
+delete[] kernel;
+
+return 0;
+}
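Note: what this patch measures hinges on a default two-dimensional RAJA::View computing exactly the linear offset (i * N + j) that the raw-pointer kernel spells out by hand, so any timing gap between the two kernels comes down to how well the compiler folds the Layout arithmetic. A minimal host-only sketch of that equivalence (illustrative only, not part of the patch):

    #include <RAJA/RAJA.hpp>
    #include <cassert>

    int main() {
      constexpr int N = 4;
      int data[N * N];
      for (int k = 0; k < N * N; ++k) { data[k] = k; }

      // A default RAJA::Layout is row-major: index 0 strides by N, index 1 by 1.
      RAJA::View<int, RAJA::Layout<2>> view(data, N, N);

      for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
          assert(view(i, j) == data[i * N + j]);
        }
      }
      return 0;
    }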
From: Arturo Vargas
Date: Tue, 17 Sep 2024 13:22:49 -0700
Subject: [PATCH 2/3] move raja_view perf test to benchmark folder

---
 benchmark/CMakeLists.txt        |   4 +
 benchmark/raja_view_blur.cpp    | 170 ++++++++++++++++++++++++++++++++
 examples/CMakeLists.txt         |   4 -
 examples/raja_view_slowdown.cpp | 145 ---------------------------
 4 files changed, 174 insertions(+), 149 deletions(-)
 create mode 100644 benchmark/raja_view_blur.cpp
 delete mode 100644 examples/raja_view_slowdown.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 8ddeba454d..e4ca0e1809 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -18,3 +18,7 @@ raja_add_benchmark(
 raja_add_benchmark(
   NAME ltimes
   SOURCES ltimes.cpp)
+
+raja_add_bench_mark(
+  NAME raja_view_blur
+  SOURCES raja_view_blur.cpp)
diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp
new file mode 100644
index 0000000000..c3db04b2dd
--- /dev/null
+++ b/benchmark/raja_view_blur.cpp
@@ -0,0 +1,170 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <RAJA/RAJA.hpp>
+#include "RAJA/util/Timer.hpp"
+#include <iostream>
+
+/*
+ * RAJA view performance test
+ * Kernel performs a 2D blur with a uniform stencil
+ *
+ */
+
+#if defined(RAJA_ENABLE_HIP)
+using device_pol = RAJA::hip_exec<256>;
+using device_resources = RAJA::resources::Hip;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::HipKernelFixed<256,
+    RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>,
+      RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>,
+        RAJA::statement::Lambda<0>
+      >
+    >
+  >
+>;
+#elif defined(RAJA_ENABLE_CUDA)
+using device_pol = RAJA::cuda_exec<256>;
+using device_resources = RAJA::resources::Cuda;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::CudaKernelFixed<256,
+    RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
+      RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
+        RAJA::statement::Lambda<0>
+      >
+    >
+  >
+>;
+#else
+using host_pol = RAJA::seq_exec;
+using device_resources = RAJA::resources::Host;
+#endif
+
+using host_resources = RAJA::resources::Host;
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  const int N = 10000;
+  const int K = 17;
+
+  device_resources def_device_res{device_resources::get_default()};
+  host_resources def_host_res{host_resources::get_default()};
+
+  auto timer = RAJA::Timer();
+
+  //launch to initialize the stream
+  RAJA::forall<device_pol>
+    (RAJA::RangeSegment(0,1), [=] RAJA_DEVICE (int i) {
+      printf(" \n");
+  });
+
+  int * array = def_host_res.allocate<int>(N * N);
+  int * array_copy = def_host_res.allocate<int>(N * N);
+
+  //big array, or image
+  for (int i = 0; i < N * N; ++i) {
+    array[i] = 1;
+    array_copy[i] = 1;
+  }
+
+  //small array that acts as the blur
+  //int* kernel = new int[K * K];
+  int * kernel = def_host_res.allocate<int>(K * K);
+  for (int i = 0; i < K * K; ++i) {
+    kernel[i] = 2;
+  }
+
+  // copying to gpu
+  int* d_array = def_device_res.allocate<int>(N * N);
+  int* d_array_copy = def_device_res.allocate<int>(N * N);
+  int* d_kernel = def_device_res.allocate<int>(K * K);
+
+  def_device_res.memcpy(d_array, array, N * N * sizeof(int));
+  def_device_res.memcpy(d_array_copy, array_copy, N * N * sizeof(int));
+  def_device_res.memcpy(d_kernel, kernel, K * K * sizeof(int));
+
+  constexpr int DIM = 2;
+  RAJA::View<int, RAJA::Layout<DIM>> array_view(d_array, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> array_view_copy(d_array_copy, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> kernel_view(d_kernel, K, K);
+
+  RAJA::RangeSegment range_i(0, N);
+  RAJA::RangeSegment range_j(0, N);
+
+  timer.start();
+
+  RAJA::kernel<kernel_pol>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      //looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += kernel_view(m, n) * array_view(x, y);
+          }
+        }
+      }
+
+      array_view(i, j) += sum;
+    }
+  );
+
+  timer.stop();
+
+  std::cout<<"Elapsed time with RAJA view : "<<timer.elapsed()<<std::endl;
+
+  timer.reset();
+  timer.start();
+
+  RAJA::kernel<kernel_pol>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      // looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += d_kernel[m * K + n] * d_array_copy[x * N + y];
+          }
+        }
+      }
+
+      d_array_copy[i * N + j] += sum;
+    }
+  );
+  timer.stop();
+
+  std::cout<<"Elapsed time with NO RAJA view : "<<timer.elapsed()<<std::endl;
+
+  def_device_res.deallocate(d_array);
+  def_device_res.deallocate(d_array_copy);
+  def_device_res.deallocate(d_kernel);
+
+  def_host_res.deallocate(array);
+  def_host_res.deallocate(array_copy);
+  def_host_res.deallocate(kernel);
+
+  return 0;
+}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index fd1aed62ba..4dfd2fbc10 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,10 +3,6 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
-raja_add_executable(
-  NAME raja_view_slowdown
-  SOURCES raja_view_slowdown.cpp)
-
 raja_add_executable(
   NAME tut_launch_basic
   SOURCES tut_launch_basic.cpp)
diff --git a/examples/raja_view_slowdown.cpp b/examples/raja_view_slowdown.cpp
deleted file mode 100644
index 7b7dc8def3..0000000000
--- a/examples/raja_view_slowdown.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
-// and RAJA project contributors. See the RAJA/LICENSE file for details.
-//
-// SPDX-License-Identifier: (BSD-3-Clause)
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-#include <RAJA/RAJA.hpp>
-#include "RAJA/util/Timer.hpp"
-#include <iostream>
-
-int main() {
-
-  const int N = 10000;
-  const int K = 17;
-
-  auto timer = RAJA::Timer();
-
-  //launch to initialize the stream
-  RAJA::forall<RAJA::cuda_exec<256>>
-    (RAJA::RangeSegment(0,1), [=] __device__ (int i) {
-      printf("launch kernel\n");
-  });
-
-
-  int* array = new int[N * N];
-  int* array_copy = new int[N * N];
-
-  //big array, or image
-  for (int i = 0; i < N * N; ++i) {
-    array[i] = 1;
-    array_copy[i] = 1;
-  }
-
-  //small array that acts as the blur
-  int* kernel = new int[K * K];
-  for (int i = 0; i < K * K; ++i) {
-    kernel[i] = 2;
-  }
-
-  // copying to gpu
-  int* d_array;
-  int* d_array_copy;
-  int* d_kernel;
-  cudaMalloc((void**)&d_array, N * N * sizeof(int));
-  cudaMalloc((void**)&d_array_copy, N * N * sizeof(int));
-  cudaMalloc((void**)&d_kernel, K * K * sizeof(int));
-  cudaMemcpy(d_array, array, N * N * sizeof(int), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_array_copy, array_copy, N * N * sizeof(int), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel, K * K * sizeof(int), cudaMemcpyHostToDevice);
-
-
-  constexpr int DIM = 2;
-  RAJA::View<int, RAJA::Layout<DIM>> array_view(d_array, N, N);
-  RAJA::View<int, RAJA::Layout<DIM>> array_view_copy(d_array_copy, N, N);
-  RAJA::View<int, RAJA::Layout<DIM>> kernel_view(d_kernel, K, K);
-
-
-  using EXEC_POL5 = RAJA::KernelPolicy<
-    RAJA::statement::CudaKernelFixed<256,
-      RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
-        RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
-          RAJA::statement::Lambda<0>
-        >
-      >
-    >
-  >;
-
-  RAJA::RangeSegment range_i(0, N);
-  RAJA::RangeSegment range_j(0, N);
-
-
-timer.start();
-
-  RAJA::kernel<EXEC_POL5>
-    (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
-      int sum = 0;
-
-      //looping through the "blur"
-      for (int m = 0; m < K; ++m) {
-        for (int n = 0; n < K; ++n) {
-          int x = i + m;
-          int y = j + n;
-
-          // adding the "blur" to the "image" wherever the blur is located on the image
-          if (x < N && y < N) {
-            sum += kernel_view(m, n) * array_view(x, y);
-          }
-        }
-      }
-
-      array_view(i, j) += sum;
-    }
-  );
-
-timer.stop();
-
-std::cout<<"Elapsed time with RAJA view : "<<timer.elapsed()<<std::endl;
-
-timer.reset();
-timer.start();
-
-  RAJA::kernel<EXEC_POL5>
-    (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
-      int sum = 0;
-
-      // looping through the "blur"
-      for (int m = 0; m < K; ++m) {
-        for (int n = 0; n < K; ++n) {
-          int x = i + m;
-          int y = j + n;
-
-          // adding the "blur" to the "image" wherever the blur is located on the image
-          if (x < N && y < N) {
-            sum += d_kernel[m * K + n] * d_array_copy[x * N + y];
-          }
-        }
-      }
-
-      d_array_copy[i * N + j] += sum;
-    }
-  );
-
-timer.stop();
-std::cout<<"Elapsed time with NO RAJA view : "<<timer.elapsed()<<std::endl;
-
-cudaFree(d_array);
-cudaFree(d_array_copy);
-cudaFree(d_kernel);
-
-delete[] array;
-delete[] array_copy;
-delete[] kernel;
-
-return 0;
-}
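Note: the benchmark prints timings for the view and raw-pointer kernels but never checks that they produce the same image. A host-only sketch of such a cross-check, shrunk to a small problem size (names and sizes here are illustrative; the benchmark itself uses N = 10000 and K = 17):

    #include <RAJA/RAJA.hpp>
    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main() {
      constexpr int N = 16, K = 3;
      std::vector<int> a(N * N, 1), b(N * N, 1), blur(K * K, 2);

      RAJA::View<int, RAJA::Layout<2>> a_view(a.data(), N, N);
      RAJA::View<int, RAJA::Layout<2>> blur_view(blur.data(), K, K);

      // Sequential replay of both kernels: views index `a`, raw offsets index `b`.
      for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
          int sum_view = 0, sum_raw = 0;
          for (int m = 0; m < K; ++m) {
            for (int n = 0; n < K; ++n) {
              int x = i + m, y = j + n;
              if (x < N && y < N) {
                sum_view += blur_view(m, n) * a_view(x, y);
                sum_raw  += blur[m * K + n] * b[x * N + y];
              }
            }
          }
          a_view(i, j) += sum_view;
          b[i * N + j] += sum_raw;
        }
      }

      bool ok = std::equal(a.begin(), a.end(), b.begin());
      std::cout << (ok ? "view and raw variants agree" : "variants DIFFER") << std::endl;
      return ok ? 0 : 1;
    }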
From: Arturo Vargas
Date: Thu, 26 Sep 2024 18:08:30 -0700
Subject: [PATCH 3/3] clean up pass, add other variants

---
 benchmark/CMakeLists.txt     |  9 ++--
 benchmark/raja_view_blur.cpp | 82 ++++++++++++++++++++++++++++--------
 2 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 82f3e10b13..143d6b5b98 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -17,10 +17,11 @@ raja_add_benchmark(
   SOURCES benchmark-atomic.cpp)
 endif()
 
-raja_add_benchmark(
-  NAME ltimes
-  SOURCES ltimes.cpp)
+#TODO: Fix build issues
+#raja_add_benchmark(
+#  NAME ltimes
+#  SOURCES ltimes.cpp)
 
-raja_add_bench_mark(
+raja_add_benchmark(
   NAME raja_view_blur
   SOURCES raja_view_blur.cpp)
diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp
index c3db04b2dd..331d6c51dd 100644
--- a/benchmark/raja_view_blur.cpp
+++ b/benchmark/raja_view_blur.cpp
@@ -15,7 +15,19 @@
  *
  */
 
-#if defined(RAJA_ENABLE_HIP)
+//Uncomment to specify variant
+//#define RUN_HIP_VARIANT
+//#define RUN_CUDA_VARIANT
+//#define RUN_SYCL_VARIANT
+//#define RUN_OPENMP_VARIANT
+#define RUN_SEQ_VARIANT
+
+
+using host_pol = RAJA::seq_exec;
+using host_resources = RAJA::resources::Host;
+
+
+#if defined(RAJA_ENABLE_HIP) && defined(RUN_HIP_VARIANT)
 using device_pol = RAJA::hip_exec<256>;
 using device_resources = RAJA::resources::Hip;
 
@@ -28,7 +40,9 @@ using kernel_pol = RAJA::KernelPolicy<
       >
     >
   >
 >;
-#elif defined(RAJA_ENABLE_CUDA)
+#endif
+
+#if defined(RAJA_ENABLE_CUDA) && defined(RUN_CUDA_VARIANT)
 using device_pol = RAJA::cuda_exec<256>;
 using device_resources = RAJA::resources::Cuda;
 
@@ -41,33 +55,68 @@ using kernel_pol = RAJA::KernelPolicy<
       >
     >
   >
 >;
-#else
-using host_pol = RAJA::seq_exec;
+#endif
+
+#if defined(RAJA_ENABLE_SYCL) && defined(RUN_SYCL_VARIANT)
+using device_pol = RAJA::sycl_exec<256>;
+using device_resources = RAJA::resources::Sycl;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::SyclKernel<
+    RAJA::statement::For<1, RAJA::sycl_global_item_1,
+      RAJA::statement::For<0, RAJA::sycl_global_item_2,
+        RAJA::statement::Lambda<0>
+      >
+    >
+  >
+>;
+#endif
+
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP_VARIANT)
+using device_pol = RAJA::omp_parallel_for_exec;
 using device_resources = RAJA::resources::Host;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
+    RAJA::statement::For<0, RAJA::seq_exec,
+      RAJA::statement::Lambda<0>
+    >
+  >
+>;
 #endif
 
-using host_resources = RAJA::resources::Host;
+#if defined(RUN_SEQ_VARIANT)
+using device_pol = RAJA::seq_exec;
+using device_resources = RAJA::resources::Host;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::For<1, RAJA::seq_exec,
+    RAJA::statement::For<0, RAJA::seq_exec,
+      RAJA::statement::Lambda<0>
+    >
+  >
+>;
+#endif
 
 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 {
 
   const int N = 10000;
   const int K = 17;
-  
+
   device_resources def_device_res{device_resources::get_default()};
   host_resources def_host_res{host_resources::get_default()};
-  
+
   auto timer = RAJA::Timer();
-  
+
   //launch to initialize the stream
   RAJA::forall<device_pol>
-    (RAJA::RangeSegment(0,1), [=] RAJA_DEVICE (int i) {
-      printf(" \n");
+    (RAJA::RangeSegment(0,1), [=] RAJA_HOST_DEVICE (int i) {
   });
 
   int * array = def_host_res.allocate<int>(N * N);
   int * array_copy = def_host_res.allocate<int>(N * N);
-  
+
   //big array, or image
   for (int i = 0; i < N * N; ++i) {
     array[i] = 1;
@@ -75,7 +124,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   }
 
   //small array that acts as the blur
-  //int* kernel = new int[K * K];
   int * kernel = def_host_res.allocate<int>(K * K);
   for (int i = 0; i < K * K; ++i) {
     kernel[i] = 2;
@@ -102,7 +150,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   RAJA::kernel<kernel_pol>
     (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
+     [=] RAJA_HOST_DEVICE (int i, int j) {
       int sum = 0;
 
       //looping through the "blur"
@@ -129,10 +177,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   timer.reset();
   timer.start();
-  
+
   RAJA::kernel<kernel_pol>
     (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
+     [=] RAJA_HOST_DEVICE (int i, int j) {
       int sum = 0;
 
       // looping through the "blur"
@@ -161,10 +209,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   def_device_res.deallocate(d_array);
   def_device_res.deallocate(d_array_copy);
   def_device_res.deallocate(d_kernel);
-  
+
   def_host_res.deallocate(array);
   def_host_res.deallocate(array_copy);
   def_host_res.deallocate(kernel);
-  
+
   return 0;
 }
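Note: the lasting change in this series is that backend-specific cudaMalloc/cudaMemcpy calls are replaced by the RAJA resources API, which is what lets one source file serve the HIP, CUDA, SYCL, OpenMP, and sequential variants. A minimal sketch of that allocate/memcpy/deallocate lifecycle against the host resource (device resources expose the same calls; the snippet mirrors the patch but is illustrative only):

    #include <RAJA/RAJA.hpp>
    #include <cassert>

    int main() {
      RAJA::resources::Host host{RAJA::resources::Host::get_default()};

      int* src = host.allocate<int>(8);
      int* dst = host.allocate<int>(8);
      for (int i = 0; i < 8; ++i) { src[i] = i; }

      // Same call shape as the def_device_res.memcpy(...) copies in the patch.
      host.memcpy(dst, src, 8 * sizeof(int));
      assert(dst[7] == 7);

      host.deallocate(src);
      host.deallocate(dst);
      return 0;
    }

Since the RUN_*_VARIANT switches are plain preprocessor symbols, a variant could presumably also be selected from the build line (e.g. -DRUN_OPENMP_VARIANT) once the in-source #define RUN_SEQ_VARIANT default is removed.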