From f5bb9b8bb554bf5636e44a96d1abce5966392327 Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Mon, 2 Sep 2024 08:48:16 -0700
Subject: [PATCH 1/3] artv3/raja-view-slowdown

---
 examples/CMakeLists.txt         |   4 +
 examples/raja_view_slowdown.cpp | 145 ++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 examples/raja_view_slowdown.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 4dfd2fbc10..fd1aed62ba 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,6 +3,10 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
+raja_add_executable(
+  NAME raja_view_slowdown
+  SOURCES raja_view_slowdown.cpp)
+
 raja_add_executable(
   NAME tut_launch_basic
   SOURCES tut_launch_basic.cpp)
diff --git a/examples/raja_view_slowdown.cpp b/examples/raja_view_slowdown.cpp
new file mode 100644
index 0000000000..7b7dc8def3
--- /dev/null
+++ b/examples/raja_view_slowdown.cpp
@@ -0,0 +1,145 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <RAJA/RAJA.hpp>
+#include "RAJA/util/Timer.hpp"
+#include <iostream>
+
+int main() {
+
+  const int N = 10000;
+  const int K = 17;
+
+  auto timer = RAJA::Timer();
+
+  //launch to initialize the stream
+  RAJA::forall<RAJA::cuda_exec<256>>
+    (RAJA::RangeSegment(0,1), [=] __device__ (int i) {
+      printf("launch kernel\n");
+  });
+
+
+  int* array = new int[N * N];
+  int* array_copy = new int[N * N];
+
+  //big array, or image
+  for (int i = 0; i < N * N; ++i) {
+    array[i] = 1;
+    array_copy[i] = 1;
+  }
+
+  //small array that acts as the blur
+  int* kernel = new int[K * K];
+  for (int i = 0; i < K * K; ++i) {
+    kernel[i] = 2;
+  }
+
+  // copying to gpu
+  int* d_array;
+  int* d_array_copy;
+  int* d_kernel;
+  cudaMalloc((void**)&d_array, N * N * sizeof(int));
+  cudaMalloc((void**)&d_array_copy, N * N * sizeof(int));
+  cudaMalloc((void**)&d_kernel, K * K * sizeof(int));
+  cudaMemcpy(d_array, array, N * N * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_array_copy, array_copy, N * N * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel, K * K * sizeof(int), cudaMemcpyHostToDevice);
+
+
+  constexpr int DIM = 2;
+  RAJA::View<int, RAJA::Layout<DIM>> array_view(d_array, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> array_view_copy(d_array_copy, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> kernel_view(d_kernel, K, K);
+
+
+  using EXEC_POL5 = RAJA::KernelPolicy<
+    RAJA::statement::CudaKernelFixed<256,
+      RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
+        RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
+          RAJA::statement::Lambda<0>
+        >
+      >
+    >
+  >;
+
+  RAJA::RangeSegment range_i(0, N);
+  RAJA::RangeSegment range_j(0, N);
+
+
+timer.start();
+
+  RAJA::kernel<EXEC_POL5>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      //looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += kernel_view(m, n) * array_view(x, y);
+          }
+        }
+      }
+
+      array_view(i, j) += sum;
+    }
+  );
+
+timer.stop();
+
+std::cout<<"Elapsed time with RAJA view : "<<timer.elapsed()<<std::endl;
+
+timer.reset();
+timer.start();
+
+  RAJA::kernel<EXEC_POL5>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      // looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += d_kernel[m * K + n] * d_array_copy[x * N + y];
+          }
+        }
+      }
+
+      d_array_copy[i * N + j] += sum;
+    }
+  );
+
+timer.stop();
+std::cout<<"Elapsed time with NO RAJA view : "<<timer.elapsed()<<std::endl;
+
+cudaFree(d_array);
+cudaFree(d_array_copy);
+cudaFree(d_kernel);
+
+delete[] array;
+delete[] array_copy;
+delete[] kernel;
+
+return 0;
+}
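Note: what this patch measures hinges on a default two-dimensional RAJA::View computing exactly the linear offset (i * N + j) that the raw-pointer kernel spells out by hand, so any timing gap between the two kernels comes down to how well the compiler folds the Layout arithmetic. A minimal host-only sketch of that equivalence (illustrative only, not part of the patch):

    #include <RAJA/RAJA.hpp>
    #include <cassert>

    int main() {
      constexpr int N = 4;
      int data[N * N];
      for (int k = 0; k < N * N; ++k) { data[k] = k; }

      // A default RAJA::Layout is row-major: index 0 strides by N, index 1 by 1.
      RAJA::View<int, RAJA::Layout<2>> view(data, N, N);

      for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
          assert(view(i, j) == data[i * N + j]);
        }
      }
      return 0;
    }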
From: Arturo Vargas
Date: Tue, 17 Sep 2024 13:22:49 -0700
Subject: [PATCH 2/3] move raja_view perf test to benchmark folder

---
 benchmark/CMakeLists.txt        |   4 +
 benchmark/raja_view_blur.cpp    | 170 ++++++++++++++++++++++++++++++++
 examples/CMakeLists.txt         |   4 -
 examples/raja_view_slowdown.cpp | 145 ---------------------------
 4 files changed, 174 insertions(+), 149 deletions(-)
 create mode 100644 benchmark/raja_view_blur.cpp
 delete mode 100644 examples/raja_view_slowdown.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 8ddeba454d..e4ca0e1809 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -18,3 +18,7 @@ raja_add_benchmark(
 raja_add_benchmark(
   NAME ltimes
   SOURCES ltimes.cpp)
+
+raja_add_bench_mark(
+  NAME raja_view_blur
+  SOURCES raja_view_blur.cpp)
diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp
new file mode 100644
index 0000000000..c3db04b2dd
--- /dev/null
+++ b/benchmark/raja_view_blur.cpp
@@ -0,0 +1,170 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <RAJA/RAJA.hpp>
+#include "RAJA/util/Timer.hpp"
+#include <iostream>
+
+/*
+ * RAJA view performance test
+ * Kernel performs a 2D blur with a uniform stencil
+ *
+ */
+
+#if defined(RAJA_ENABLE_HIP)
+using device_pol = RAJA::hip_exec<256>;
+using device_resources = RAJA::resources::Hip;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::HipKernelFixed<256,
+    RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>,
+      RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>,
+        RAJA::statement::Lambda<0>
+      >
+    >
+  >
+>;
+#elif defined(RAJA_ENABLE_CUDA)
+using device_pol = RAJA::cuda_exec<256>;
+using device_resources = RAJA::resources::Cuda;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::CudaKernelFixed<256,
+    RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
+      RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
+        RAJA::statement::Lambda<0>
+      >
+    >
+  >
+>;
+#else
+using host_pol = RAJA::seq_exec;
+using device_resources = RAJA::resources::Host;
+#endif
+
+using host_resources = RAJA::resources::Host;
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  const int N = 10000;
+  const int K = 17;
+
+  device_resources def_device_res{device_resources::get_default()};
+  host_resources def_host_res{host_resources::get_default()};
+
+  auto timer = RAJA::Timer();
+
+  //launch to initialize the stream
+  RAJA::forall<device_pol>
+    (RAJA::RangeSegment(0,1), [=] RAJA_DEVICE (int i) {
+      printf(" \n");
+  });
+
+  int * array = def_host_res.allocate<int>(N * N);
+  int * array_copy = def_host_res.allocate<int>(N * N);
+
+  //big array, or image
+  for (int i = 0; i < N * N; ++i) {
+    array[i] = 1;
+    array_copy[i] = 1;
+  }
+
+  //small array that acts as the blur
+  //int* kernel = new int[K * K];
+  int * kernel = def_host_res.allocate<int>(K * K);
+  for (int i = 0; i < K * K; ++i) {
+    kernel[i] = 2;
+  }
+
+  // copying to gpu
+  int* d_array = def_device_res.allocate<int>(N * N);
+  int* d_array_copy = def_device_res.allocate<int>(N * N);
+  int* d_kernel = def_device_res.allocate<int>(K * K);
+
+  def_device_res.memcpy(d_array, array, N * N * sizeof(int));
+  def_device_res.memcpy(d_array_copy, array_copy, N * N * sizeof(int));
+  def_device_res.memcpy(d_kernel, kernel, K * K * sizeof(int));
+
+  constexpr int DIM = 2;
+  RAJA::View<int, RAJA::Layout<DIM>> array_view(d_array, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> array_view_copy(d_array_copy, N, N);
+  RAJA::View<int, RAJA::Layout<DIM>> kernel_view(d_kernel, K, K);
+
+  RAJA::RangeSegment range_i(0, N);
+  RAJA::RangeSegment range_j(0, N);
+
+  timer.start();
+
+  RAJA::kernel<kernel_pol>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      //looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += kernel_view(m, n) * array_view(x, y);
+          }
+        }
+      }
+
+      array_view(i, j) += sum;
+    }
+  );
+
+  timer.stop();
+
+  std::cout<<"Elapsed time with RAJA view : "<<timer.elapsed()<<std::endl;
+
+  timer.reset();
+  timer.start();
+
+  RAJA::kernel<kernel_pol>
+    (RAJA::make_tuple(range_i, range_j),
+     [=] RAJA_DEVICE (int i, int j) {
+      int sum = 0;
+
+      // looping through the "blur"
+      for (int m = 0; m < K; ++m) {
+        for (int n = 0; n < K; ++n) {
+          int x = i + m;
+          int y = j + n;
+
+          // adding the "blur" to the "image" wherever the blur is located on the image
+          if (x < N && y < N) {
+            sum += d_kernel[m * K + n] * d_array_copy[x * N + y];
+          }
+        }
+      }
+
+      d_array_copy[i * N + j] += sum;
+    }
+  );
+  timer.stop();
+
+  std::cout<<"Elapsed time with NO RAJA view : "<<timer.elapsed()<<std::endl;
+
+  def_device_res.deallocate(d_array);
+  def_device_res.deallocate(d_array_copy);
+  def_device_res.deallocate(d_kernel);
+
+  def_host_res.deallocate(array);
+  def_host_res.deallocate(array_copy);
+  def_host_res.deallocate(kernel);
+
+  return 0;
+}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index fd1aed62ba..4dfd2fbc10 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,10 +3,6 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
-raja_add_executable(
-  NAME raja_view_slowdown
-  SOURCES raja_view_slowdown.cpp)
-
 raja_add_executable(
   NAME tut_launch_basic
   SOURCES tut_launch_basic.cpp)
diff --git a/examples/raja_view_slowdown.cpp b/examples/raja_view_slowdown.cpp
deleted file mode 100644
index 7b7dc8def3..0000000000
--- a/examples/raja_view_slowdown.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
-// and RAJA project contributors. See the RAJA/LICENSE file for details.
-//
-// SPDX-License-Identifier: (BSD-3-Clause)
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-#include <RAJA/RAJA.hpp>
-#include "RAJA/util/Timer.hpp"
-#include <iostream>
-
-int main() {
-
-  const int N = 10000;
-  const int K = 17;
-
-  auto timer = RAJA::Timer();
-
-  //launch to initialize the stream
-  RAJA::forall<RAJA::cuda_exec<256>>
-    (RAJA::RangeSegment(0,1), [=] __device__ (int i) {
-      printf("launch kernel\n");
-  });
-
-
-  int* array = new int[N * N];
-  int* array_copy = new int[N * N];
-
-  //big array, or image
-  for (int i = 0; i < N * N; ++i) {
-    array[i] = 1;
-    array_copy[i] = 1;
-  }
-
-  //small array that acts as the blur
-  int* kernel = new int[K * K];
-  for (int i = 0; i < K * K; ++i) {
-    kernel[i] = 2;
-  }
-
-  // copying to gpu
-  int* d_array;
-  int* d_array_copy;
-  int* d_kernel;
-  cudaMalloc((void**)&d_array, N * N * sizeof(int));
-  cudaMalloc((void**)&d_array_copy, N * N * sizeof(int));
-  cudaMalloc((void**)&d_kernel, K * K * sizeof(int));
-  cudaMemcpy(d_array, array, N * N * sizeof(int), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_array_copy, array_copy, N * N * sizeof(int), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel, K * K * sizeof(int), cudaMemcpyHostToDevice);
-
-
-  constexpr int DIM = 2;
-  RAJA::View<int, RAJA::Layout<DIM>> array_view(d_array, N, N);
-  RAJA::View<int, RAJA::Layout<DIM>> array_view_copy(d_array_copy, N, N);
-  RAJA::View<int, RAJA::Layout<DIM>> kernel_view(d_kernel, K, K);
-
-
-  using EXEC_POL5 = RAJA::KernelPolicy<
-    RAJA::statement::CudaKernelFixed<256,
-      RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
-        RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
-          RAJA::statement::Lambda<0>
-        >
-      >
-    >
-  >;
-
-  RAJA::RangeSegment range_i(0, N);
-  RAJA::RangeSegment range_j(0, N);
-
-
-timer.start();
-
-  RAJA::kernel<EXEC_POL5>
-    (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
-      int sum = 0;
-
-      //looping through the "blur"
-      for (int m = 0; m < K; ++m) {
-        for (int n = 0; n < K; ++n) {
-          int x = i + m;
-          int y = j + n;
-
-          // adding the "blur" to the "image" wherever the blur is located on the image
-          if (x < N && y < N) {
-            sum += kernel_view(m, n) * array_view(x, y);
-          }
-        }
-      }
-
-      array_view(i, j) += sum;
-    }
-  );
-
-timer.stop();
-
-std::cout<<"Elapsed time with RAJA view : "<<timer.elapsed()<<std::endl;
-
-timer.reset();
-timer.start();
-
-  RAJA::kernel<EXEC_POL5>
-    (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
-      int sum = 0;
-
-      // looping through the "blur"
-      for (int m = 0; m < K; ++m) {
-        for (int n = 0; n < K; ++n) {
-          int x = i + m;
-          int y = j + n;
-
-          // adding the "blur" to the "image" wherever the blur is located on the image
-          if (x < N && y < N) {
-            sum += d_kernel[m * K + n] * d_array_copy[x * N + y];
-          }
-        }
-      }
-
-      d_array_copy[i * N + j] += sum;
-    }
-  );
-
-timer.stop();
-std::cout<<"Elapsed time with NO RAJA view : "<<timer.elapsed()<<std::endl;
-
-cudaFree(d_array);
-cudaFree(d_array_copy);
-cudaFree(d_kernel);
-
-delete[] array;
-delete[] array_copy;
-delete[] kernel;
-
-return 0;
-}
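Note: the benchmark prints timings for the view and raw-pointer kernels but never checks that they produce the same image. A host-only sketch of such a cross-check, shrunk to a small problem size (names and sizes here are illustrative; the benchmark itself uses N = 10000 and K = 17):

    #include <RAJA/RAJA.hpp>
    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main() {
      constexpr int N = 16, K = 3;
      std::vector<int> a(N * N, 1), b(N * N, 1), blur(K * K, 2);

      RAJA::View<int, RAJA::Layout<2>> a_view(a.data(), N, N);
      RAJA::View<int, RAJA::Layout<2>> blur_view(blur.data(), K, K);

      // Sequential replay of both kernels: views index `a`, raw offsets index `b`.
      for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
          int sum_view = 0, sum_raw = 0;
          for (int m = 0; m < K; ++m) {
            for (int n = 0; n < K; ++n) {
              int x = i + m, y = j + n;
              if (x < N && y < N) {
                sum_view += blur_view(m, n) * a_view(x, y);
                sum_raw  += blur[m * K + n] * b[x * N + y];
              }
            }
          }
          a_view(i, j) += sum_view;
          b[i * N + j] += sum_raw;
        }
      }

      bool ok = std::equal(a.begin(), a.end(), b.begin());
      std::cout << (ok ? "view and raw variants agree" : "variants DIFFER") << std::endl;
      return ok ? 0 : 1;
    }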
From: Arturo Vargas
Date: Thu, 26 Sep 2024 18:08:30 -0700
Subject: [PATCH 3/3] clean up pass, add other variants

---
 benchmark/CMakeLists.txt     |  9 ++--
 benchmark/raja_view_blur.cpp | 82 ++++++++++++++++++++++++++++--------
 2 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 82f3e10b13..143d6b5b98 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -17,10 +17,11 @@ raja_add_benchmark(
   SOURCES benchmark-atomic.cpp)
 endif()
 
-raja_add_benchmark(
-  NAME ltimes
-  SOURCES ltimes.cpp)
+#TODO: Fix build issues
+#raja_add_benchmark(
+#  NAME ltimes
+#  SOURCES ltimes.cpp)
 
-raja_add_bench_mark(
+raja_add_benchmark(
   NAME raja_view_blur
   SOURCES raja_view_blur.cpp)
diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp
index c3db04b2dd..331d6c51dd 100644
--- a/benchmark/raja_view_blur.cpp
+++ b/benchmark/raja_view_blur.cpp
@@ -15,7 +15,19 @@
  *
  */
 
-#if defined(RAJA_ENABLE_HIP)
+//Uncomment to specify variant
+//#define RUN_HIP_VARIANT
+//#define RUN_CUDA_VARIANT
+//#define RUN_SYCL_VARIANT
+//#define RUN_OPENMP_VARIANT
+#define RUN_SEQ_VARIANT
+
+
+using host_pol = RAJA::seq_exec;
+using host_resources = RAJA::resources::Host;
+
+
+#if defined(RAJA_ENABLE_HIP) && defined(RUN_HIP_VARIANT)
 using device_pol = RAJA::hip_exec<256>;
 using device_resources = RAJA::resources::Hip;
 
@@ -28,7 +40,9 @@ using kernel_pol = RAJA::KernelPolicy<
       >
     >
   >
 >;
-#elif defined(RAJA_ENABLE_CUDA)
+#endif
+
+#if defined(RAJA_ENABLE_CUDA) && defined(RUN_CUDA_VARIANT)
 using device_pol = RAJA::cuda_exec<256>;
 using device_resources = RAJA::resources::Cuda;
 
@@ -41,33 +55,68 @@ using kernel_pol = RAJA::KernelPolicy<
       >
     >
   >
 >;
-#else
-using host_pol = RAJA::seq_exec;
+#endif
+
+#if defined(RAJA_ENABLE_SYCL) && defined(RUN_SYCL_VARIANT)
+using device_pol = RAJA::sycl_exec<256>;
+using device_resources = RAJA::resources::Sycl;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::SyclKernel<
+    RAJA::statement::For<1, RAJA::sycl_global_item_1,
+      RAJA::statement::For<0, RAJA::sycl_global_item_2,
+        RAJA::statement::Lambda<0>
+      >
+    >
+  >
+>;
+#endif
+
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP_VARIANT)
+using device_pol = RAJA::omp_parallel_for_exec;
 using device_resources = RAJA::resources::Host;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
+    RAJA::statement::For<0, RAJA::seq_exec,
+      RAJA::statement::Lambda<0>
+    >
+  >
+>;
 #endif
 
-using host_resources = RAJA::resources::Host;
+#if defined(RUN_SEQ_VARIANT)
+using device_pol = RAJA::seq_exec;
+using device_resources = RAJA::resources::Host;
+
+using kernel_pol = RAJA::KernelPolicy<
+  RAJA::statement::For<1, RAJA::seq_exec,
+    RAJA::statement::For<0, RAJA::seq_exec,
+      RAJA::statement::Lambda<0>
+    >
+  >
+>;
+#endif
 
 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 {
 
   const int N = 10000;
   const int K = 17;
-  
+
   device_resources def_device_res{device_resources::get_default()};
   host_resources def_host_res{host_resources::get_default()};
-  
+
   auto timer = RAJA::Timer();
-  
+
   //launch to initialize the stream
   RAJA::forall<device_pol>
-    (RAJA::RangeSegment(0,1), [=] RAJA_DEVICE (int i) {
-      printf(" \n");
+    (RAJA::RangeSegment(0,1), [=] RAJA_HOST_DEVICE (int i) {
   });
 
   int * array = def_host_res.allocate<int>(N * N);
   int * array_copy = def_host_res.allocate<int>(N * N);
-  
+
   //big array, or image
   for (int i = 0; i < N * N; ++i) {
     array[i] = 1;
@@ -75,7 +124,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   }
 
   //small array that acts as the blur
-  //int* kernel = new int[K * K];
   int * kernel = def_host_res.allocate<int>(K * K);
   for (int i = 0; i < K * K; ++i) {
     kernel[i] = 2;
@@ -102,7 +150,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   RAJA::kernel<kernel_pol>
     (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
+     [=] RAJA_HOST_DEVICE (int i, int j) {
       int sum = 0;
 
       //looping through the "blur"
@@ -129,10 +177,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   timer.reset();
   timer.start();
-  
+
   RAJA::kernel<kernel_pol>
     (RAJA::make_tuple(range_i, range_j),
-     [=] RAJA_DEVICE (int i, int j) {
+     [=] RAJA_HOST_DEVICE (int i, int j) {
       int sum = 0;
 
       // looping through the "blur"
@@ -161,10 +209,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   def_device_res.deallocate(d_array);
   def_device_res.deallocate(d_array_copy);
   def_device_res.deallocate(d_kernel);
-  
+
   def_host_res.deallocate(array);
   def_host_res.deallocate(array_copy);
   def_host_res.deallocate(kernel);
-  
+
   return 0;
 }
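Note: the lasting change in this series is that backend-specific cudaMalloc/cudaMemcpy calls are replaced by the RAJA resources API, which is what lets one source file serve the HIP, CUDA, SYCL, OpenMP, and sequential variants. A minimal sketch of that allocate/memcpy/deallocate lifecycle against the host resource (device resources expose the same calls; the snippet mirrors the patch but is illustrative only):

    #include <RAJA/RAJA.hpp>
    #include <cassert>

    int main() {
      RAJA::resources::Host host{RAJA::resources::Host::get_default()};

      int* src = host.allocate<int>(8);
      int* dst = host.allocate<int>(8);
      for (int i = 0; i < 8; ++i) { src[i] = i; }

      // Same call shape as the def_device_res.memcpy(...) copies in the patch.
      host.memcpy(dst, src, 8 * sizeof(int));
      assert(dst[7] == 7);

      host.deallocate(src);
      host.deallocate(dst);
      return 0;
    }

Since the RUN_*_VARIANT switches are plain preprocessor symbols, a variant could presumably also be selected from the build line (e.g. -DRUN_OPENMP_VARIANT) once the in-source #define RUN_SEQ_VARIANT default is removed.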