cms-sw · cmsbuild · Jan 23, 2023 · Jan 22, 2023 · Jan 20, 2023
diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -114,7 +114,7 @@ namespace cms::alpakatools {
           return *this;
 
         // increment the thread index with the grid stride
-        first_ += stride_ * elements_;
+        first_ += stride_;
         index_ = first_;
         last_ = std::min(first_ + elements_, extent_);
         if (index_ < extent_)
@@ -204,7 +204,7 @@ namespace cms::alpakatools {
           return *this;
 
         // increment the thread index along with the last dimension with the grid stride
-        first_[last_dimension] += stride_[last_dimension] * elements_[last_dimension];
+        first_[last_dimension] += stride_[last_dimension];
         index_[last_dimension] = first_[last_dimension];
         last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
         if (index_[last_dimension] < extent_[last_dimension])

diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -4,3 +4,10 @@
   <use name="HeterogeneousCore/AlpakaInterface"/>
   <flags ALPAKA_BACKENDS="1"/>
 </bin>
+
+<bin name="alpakaTestKernel" file="alpaka/testKernel.dev.cc">
+  <use name="alpaka"/>
+  <use name="catch2"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+</bin>
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -0,0 +1,127 @@
+#include <cstdio>
+#include <random>
+
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+
+// each test binary is built for a single Alpaka backend
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel) "]";
+
+// Element-wise sum out = in1 + in2, iterating with elements_with_stride over a scalar extent.
+struct VectorAddKernel {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    for (auto i : cms::alpakatools::elements_with_stride(acc, size))
+      out[i] = in1[i] + in2[i];
+  }
+};
+
+// Element-wise sum out = in1 + in2, iterating with elements_with_stride_nd over a Vec1D extent.
+struct VectorAddKernel1D {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec1D size) const {
+    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
+      out[ndindex[0]] = in1[ndindex[0]] + in2[ndindex[0]];
+    }
+  }
+};
+
+// For every device of the current platform, run both vector-addition kernels on random
+// inputs and compare the results copied back from the device with sums computed on the
+// host. The test is skipped when the platform has no devices.
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
+  SECTION("VectorAddKernel") {
+    // get the list of devices on the current platform; without any device there is nothing to run
+    auto const& devices = cms::alpakatools::devices<Platform>();
+    if (devices.empty()) {
+      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+                << ", the test will be skipped.\n";
+      return;
+    }
+
+    // random number generator with a gaussian distribution, seeded from std::random_device
+    std::random_device rd{};
+    std::default_random_engine rand{rd()};
+    std::normal_distribution<float> dist{0., 1.};
+
+    // absolute tolerance on each sum
+    constexpr float epsilon = 0.000001;
+
+    // number of elements in each buffer
+    constexpr size_t size = 1024 * 1024;
+
+    // allocate input and output host buffers
+    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
+    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
+    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+
+    // fill the input buffers with random data, and the output buffer with zeros
+    for (size_t i = 0; i < size; ++i) {
+      in1_h[i] = dist(rand);
+      in2_h[i] = dist(rand);
+      out_h[i] = 0.;
+    }
+
+    // every element of out_h must match the sum computed on the host within +/- epsilon
+    auto check_results = [&]() {
+      for (size_t i = 0; i < size; ++i) {
+        float sum = in1_h[i] + in2_h[i];
+        REQUIRE(out_h[i] < sum + epsilon);
+        REQUIRE(out_h[i] > sum - epsilon);
+      }
+    };
+
+    // run the test on each device
+    for (auto const& device : devices) {
+      std::cout << "Test 1D vector addition on " << alpaka::getName(device) << '\n';
+      auto queue = Queue(device);
+
+      // allocate input and output buffers on the device
+      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+
+      // copy the input data to the device; the size is known from the buffer objects
+      alpaka::memcpy(queue, in1_d, in1_h);
+      alpaka::memcpy(queue, in2_d, in2_h);
+
+      // alpaka::memset takes a byte value: set every byte of the output buffer to zero
+      alpaka::memset(queue, out_d, 0x00);
+
+      // launch the 1-dimensional kernel with scalar size
+      auto div = cms::alpakatools::make_workdiv<Acc1D>(4, 4);
+      alpaka::exec<Acc1D>(queue, div, VectorAddKernel{}, in1_d.data(), in2_d.data(), out_d.data(), size);
+
+      // copy the results from the device to the host
+      alpaka::memcpy(queue, out_h, out_d);
+
+      // wait for all the operations to complete, then check the results
+      alpaka::wait(queue);
+      check_results();
+
+      // reset the output buffer on the device to all zeros
+      alpaka::memset(queue, out_d, 0x00);
+
+      // launch the 1-dimensional kernel with vector size
+      alpaka::exec<Acc1D>(queue, div, VectorAddKernel1D{}, in1_d.data(), in2_d.data(), out_d.data(), size);
+
+      // copy the results from the device to the host
+      alpaka::memcpy(queue, out_h, out_d);
+
+      // wait for all the operations to complete, then check the results
+      alpaka::wait(queue);
+      check_results();
+    }
+  }
+}