diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index dad09ed1774e6..38fb0d1f525e7 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -114,7 +114,7 @@ namespace cms::alpakatools {
           return *this;
 
         // increment the thread index with the grid stride
-        first_ += stride_ * elements_;
+        first_ += stride_;
         index_ = first_;
         last_ = std::min(first_ + elements_, extent_);
         if (index_ < extent_)
@@ -204,7 +204,7 @@ namespace cms::alpakatools {
           return *this;
 
         // increment the thread index along with the last dimension with the grid stride
-        first_[last_dimension] += stride_[last_dimension] * elements_[last_dimension];
+        first_[last_dimension] += stride_[last_dimension];
         index_[last_dimension] = first_[last_dimension];
         last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
         if (index_[last_dimension] < extent_[last_dimension])
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
index c8d214ff53436..a9bb5a65b3987 100644
--- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -4,3 +4,10 @@
 
 
 
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
new file mode 100644
index 0000000000000..4cbf4f1dc08d8
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -0,0 +1,129 @@
+#include <iostream>
+#include <random>
+
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+
+// each test binary is built for a single Alpaka backend
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel) "]";
+
+// writes out[i] = in1[i] + in2[i], iterating with elements_with_stride over a scalar size
+struct VectorAddKernel {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    for (auto index : cms::alpakatools::elements_with_stride(acc, size)) {
+      out[index] = in1[index] + in2[index];
+    }
+  }
+};
+
+// writes out[i] = in1[i] + in2[i], iterating with elements_with_stride_nd over a Vec1D size
+struct VectorAddKernel1D {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec1D size) const {
+    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
+      auto index = ndindex[0];
+      out[index] = in1[index] + in2[index];
+    }
+  }
+};
+
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
+  SECTION("VectorAddKernel") {
+    // get the list of devices on the current platform
+    auto const& devices = cms::alpakatools::devices();
+    if (devices.empty()) {
+      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+                << ", the test will be skipped.\n";
+      return;
+    }
+
+    // random number generator with a gaussian distribution
+    std::random_device rd{};
+    std::default_random_engine rand{rd()};
+    std::normal_distribution dist{0., 1.};
+
+    // tolerance
+    constexpr float epsilon = 0.000001;
+
+    // buffer size
+    constexpr size_t size = 1024 * 1024;
+
+    // allocate input and output host buffers
+    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
+    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
+    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+
+    // fill the input buffers with random data, and the output buffer with zeros
+    for (size_t i = 0; i < size; ++i) {
+      in1_h[i] = dist(rand);
+      in2_h[i] = dist(rand);
+      out_h[i] = 0.;
+    }
+
+    // run the test on each device
+    for (auto const& device : devices) {
+      std::cout << "Test 1D vector addition on " << alpaka::getName(device) << '\n';
+      auto queue = Queue(device);
+
+      // allocate input and output buffers on the device
+      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+
+      // copy the input data to the device; the size is known from the buffer objects
+      alpaka::memcpy(queue, in1_d, in1_h);
+      alpaka::memcpy(queue, in2_d, in2_h);
+
+      // fill the output buffer with zero bytes; the size is known from the buffer objects
+      alpaka::memset(queue, out_d, 0x00);
+
+      // launch the 1-dimensional kernel with scalar size
+      auto div = cms::alpakatools::make_workdiv<Acc1D>(4, 4);
+      alpaka::exec<Acc1D>(queue, div, VectorAddKernel{}, in1_d.data(), in2_d.data(), out_d.data(), size);
+
+      // copy the results from the device to the host
+      alpaka::memcpy(queue, out_h, out_d);
+
+      // wait for all the operations to complete
+      alpaka::wait(queue);
+
+      // check the results
+      for (size_t i = 0; i < size; ++i) {
+        float sum = in1_h[i] + in2_h[i];
+        REQUIRE(out_h[i] < sum + epsilon);
+        REQUIRE(out_h[i] > sum - epsilon);
+      }
+
+      // reset the output buffer on the device to all zero bytes
+      alpaka::memset(queue, out_d, 0x00);
+
+      // launch the 1-dimensional kernel with vector size
+      alpaka::exec<Acc1D>(queue, div, VectorAddKernel1D{}, in1_d.data(), in2_d.data(), out_d.data(), size);
+
+      // copy the results from the device to the host
+      alpaka::memcpy(queue, out_h, out_d);
+
+      // wait for all the operations to complete
+      alpaka::wait(queue);
+
+      // check the results
+      for (size_t i = 0; i < size; ++i) {
+        float sum = in1_h[i] + in2_h[i];
+        REQUIRE(out_h[i] < sum + epsilon);
+        REQUIRE(out_h[i] > sum - epsilon);
+      }
+    }
+  }
+}