diff --git a/SYCL/ESIMD/dword_atomic_smoke.cpp b/SYCL/ESIMD/dword_atomic_smoke.cpp
new file mode 100644
index 0000000000..4a13ae92fb
--- /dev/null
+++ b/SYCL/ESIMD/dword_atomic_smoke.cpp
@@ -0,0 +1,18 @@
+//==---------------- dword_atomic_smoke.cpp - DPC++ ESIMD on-device test --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This test checks DWORD atomic operations.
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This macro disables usage of LSC atomics in the included test.
+#define UNDEF_USE_LSC_ATOMICS
+
+#include "lsc/atomic_smoke.cpp"
diff --git a/SYCL/ESIMD/esimd_test_utils.hpp b/SYCL/ESIMD/esimd_test_utils.hpp
index 277ec00007..1c64904d6d 100644
--- a/SYCL/ESIMD/esimd_test_utils.hpp
+++ b/SYCL/ESIMD/esimd_test_utils.hpp
@@ -522,4 +522,20 @@ inline void iterate_ops(OpSeq ops, F f) {
   ConstexprForLoop<0, sizeof...(Ops)>::unroll(act);
 }
 
+struct USMDeleter {
+  queue Q;
+  void operator()(void *Ptr) {
+    if (Ptr) {
+      sycl::free(Ptr, Q);
+    }
+  }
+};
+
+template <class T>
+std::unique_ptr<T, USMDeleter> usm_malloc_shared(queue q, int n) {
+  std::unique_ptr<T, USMDeleter> res(sycl::malloc_shared<T>(n, q),
+                                     USMDeleter{q});
+  return std::move(res);
+}
+
 } // namespace esimd_test
diff --git a/SYCL/ESIMD/lsc/atomic_smoke.cpp b/SYCL/ESIMD/lsc/atomic_smoke.cpp
new file mode 100644
index 0000000000..370c2b6ad8
--- /dev/null
+++ b/SYCL/ESIMD/lsc/atomic_smoke.cpp
@@ -0,0 +1,561 @@
+//==---------------- atomic_smoke.cpp - DPC++ ESIMD on-device test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This test checks LSC atomic operations.
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl -DUSE_LSC_ATOMICS %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+
+#ifdef UNDEF_USE_LSC_ATOMICS
+#undef USE_LSC_ATOMICS
+#endif
+
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+using namespace sycl::ext::intel::experimental::esimd;
+
+struct Config {
+  int threads_per_group;
+  int n_groups;
+  int start_ind;
+  int masked_lane;
+  int repeat;
+  int stride;
+};
+
+#ifndef PREFER_FULL_BARRIER
+#define PREFER_FULL_BARRIER 0
+#endif // PREFER_FULL_BARRIER
+
+#if PREFER_FULL_BARRIER && defined(USE_LSC_ATOMICS)
+#define USE_FULL_BARRIER 1
+#else
+#define USE_FULL_BARRIER 0
+#endif
+
+// ----------------- Helper functions
+
+std::ostream &operator<<(std::ostream &out, const Config &cfg) {
+  out << "{ thr_per_group=" << cfg.threads_per_group
+      << " n_groups=" << cfg.n_groups << " start_ind=" << cfg.start_ind
+      << " masked_lane=" << cfg.masked_lane << " repeat=" << cfg.repeat
+      << " stride=" << cfg.stride << " }";
+  return out;
+}
+
+using LSCAtomicOp = sycl::ext::intel::esimd::native::lsc::atomic_op;
+using DWORDAtomicOp = sycl::ext::intel::esimd::atomic_op;
+
+// This macro selects between DWORD ("legacy") and LSC-based atomics.
+#ifdef USE_LSC_ATOMICS
+using AtomicOp = LSCAtomicOp;
+constexpr char MODE[] = "LSC";
+#else
+using AtomicOp = DWORDAtomicOp;
+constexpr char MODE[] = "DWORD";
+#endif // USE_LSC_ATOMICS
+
+#ifdef USE_LSC_ATOMICS
+uint32_t atomic_load(uint32_t *addr) {
+  auto v = atomic_update<LSCAtomicOp::load, uint32_t, 1>(addr, 0, 1);
+  return v[0];
+}
+#endif // USE_LSC_ATOMICS
+
+template <class, int, template <class, int> class> class TestID;
+
+const char *to_string(DWORDAtomicOp op) {
+  switch (op) {
+  case DWORDAtomicOp::add:
+    return "add";
+  case DWORDAtomicOp::sub:
+    return "sub";
+  case DWORDAtomicOp::inc:
+    return "inc";
+  case DWORDAtomicOp::dec:
+    return "dec";
+  case DWORDAtomicOp::umin:
+    return "umin";
+  case DWORDAtomicOp::umax:
+    return "umax";
+  case DWORDAtomicOp::xchg:
+    return "xchg";
+  case DWORDAtomicOp::cmpxchg:
+    return "cmpxchg";
+  case DWORDAtomicOp::bit_and:
+    return "bit_and";
+  case DWORDAtomicOp::bit_or:
+    return "bit_or";
+  case DWORDAtomicOp::bit_xor:
+    return "bit_xor";
+  case DWORDAtomicOp::smin:
+    return "smin";
+  case DWORDAtomicOp::smax:
+    return "smax";
+  case DWORDAtomicOp::fmax:
+    return "fmax";
+  case DWORDAtomicOp::fmin:
+    return "fmin";
+  case DWORDAtomicOp::fcmpxchg:
+    return "fcmpxchg";
+  case DWORDAtomicOp::fadd:
+    return "fadd";
+  case DWORDAtomicOp::fsub:
+    return "fsub";
+  case DWORDAtomicOp::load:
+    return "load";
+  case DWORDAtomicOp::store:
+    return "store";
+  case DWORDAtomicOp::predec:
+    return "predec";
+  }
+  return "";
+}
+
+const char *to_string(LSCAtomicOp op) {
+  switch (op) {
+  case LSCAtomicOp::add:
+    return "lsc::add";
+  case LSCAtomicOp::sub:
+    return "lsc::sub";
+  case LSCAtomicOp::inc:
+    return "lsc::inc";
+  case LSCAtomicOp::dec:
+    return "lsc::dec";
+  case LSCAtomicOp::umin:
+    return "lsc::umin";
+  case LSCAtomicOp::umax:
+    return "lsc::umax";
+  case LSCAtomicOp::cmpxchg:
+    return "lsc::cmpxchg";
+  case LSCAtomicOp::bit_and:
+    return "lsc::bit_and";
+  case LSCAtomicOp::bit_or:
+    return "lsc::bit_or";
+  case LSCAtomicOp::bit_xor:
+    return "lsc::bit_xor";
+  case LSCAtomicOp::smin:
+    return "lsc::smin";
+  case LSCAtomicOp::smax:
+    return "lsc::smax";
+  case LSCAtomicOp::fmax:
+    return "lsc::fmax";
+  case LSCAtomicOp::fmin:
+    return "lsc::fmin";
+  case LSCAtomicOp::fcmpxchg:
+    return "lsc::fcmpxchg";
+  case LSCAtomicOp::fadd:
+    return "lsc::fadd";
+  case LSCAtomicOp::fsub:
+    return "lsc::fsub";
+  case LSCAtomicOp::load:
+    return "lsc::load";
+  case LSCAtomicOp::store:
+    return "lsc::store";
+  }
+  return "lsc::";
+}
+
+template <int N> inline bool any(simd_mask<N> m, simd_mask<N> ignore_mask) {
+  simd_mask<N> m1 = 0;
+  m.merge(m1, ignore_mask);
+  return m.any();
+}
+
+// ----------------- The main test function
+
+template <class T, int N, template <class, int> class ImplF>
+bool test(queue q, const Config &cfg) {
+  constexpr auto op = ImplF<T, N>::atomic_op;
+  using CurAtomicOpT = decltype(op);
+  constexpr int n_args = ImplF<T, N>::n_args;
+
+  std::cout << "Testing "
+            << "mode=" << MODE << " op=" << to_string(op)
+            << " full barrier=" << (USE_FULL_BARRIER ? "yes" : "no")
+            << " T=" << typeid(T).name() << " N=" << N
+            << "\n"
+               "  "
+            << cfg << "...";
+
+  size_t size = cfg.start_ind + (N - 1) * cfg.stride + 1;
+  T *arr = malloc_shared<T>(size, q);
+#if USE_FULL_BARRIER
+  uint32_t *flag_ptr = malloc_shared<uint32_t>(1, q);
+  *flag_ptr = 0;
+#endif // USE_FULL_BARRIER
+  int n_threads = cfg.threads_per_group * cfg.n_groups;
+
+  for (int i = 0; i < size; ++i) {
+    arr[i] = ImplF<T, N>::init(i, cfg);
+  }
+
+  range<1> glob_rng(n_threads);
+  range<1> loc_rng(cfg.threads_per_group);
+  nd_range<1> rng(glob_rng, loc_rng);
+
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<TestID<T, N, ImplF>>(
+          rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
+            int i = ii;
+            simd<uint32_t, N> offsets(cfg.start_ind * sizeof(T),
+                                      cfg.stride * sizeof(T));
+            simd_mask<N> m = 1;
+            m[cfg.masked_lane] = 0;
+            // barrier to achieve better contention:
+#if USE_FULL_BARRIER
+            // Full global barrier, works only with LSC atomics
+            // (+ ND range should fit into the available h/w threads).
+            atomic_update<LSCAtomicOp::inc, uint32_t, 1>(flag_ptr, 0, 1);
+            for (uint32_t x = atomic_load(flag_ptr); x < n_threads;
+                 x = atomic_load(flag_ptr))
+              ;
+#else
+            // Intra-work group barrier.
+            barrier();
+#endif // USE_FULL_BARRIER
+
+            // the atomic operation itself applied in a loop:
+            for (int cnt = 0; cnt < cfg.repeat; ++cnt) {
+              if constexpr (n_args == 0) {
+                atomic_update<op>(arr, offsets, m);
+              } else if constexpr (n_args == 1) {
+                simd<T, N> v0 = ImplF<T, N>::arg0(i);
+                atomic_update<op>(arr, offsets, v0, m);
+              } else if constexpr (n_args == 2) {
+                simd<T, N> new_val = ImplF<T, N>::arg0(i); // new value
+                simd<T, N> exp_val = ImplF<T, N>::arg1(i); // expected value
+                // do compare-and-swap in a loop until we get expected value;
+                // arg0 and arg1 must provide values which guarantee the loop
+                // is not endless:
+                for (auto old_val =
+                         atomic_update<op>(arr, offsets, new_val, exp_val, m);
+                     any(old_val != exp_val, !m);
+                     old_val =
+                         atomic_update<op>(arr, offsets, new_val, exp_val, m))
+                  ;
+              }
+            }
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    free(arr, q);
+#if USE_FULL_BARRIER
+    free(flag_ptr, q);
+#endif // USE_FULL_BARRIER
+    return false;
+  }
+  int err_cnt = 0;
+
+  for (int i = 0; i < size; ++i) {
+    T gold = ImplF<T, N>::gold(i, cfg);
+    T test = arr[i];
+
+    if ((gold != test) && (++err_cnt < 10)) {
+      if (err_cnt == 1) {
+        std::cout << "\n";
+      }
+      std::cout << "  failed at index " << i << ": " << test << " != " << gold
+                << "(gold)\n";
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  FAILED\n  pass rate: "
+              << ((float)(size - err_cnt) / (float)size) * 100.0f << "% ("
+              << (size - err_cnt) << "/" << size << ")\n";
+  } else {
+    std::cout << " passed\n";
+  }
+  free(arr, q);
+#if USE_FULL_BARRIER
+  free(flag_ptr, q);
+#endif // USE_FULL_BARRIER
+  return err_cnt == 0;
+}
+
+// ----------------- Functions providing input and golden values for atomic
+// ----------------- operations.
+
+static int dense_ind(int ind, int VL, const Config &cfg) {
+  return (ind - cfg.start_ind) / cfg.stride;
+}
+
+static bool is_updated(int ind, int VL, const Config &cfg) {
+  if ((ind < cfg.start_ind) || (((ind - cfg.start_ind) % cfg.stride) != 0)) {
+    return false;
+  }
+  int ii = dense_ind(ind, VL, cfg);
+  bool res = (ii % VL) != cfg.masked_lane;
+  return res;
+}
+
+// ----------------- Actual "traits" for each operation.
+
+template <class T, int N> struct ImplInc {
+  static constexpr AtomicOp atomic_op = AtomicOp::inc;
+  static constexpr int n_args = 0;
+
+  static T init(int i, const Config &cfg) { return (T)0; }
+
+  static T gold(int i, const Config &cfg) {
+    T gold = is_updated(i, N, cfg)
+                 ? (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups)
+                 : init(i, cfg);
+    return gold;
+  }
+};
+
+template <class T, int N> struct ImplDec {
+  static constexpr AtomicOp atomic_op = AtomicOp::dec;
+  static constexpr int n_args = 0;
+  static constexpr int base = 5;
+
+  static T init(int i, const Config &cfg) {
+    return (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups + base);
+  }
+
+  static T gold(int i, const Config &cfg) {
+    T gold = is_updated(i, N, cfg) ? (T)base : init(i, cfg);
+    return gold;
+  }
+};
+
+template <class T, int N, class C, C Op> struct ImplAdd {
+  static constexpr C atomic_op = Op;
+  static constexpr int n_args = 1;
+
+  static T init(int i, const Config &cfg) { return (T)0; }
+
+  static T gold(int i, const Config &cfg) {
+    T gold = is_updated(i, N, cfg)
+                 ? (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups)
+                 : init(i, cfg);
+    return gold;
+  }
+
+  static T arg0(int i) { return 1; }
+};
+
+template <class T, int N, class C, C Op> struct ImplSub {
+  static constexpr C atomic_op = Op;
+  static constexpr int n_args = 1;
+  static constexpr int base = 5;
+
+  static T init(int i, const Config &cfg) {
+    return (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups + base);
+  }
+
+  static T gold(int i, const Config &cfg) {
+    T gold = is_updated(i, N, cfg) ? (T)base : init(i, cfg);
+    return gold;
+  }
+
+  static T arg0(int i) { return 1; }
+};
+
+template <class T, int N, class C, C Op> struct ImplMin {
+  static constexpr C atomic_op = Op;
+  static constexpr int n_args = 1;
+  static constexpr int MIN = 1;
+
+  static T init(int i, const Config &cfg) {
+    return (T)(cfg.threads_per_group * cfg.n_groups + MIN + 1);
+  }
+
+  static T gold(int i, const Config &cfg) {
+    T gold = is_updated(i, N, cfg) ? (T)MIN : init(i, cfg);
+    return gold;
+  }
+
+  static T arg0(int i) { return i + MIN; }
+};
+
+template <class T, int N, class C, C Op> struct ImplMax {
+  static constexpr C atomic_op = Op;
+  static constexpr int n_args = 1;
+  static constexpr int base = 5;
+
+  static T init(int i, const Config &cfg) { return 0; }
+
+  static T gold(int i, const Config &cfg) {
+    T gold = is_updated(i, N, cfg)
+                 ? (T)(cfg.threads_per_group * cfg.n_groups - 1)
+                 : init(i, cfg);
+    return gold;
+  }
+
+  static T arg0(int i) { return i; }
+};
+
+template <class T, int N>
+struct ImplIntAdd : ImplAdd<T, N, AtomicOp, AtomicOp::add> {};
+template <class T, int N>
+struct ImplIntSub : ImplSub<T, N, AtomicOp, AtomicOp::sub> {};
+template <class T, int N>
+struct ImplSMin : ImplMin<T, N, AtomicOp, AtomicOp::smin> {};
+template <class T, int N>
+struct ImplUMin : ImplMin<T, N, AtomicOp, AtomicOp::umin> {};
+template <class T, int N>
+struct ImplSMax : ImplMax<T, N, AtomicOp, AtomicOp::smax> {};
+template <class T, int N>
+struct ImplUMax : ImplMax<T, N, AtomicOp, AtomicOp::umax> {};
+
+#ifdef USE_LSC_ATOMICS
+// These will be redirected by API implementation to LSC ones:
+template <class T, int N>
+struct ImplFadd : ImplAdd<T, N, DWORDAtomicOp, DWORDAtomicOp::fadd> {};
+template <class T, int N>
+struct ImplFsub : ImplSub<T, N, DWORDAtomicOp, DWORDAtomicOp::fsub> {};
+template <class T, int N>
+struct ImplFmin : ImplMin<T, N, DWORDAtomicOp, DWORDAtomicOp::fmin> {};
+template <class T, int N>
+struct ImplFmax : ImplMax<T, N, DWORDAtomicOp, DWORDAtomicOp::fmax> {};
+// LSC versions:
+template <class T, int N>
+struct ImplLSCFadd : ImplAdd<T, N, LSCAtomicOp, LSCAtomicOp::fadd> {};
+template <class T, int N>
+struct ImplLSCFsub : ImplSub<T, N, LSCAtomicOp, LSCAtomicOp::fsub> {};
+template <class T, int N>
+struct ImplLSCFmin : ImplMin<T, N, LSCAtomicOp, LSCAtomicOp::fmin> {};
+template <class T, int N>
+struct ImplLSCFmax : ImplMax<T, N, LSCAtomicOp, LSCAtomicOp::fmax> {};
+#endif // USE_LSC_ATOMICS
+
+template <class T, int N, class C, C Op> struct ImplCmpxchgBase {
+  static constexpr C atomic_op = Op;
+  static constexpr int n_args = 2;
+  static constexpr int base = 2;
+
+  static T init(int i, const Config &cfg) { return base - 1; }
+
+  static T gold(int i, const Config &cfg) {
+    T gold = is_updated(i, N, cfg)
+                 ? (T)(cfg.threads_per_group * cfg.n_groups - 1 + base)
+                 : init(i, cfg);
+    return gold;
+  }
+
+  // "Replacement value" argument in CAS
+  static inline T arg0(int i) { return i + base; }
+
+  // "Expected value" argument in CAS
+  static inline T arg1(int i) { return i + base - 1; }
+};
+
+template <class T, int N>
+struct ImplCmpxchg
+    : ImplCmpxchgBase<T, N, AtomicOp, AtomicOp::cmpxchg> {};
+
+#ifdef USE_LSC_ATOMICS
+// This will be redirected by API implementation to LSC one:
+template <class T, int N>
+struct ImplFcmpwr
+    : ImplCmpxchgBase<T, N, DWORDAtomicOp, DWORDAtomicOp::fcmpxchg> {};
+// LSC versions:
+template <class T, int N>
+struct ImplLSCFcmpwr
+    : ImplCmpxchgBase<T, N, LSCAtomicOp, LSCAtomicOp::fcmpxchg> {};
+#endif // USE_LSC_ATOMICS
+
+// ----------------- Main function and test combinations.
+
+template <int N, template <class, int> class Op>
+bool test_int_types(queue q, const Config &cfg) {
+  bool passed = true;
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  return passed;
+}
+
+int main(void) {
+  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+
+  auto dev = q.get_device();
+  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+  Config cfg{
+      11,  // int threads_per_group;
+      11,  // int n_groups;
+      5,   // int start_ind;
+      5,   // int masked_lane;
+      100, // int repeat;
+      111  // int stride;
+  };
+
+  bool passed = true;
+  // Template params:
+  // - element type, simd size, threads per group, num groups, atomic op,
+  //   verification function, argument generation functions...
+  // Actual params:
+  // - queue, start index in data, masked lane, repeat count
+  passed &= test_int_types<8, ImplInc>(q, cfg);
+  passed &= test_int_types<8, ImplDec>(q, cfg);
+
+  // TODO: support sizes other than 8 when compiler is fixed
+  // passed &= test(q, cfg);
+  // TODO: support 16-bit types when compiler is fixed
+  // passed &= test(q, cfg);
+
+  passed &= test_int_types<8, ImplIntAdd>(q, cfg);
+  passed &= test_int_types<8, ImplIntSub>(q, cfg);
+
+  // TODO: this crashes vc-intrinsics
+  // passed &= test(q, cfg);
+  // passed &= test(q, cfg);
+
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+
+  // TODO: add other operations
+
+#ifdef USE_LSC_ATOMICS
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+
+  // TODO: support sycl::half when compiler is fixed
+  // passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  // passed &= test(q, cfg);
+
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+#endif // USE_LSC_ATOMICS
+
+  // Can't easily reset input to initial state, so just 1 iteration for CAS.
+  cfg.repeat = 1;
+  passed &= test_int_types<8, ImplCmpxchg>(q, cfg);
+#ifdef USE_LSC_ATOMICS
+  passed &= test(q, cfg);
+  passed &= test(q, cfg);
+#endif // USE_LSC_ATOMICS
+  // TODO: check double other vector lengths in LSC mode.
+
+  std::cout << (passed ? "Passed\n" : "FAILED\n");
+  return passed ? 0 : 1;
+}