diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index ddd34eb9635f0..5fc5e00ce3b76 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -378,6 +378,51 @@ __ESIMD_API __ESIMD_NS::simd lsc_slm_block_load(uint32_t offset) { N>(pred.data(), offsets.data()); } +/// USM pointer gather. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_load.ugm +/// +/// Collects elements located at specified address and returns them +/// as a single \ref simd object. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to load per address. +/// @tparam DS is the data size. +/// @tparam L1H is L1 cache hint. +/// @tparam L3H is L3 cache hint. +/// @tparam N is the number of channels (platform dependent). +/// @param p is the base pointer. +/// @param offsets is the zero-based offsets in bytes. +/// @param pred is predicates. +/// @return is a vector of type T and size N * NElts +/// +template +__ESIMD_API __ESIMD_NS::simd +lsc_gather(const T *p, __ESIMD_NS::simd offsets, + __ESIMD_NS::simd_mask pred = 1) { + detail::check_lsc_vector_size(); + detail::check_lsc_data_size(); + detail::check_lsc_cache_hint(); + constexpr uint16_t _AddressScale = 1; + constexpr int _ImmOffset = 0; + constexpr lsc_data_size _DS = + detail::expand_data_size(detail::finalize_data_size()); + constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); + constexpr detail::lsc_data_order _Transposed = + detail::lsc_data_order::nontranspose; + using _MsgT = typename detail::lsc_expand_type::type; + __ESIMD_NS::simd addrs = reinterpret_cast(p); + addrs += convert(offsets); + __ESIMD_NS::simd<_MsgT, N *NElts> Tmp = + __esimd_lsc_load_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, + _DS, _VS, _Transposed, N>(pred.data(), + addrs.data()); + return detail::lsc_format_ret(Tmp); +} + /// Accessor-based gather. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_load.ugm @@ -427,6 +472,47 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, #endif } +/// USM pointer transposed gather with 1 channel. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_load.ugm +/// +/// Collects elements located at specified address and returns them +/// as a single \ref simd object. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to load per address. +/// @tparam DS is the data size. +/// @tparam L1H is L1 cache hint. +/// @tparam L3H is L3 cache hint. +/// @param p is the base pointer. +/// @param pred is operation predicate. Zero means operation is skipped +/// entirely, non-zero - operation is performed. The default is '1' - perform +/// the operation. 
+/// @return is a vector of type T and size NElts +/// +template +__ESIMD_API __ESIMD_NS::simd +lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) { + detail::check_lsc_vector_size(); + detail::check_lsc_data_size(); + detail::check_lsc_cache_hint(); + constexpr uint16_t _AddressScale = 1; + constexpr int _ImmOffset = 0; + constexpr lsc_data_size _DS = detail::finalize_data_size(); + static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64, + "Transposed load is supported only for data size u32 or u64"); + constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); + constexpr detail::lsc_data_order _Transposed = + detail::lsc_data_order::transpose; + constexpr int N = 1; + __ESIMD_NS::simd addrs = reinterpret_cast(p); + return __esimd_lsc_load_stateless(pred.data(), + addrs.data()); +} + /// Accessor-based transposed gather with 1 channel. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_load.ugm @@ -442,6 +528,9 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. +/// @param pred is operation predicate. Zero means operation is skipped +/// entirely, non-zero - operation is performed. The default is '1' - perform +/// the operation. /// @return is a vector of type T and size NElts /// template __ESIMD_API std::enable_if_t::value, __ESIMD_NS::simd> -lsc_block_load(AccessorTy acc, uint32_t offset) { +lsc_block_load(AccessorTy acc, uint32_t offset, + __ESIMD_NS::simd_mask<1> pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM return lsc_block_load( - __ESIMD_DNS::accessorToPointer(acc, offset)); + __ESIMD_DNS::accessorToPointer(acc, offset), pred); #else detail::check_lsc_vector_size(); detail::check_lsc_data_size(); @@ -467,7 +557,6 @@ lsc_block_load(AccessorTy acc, uint32_t offset) { constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::transpose; constexpr int N = 1; - __ESIMD_NS::simd_mask pred = 1; __ESIMD_NS::simd offsets = offset; auto si = __ESIMD_NS::get_surface_index(acc); return __esimd_lsc_load_bti -__ESIMD_API __ESIMD_NS::simd -lsc_gather(const T *p, __ESIMD_NS::simd offsets, - __ESIMD_NS::simd_mask pred = 1) { +__ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd offsets, + __ESIMD_NS::simd_mask pred = 1) { detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = @@ -513,19 +599,16 @@ lsc_gather(const T *p, __ESIMD_NS::simd offsets, using _MsgT = typename detail::lsc_expand_type::type; __ESIMD_NS::simd addrs = reinterpret_cast(p); addrs += convert(offsets); - __ESIMD_NS::simd<_MsgT, N *NElts> Tmp = - __esimd_lsc_load_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, + __esimd_lsc_prefetch_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(pred.data(), addrs.data()); - return detail::lsc_format_ret(Tmp); } -/// USM pointer transposed gather with 1 channel. +/// USM pointer prefetch transposed gather with 1 channel. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_load.ugm /// -/// Collects elements located at specified address and returns them -/// as a single \ref simd object. +/// Prefetches elements located at specified address. /// /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. 
@@ -533,29 +616,29 @@ lsc_gather(const T *p, __ESIMD_NS::simd offsets, /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. /// @param p is the base pointer. -/// @return is a vector of type T and size NElts /// template -__ESIMD_API __ESIMD_NS::simd lsc_block_load(const T *p) { +__ESIMD_API void lsc_prefetch(const T *p) { detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = detail::finalize_data_size(); - static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64, - "Transposed load is supported only for data size u32 or u64"); + static_assert( + _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64, + "Transposed prefetch is supported only for data size u32 or u64"); constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::transpose; constexpr int N = 1; __ESIMD_NS::simd_mask pred = 1; __ESIMD_NS::simd addrs = reinterpret_cast(p); - return __esimd_lsc_load_stateless(pred.data(), - addrs.data()); + __esimd_lsc_prefetch_stateless(pred.data(), + addrs.data()); } /// Accessor-based prefetch gather. @@ -649,83 +732,6 @@ lsc_prefetch(AccessorTy acc, uint32_t offset) { #endif } -/// USM pointer prefetch gather. -/// Supported platforms: DG2, PVC -/// VISA instruction: lsc_load.ugm -/// -/// Prefetches elements located at specified address. -/// -/// @tparam T is element type. -/// @tparam NElts is the number of elements to load per address. -/// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam N is the number of channels (platform dependent). -/// @param p is the base pointer. -/// @param offsets is the zero-based offsets in bytes. -/// @param pred is predicates. -/// -template -__ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd offsets, - __ESIMD_NS::simd_mask pred = 1) { - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using _MsgT = typename detail::lsc_expand_type::type; - __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); - __esimd_lsc_prefetch_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, - _DS, _VS, _Transposed, N>(pred.data(), - addrs.data()); -} - -/// USM pointer prefetch transposed gather with 1 channel. -/// Supported platforms: DG2, PVC -/// VISA instruction: lsc_load.ugm -/// -/// Prefetches elements located at specified address. -/// -/// @tparam T is element type. -/// @tparam NElts is the number of elements to load per address. -/// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @param p is the base pointer. 
-/// -template -__ESIMD_API void lsc_prefetch(const T *p) { - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = detail::finalize_data_size(); - static_assert( - _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64, - "Transposed prefetch is supported only for data size u32 or u64"); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::transpose; - constexpr int N = 1; - __ESIMD_NS::simd_mask pred = 1; - __ESIMD_NS::simd addrs = reinterpret_cast(p); - __esimd_lsc_prefetch_stateless(pred.data(), - addrs.data()); -} - /// SLM scatter. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_store.slm @@ -796,11 +802,11 @@ __ESIMD_API void lsc_slm_block_store(uint32_t offset, pred.data(), offsets.data(), vals.data()); } -/// Accessor-based scatter. +/// USM pointer scatter. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_store.ugm /// -/// Scatters elements to surface. +/// Scatters elements to specific address. /// /// @tparam T is element type. /// @tparam NElts is the number of elements to store per address. @@ -808,8 +814,7 @@ __ESIMD_API void lsc_slm_block_store(uint32_t offset, /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. /// @tparam N is the number of channels (platform dependent). -/// @tparam AccessorTy is the \ref sycl::accessor type. -/// @param acc is the SYCL accessor. +/// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. /// @param vals is values to store. /// @param pred is predicates. @@ -817,15 +822,10 @@ __ESIMD_API void lsc_slm_block_store(uint32_t offset, template -__ESIMD_API std::enable_if_t::value> -lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, - __ESIMD_NS::simd vals, - __ESIMD_NS::simd_mask pred = 1) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - lsc_scatter(__ESIMD_DNS::accessorToPointer(acc), - offsets, pred); -#else + int N> +__ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd offsets, + __ESIMD_NS::simd vals, + __ESIMD_NS::simd_mask pred = 1) { detail::check_lsc_vector_size(); detail::check_lsc_data_size(); detail::check_lsc_cache_hint(); @@ -839,14 +839,14 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, using _MsgT = typename detail::lsc_expand_type::type; using _CstT = typename detail::lsc_bitcast_type::type; __ESIMD_NS::simd<_MsgT, N *NElts> Tmp = vals.template bit_cast_view<_CstT>(); - auto si = __ESIMD_NS::get_surface_index(acc); - __esimd_lsc_store_bti<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS, - _Transposed, N>(pred.data(), offsets.data(), Tmp.data(), - si); -#endif + __ESIMD_NS::simd addrs = reinterpret_cast(p); + addrs += convert(offsets); + __esimd_lsc_store_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, + _VS, _Transposed, N>(pred.data(), addrs.data(), + Tmp.data()); } -/// Accessor-based transposed scatter with 1 channel. +/// Accessor-based scatter. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_store.ugm /// @@ -857,44 +857,46 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. +/// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. -/// @param offset is the zero-based offset in bytes. 
+/// @param offsets is the zero-based offsets in bytes. /// @param vals is values to store. +/// @param pred is predicates. /// template + int N, typename AccessorTy> __ESIMD_API std::enable_if_t::value> -lsc_block_store(AccessorTy acc, uint32_t offset, - __ESIMD_NS::simd vals) { +lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, + __ESIMD_NS::simd vals, + __ESIMD_NS::simd_mask pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM - lsc_block_store( - __ESIMD_DNS::accessorToPointer(acc, offset), vals); + lsc_scatter(__ESIMD_DNS::accessorToPointer(acc), + offsets, pred); #else detail::check_lsc_vector_size(); detail::check_lsc_data_size(); detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = detail::finalize_data_size(); - static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64, - "Transposed store is supported only for data size u32 or u64"); + constexpr lsc_data_size _DS = + detail::expand_data_size(detail::finalize_data_size()); constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::transpose; - constexpr int N = 1; - __ESIMD_NS::simd_mask pred = 1; - __ESIMD_NS::simd offsets = offset; + detail::lsc_data_order::nontranspose; + using _MsgT = typename detail::lsc_expand_type::type; + using _CstT = typename detail::lsc_bitcast_type::type; + __ESIMD_NS::simd<_MsgT, N *NElts> Tmp = vals.template bit_cast_view<_CstT>(); auto si = __ESIMD_NS::get_surface_index(acc); - __esimd_lsc_store_bti(pred.data(), offsets.data(), - vals.data(), si); + __esimd_lsc_store_bti<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS, + _Transposed, N>(pred.data(), offsets.data(), Tmp.data(), + si); #endif } -/// USM pointer scatter. +/// USM pointer transposed scatter with 1 channel. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_store.ugm /// @@ -905,57 +907,66 @@ lsc_block_store(AccessorTy acc, uint32_t offset, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. -/// @param offsets is the zero-based offsets in bytes. /// @param vals is values to store. -/// @param pred is predicates. +/// @param pred is operation predicate. Zero means operation is skipped +/// entirely, non-zero - operation is performed. The default is '1' - perform +/// the operation. 
/// template -__ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd offsets, - __ESIMD_NS::simd vals, - __ESIMD_NS::simd_mask pred = 1) { + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> +__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd vals, + __ESIMD_NS::simd_mask<1> pred = 1) { detail::check_lsc_vector_size(); detail::check_lsc_data_size(); detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); + constexpr lsc_data_size _DS = detail::finalize_data_size(); + static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64, + "Transposed store is supported only for data size u32 or u64"); constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using _MsgT = typename detail::lsc_expand_type::type; - using _CstT = typename detail::lsc_bitcast_type::type; - __ESIMD_NS::simd<_MsgT, N *NElts> Tmp = vals.template bit_cast_view<_CstT>(); + detail::lsc_data_order::transpose; + constexpr int N = 1; __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); - __esimd_lsc_store_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, - _VS, _Transposed, N>(pred.data(), addrs.data(), - Tmp.data()); + __esimd_lsc_store_stateless(pred.data(), addrs.data(), + vals.data()); } -/// USM pointer transposed scatter with 1 channel. +/// Accessor-based transposed scatter with 1 channel. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_store.ugm /// -/// Scatters elements to specific address. +/// Scatters elements to surface. /// /// @tparam T is element type. /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @param p is the base pointer. +/// @tparam AccessorTy is the \ref sycl::accessor type. +/// @param acc is the SYCL accessor. +/// @param offset is the zero-based offset in bytes. /// @param vals is values to store. +/// @param pred is operation predicate. Zero means operation is skipped +/// entirely, non-zero - operation is performed. The default is '1' - perform +/// the operation. /// template -__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd vals) { + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + typename AccessorTy> +__ESIMD_API std::enable_if_t::value> +lsc_block_store(AccessorTy acc, uint32_t offset, + __ESIMD_NS::simd vals, + __ESIMD_NS::simd_mask<1> pred = 1) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + lsc_block_store( + __ESIMD_DNS::accessorToPointer(acc, offset), vals, pred); +#else detail::check_lsc_vector_size(); detail::check_lsc_data_size(); detail::check_lsc_cache_hint(); @@ -968,11 +979,12 @@ __ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd vals) { constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::transpose; constexpr int N = 1; - __ESIMD_NS::simd_mask pred = 1; - __ESIMD_NS::simd addrs = reinterpret_cast(p); - __esimd_lsc_store_stateless(pred.data(), addrs.data(), - vals.data()); + __ESIMD_NS::simd offsets = offset; + auto si = __ESIMD_NS::get_surface_index(acc); + __esimd_lsc_store_bti(pred.data(), offsets.data(), + vals.data(), si); +#endif } namespace detail { @@ -1288,7 +1300,7 @@ lsc_slm_atomic_update(__ESIMD_NS::simd offsets, return detail::lsc_format_ret(Tmp); } -/// Accessor-based atomic. 
+/// USM pointer atomic. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_atomic_.ugm /// @@ -1298,23 +1310,16 @@ lsc_slm_atomic_update(__ESIMD_NS::simd offsets, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @tparam AccessorTy is the \ref sycl::accessor type. -/// @param acc is the SYCL accessor. +/// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param pred is predicates. /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, - typename AccessorTy> -__ESIMD_API std::enable_if_t::value, - __ESIMD_NS::simd> -lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> +__ESIMD_API __ESIMD_NS::simd +lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_atomic_update( - __ESIMD_DNS::accessorToPointer(acc), offsets, pred); -#else detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); constexpr __ESIMD_NS::native::lsc::atomic_op _Op = @@ -1329,16 +1334,16 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using _MsgT = typename detail::lsc_expand_type::type; - auto si = __ESIMD_NS::get_surface_index(acc); + __ESIMD_NS::simd addrs = reinterpret_cast(p); + addrs += convert(offsets); __ESIMD_NS::simd<_MsgT, N> Tmp = - __esimd_lsc_xatomic_bti_0<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset, - _DS, _VS, _Transposed, N>(pred.data(), - offsets.data(), si); + __esimd_lsc_xatomic_stateless_0<_MsgT, _Op, L1H, L3H, _AddressScale, + _ImmOffset, _DS, _VS, _Transposed, N>( + pred.data(), addrs.data()); return detail::lsc_format_ret(Tmp); -#endif } -/// Accessor-based atomic. +/// USM pointer atomic. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_atomic_.ugm /// @@ -1348,24 +1353,17 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @tparam AccessorTy is the \ref sycl::accessor type. -/// @param acc is the SYCL accessor. +/// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. /// @param pred is predicates. 
/// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, - typename AccessorTy> -__ESIMD_API std::enable_if_t::value, - __ESIMD_NS::simd> -lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> +__ESIMD_API __ESIMD_NS::simd +lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_atomic_update( - __ESIMD_DNS::accessorToPointer(acc), offsets, src0, pred); -#else detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); constexpr __ESIMD_NS::native::lsc::atomic_op _Op = @@ -1380,16 +1378,16 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using _MsgT = typename detail::lsc_expand_type::type; - auto si = __ESIMD_NS::get_surface_index(acc); + __ESIMD_NS::simd addrs = reinterpret_cast(p); + addrs += convert(offsets); __ESIMD_NS::simd<_MsgT, N> Tmp = - __esimd_lsc_xatomic_bti_1<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset, - _DS, _VS, _Transposed, N>( - pred.data(), offsets.data(), src0.data(), si); + __esimd_lsc_xatomic_stateless_1<_MsgT, _Op, L1H, L3H, _AddressScale, + _ImmOffset, _DS, _VS, _Transposed, N>( + pred.data(), addrs.data(), src0.data()); return detail::lsc_format_ret(Tmp); -#endif } -/// Accessor-based atomic. +/// USM pointer atomic. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_atomic_.ugm /// @@ -1399,8 +1397,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @tparam AccessorTy is the \ref sycl::accessor type. -/// @param acc is the SYCL accessor. +/// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. /// @param src1 is the second atomic operand. 
@@ -1408,17 +1405,11 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, - typename AccessorTy> -__ESIMD_API std::enable_if_t::value, - __ESIMD_NS::simd> -lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> +__ESIMD_API __ESIMD_NS::simd +lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_atomic_update( - __ESIMD_DNS::accessorToPointer(acc), offsets, src0, src1, pred); -#else detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); constexpr __ESIMD_NS::native::lsc::atomic_op _Op = @@ -1433,16 +1424,16 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using _MsgT = typename detail::lsc_expand_type::type; - auto si = __ESIMD_NS::get_surface_index(acc); + __ESIMD_NS::simd addrs = reinterpret_cast(p); + addrs += convert(offsets); __ESIMD_NS::simd<_MsgT, N> Tmp = - __esimd_lsc_xatomic_bti_2<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset, - _DS, _VS, _Transposed, N>( - pred.data(), offsets.data(), src0.data(), src1.data(), si); + __esimd_lsc_xatomic_stateless_2<_MsgT, _Op, L1H, L3H, _AddressScale, + _ImmOffset, _DS, _VS, _Transposed, N>( + pred.data(), addrs.data(), src0.data(), src1.data()); return detail::lsc_format_ret(Tmp); -#endif } -/// USM pointer atomic. +/// Accessor-based atomic. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_atomic_.ugm /// @@ -1452,16 +1443,23 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @param p is the base pointer. +/// @tparam AccessorTy is the \ref sycl::accessor type. +/// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. /// @param pred is predicates. 
/// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> -__ESIMD_API __ESIMD_NS::simd -lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + typename AccessorTy> +__ESIMD_API std::enable_if_t::value, + __ESIMD_NS::simd> +lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + return lsc_atomic_update( + __ESIMD_DNS::accessorToPointer(acc), offsets, pred); +#else detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); constexpr __ESIMD_NS::native::lsc::atomic_op _Op = @@ -1476,16 +1474,16 @@ lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using _MsgT = typename detail::lsc_expand_type::type; - __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); + auto si = __ESIMD_NS::get_surface_index(acc); __ESIMD_NS::simd<_MsgT, N> Tmp = - __esimd_lsc_xatomic_stateless_0<_MsgT, _Op, L1H, L3H, _AddressScale, - _ImmOffset, _DS, _VS, _Transposed, N>( - pred.data(), addrs.data()); + __esimd_lsc_xatomic_bti_0<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset, + _DS, _VS, _Transposed, N>(pred.data(), + offsets.data(), si); return detail::lsc_format_ret(Tmp); +#endif } -/// USM pointer atomic. +/// Accessor-based atomic. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_atomic_.ugm /// @@ -1495,17 +1493,24 @@ lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @param p is the base pointer. +/// @tparam AccessorTy is the \ref sycl::accessor type. +/// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. /// @param pred is predicates. /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> -__ESIMD_API __ESIMD_NS::simd -lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + typename AccessorTy> +__ESIMD_API std::enable_if_t::value, + __ESIMD_NS::simd> +lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + return lsc_atomic_update( + __ESIMD_DNS::accessorToPointer(acc), offsets, src0, pred); +#else detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); constexpr __ESIMD_NS::native::lsc::atomic_op _Op = @@ -1520,16 +1525,16 @@ lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using _MsgT = typename detail::lsc_expand_type::type; - __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); + auto si = __ESIMD_NS::get_surface_index(acc); __ESIMD_NS::simd<_MsgT, N> Tmp = - __esimd_lsc_xatomic_stateless_1<_MsgT, _Op, L1H, L3H, _AddressScale, - _ImmOffset, _DS, _VS, _Transposed, N>( - pred.data(), addrs.data(), src0.data()); + __esimd_lsc_xatomic_bti_1<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset, + _DS, _VS, _Transposed, N>( + pred.data(), offsets.data(), src0.data(), si); return detail::lsc_format_ret(Tmp); +#endif } -/// USM pointer atomic. 
+/// Accessor-based atomic. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_atomic_.ugm /// @@ -1539,7 +1544,8 @@ lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. -/// @param p is the base pointer. +/// @tparam AccessorTy is the \ref sycl::accessor type. +/// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. /// @param src1 is the second atomic operand. @@ -1547,11 +1553,17 @@ lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> -__ESIMD_API __ESIMD_NS::simd -lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, + cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + typename AccessorTy> +__ESIMD_API std::enable_if_t::value, + __ESIMD_NS::simd> +lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + return lsc_atomic_update( + __ESIMD_DNS::accessorToPointer(acc), offsets, src0, src1, pred); +#else detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); constexpr __ESIMD_NS::native::lsc::atomic_op _Op = @@ -1566,13 +1578,13 @@ lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using _MsgT = typename detail::lsc_expand_type::type; - __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); + auto si = __ESIMD_NS::get_surface_index(acc); __ESIMD_NS::simd<_MsgT, N> Tmp = - __esimd_lsc_xatomic_stateless_2<_MsgT, _Op, L1H, L3H, _AddressScale, - _ImmOffset, _DS, _VS, _Transposed, N>( - pred.data(), addrs.data(), src0.data(), src1.data()); + __esimd_lsc_xatomic_bti_2<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset, + _DS, _VS, _Transposed, N>( + pred.data(), offsets.data(), src0.data(), src1.data(), si); return detail::lsc_format_ret(Tmp); +#endif } /// Memory fence.
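
Note (usage sketch, not part of the patch): the hunks above move the USM-pointer overloads of lsc_gather / lsc_block_load / lsc_prefetch / lsc_scatter / lsc_block_store / lsc_atomic_update ahead of the accessor-based ones and thread an optional predicate (defaulting to 1) through the block load/store forms, so that the __ESIMD_FORCE_STATELESS_MEM accessor paths can forward to the pointer overloads. Below is a minimal kernel-side sketch of how those USM overloads are called. It is an assumption-laden illustration, not code from the patch: the include paths, namespace aliases, the function name lsc_usm_demo, the buffers buf/cnt, and the element counts and cache-hint choices are all illustrative; only the API names and parameter orders come from the diff.

```cpp
// Illustrative ESIMD kernel body (hypothetical helper; assumes buf points to
// at least 32 floats and cnt to at least 16 uint32_t counters in USM).
#include <sycl/sycl.hpp>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>

namespace esimd = sycl::ext::intel::esimd;
namespace lsc = sycl::ext::intel::experimental::esimd;

SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void lsc_usm_demo(float *buf, uint32_t *cnt) {
  // Per-channel byte offsets for 16 channels: 0, 4, 8, ...
  esimd::simd<uint32_t, 16> byte_offsets(0, (uint32_t)sizeof(float));
  esimd::simd_mask<16> pred = 1; // all lanes enabled

  // Non-transposed gather: one element per channel, N deduced from offsets.
  esimd::simd<float, 16> vals = lsc::lsc_gather<float>(buf, byte_offsets, pred);

  // Transposed (block) load of 16 contiguous elements; the simd_mask<1>
  // predicate added by this patch defaults to 1, so old call sites still work.
  esimd::simd<float, 16> blk = lsc::lsc_block_load<float, 16>(buf);

  // Software prefetch of the next block; explicit cache hints are used here
  // because cached/cached is a known-valid prefetch hint combination.
  lsc::lsc_prefetch<float, 16, lsc::lsc_data_size::default_size,
                    lsc::cache_hint::cached, lsc::cache_hint::cached>(buf + 16);

  // Scatter and block store mirror the two load forms.
  lsc::lsc_scatter<float>(buf, byte_offsets, vals + blk, pred);
  lsc::lsc_block_store<float, 16>(buf + 16, blk);

  // Zero-operand atomic (per-lane increment) on the USM counters; the old
  // per-lane values are returned.
  esimd::simd<uint32_t, 16> cnt_offsets(0, (uint32_t)sizeof(uint32_t));
  auto old = lsc::lsc_atomic_update<esimd::atomic_op::inc>(cnt, cnt_offsets, pred);
  (void)old;
}
```

The gather/scatter/prefetch/atomic forms take a per-channel simd_mask<N> (N deduced from the offsets vector), whereas the transposed block load/store forms take a simd_mask<1>; that is why the sketch passes the full 16-lane mask only to the per-channel operations and relies on the default predicate for lsc_block_load / lsc_block_store.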