@@ -116,6 +116,12 @@ ESIMD_INLINE ESIMD_NODEBUG
116116 addrs.data (), ElemsPerAddrEncoding<ElemsPerAddr>(), pred.data ());
117117}
118118
119+ // TODO bring this SVM-based scatter/gather interface in accordance with
120+ // accessor-based ones - remove the ElemsPerAddr template parameter as it is
121+ // redundant: the only allowed block size in the underlying BE intrinsics is 1
122+ // byte with max number of blocks being 4. This means T template parameter alone
123+ // can model all supported cases.
124+
119125// / flat-address scatter
120126template <typename T, int n, int ElemsPerAddr = 1 ,
121127 CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
@@ -238,6 +244,148 @@ ESIMD_INLINE ESIMD_NODEBUG void block_store(AccessorTy acc, uint32_t offset,
238244#endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__
239245}
240246
247+ // / Accessor-based gather. Collects elements located at given offsets in
248+ // / an accessor and returns them as a single \ref simd object. An element can be
249+ // / 1, 2 or 4-byte value.
250+ // / Template (compile-time constant) parameters:
251+ // / @tparam T - element type; can only be a 1,2,4-byte integer or \c float,
252+ // / @tparam N - the number of elements
253+ // / @tparam AccessorTy - \ref sycl::accessor type
254+ // / @tparam L1H - L1 cache hint
255+ // / @tparam L3H - L3 cache hint
256+ // /
257+ // / Formal parameters:
258+ // / @param acc - the accessor to gather from
259+ // / @param offsets - per-element offsets
260+ // / @param glob_offset - offset added to each individual element's offset to
261+ // / compute actual memory access offset for that element
262+ // /
263+ template <typename T, int N, typename AccessorTy,
264+ CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
265+ ESIMD_INLINE ESIMD_NODEBUG
266+ typename std::enable_if<(sizeof (T) <= 4 ) && (N == 1 || N == 8 || N == 16 ) &&
267+ !std::is_pointer<AccessorTy>::value,
268+ simd<T, N>>::type
269+ gather (AccessorTy acc, simd<uint32_t , N> offsets,
270+ uint32_t glob_offset = 0 ) {
271+
272+ constexpr int TypeSizeLog2 =
273+ sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof (T)>();
274+ // TODO (performance) use hardware-supported scale once BE supports it
275+ constexpr uint32_t scale = 0 ;
276+ constexpr uint32_t t_scale = sizeof (T);
277+ if constexpr (t_scale > 1 ) {
278+ glob_offset *= t_scale;
279+ offsets *= t_scale;
280+ }
281+
282+ if constexpr (sizeof (T) < 4 ) {
283+ static_assert (std::is_integral<T>::value,
284+ " only integral 1- & 2-byte types are supported" );
285+ using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t ,
286+ uint32_t >::type;
287+ #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
288+ const auto surf_ind = AccessorPrivateProxy::getNativeImageObj (acc);
289+ const simd<PromoT, N> promo_vals =
290+ __esimd_surf_read<PromoT, N, decltype (surf_ind), TypeSizeLog2, L1H,
291+ L3H>(scale, surf_ind, glob_offset, offsets);
292+ #else
293+ const simd<PromoT, N> promo_vals =
294+ __esimd_surf_read<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
295+ scale, acc, glob_offset, offsets);
296+ #endif
297+ return sycl::INTEL::gpu::convert<T>(promo_vals);
298+ } else {
299+ #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
300+ const auto surf_ind = AccessorPrivateProxy::getNativeImageObj (acc);
301+ return __esimd_surf_read<T, N, decltype (surf_ind), TypeSizeLog2, L1H, L3H>(
302+ scale, surf_ind, glob_offset, offsets);
303+ #else
304+ return __esimd_surf_read<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
305+ scale, acc, glob_offset, offsets);
306+ #endif
307+ }
308+ }
309+
310+ // / Accessor-based scatter. Writes elements of a \ref simd object into an
311+ // / accessor at given offsets. An element can be 1, 2 or 4-byte value.
312+ // / Template (compile-time constant) parameters:
313+ // / @tparam T - element type; can only be a 1,2,4-byte integer or \c float,
314+ // / @tparam N - the number of elements
315+ // / @tparam AccessorTy - \ref sycl::accessor type
316+ // / @tparam L1H - L1 cache hint
317+ // / @tparam L3H - L3 cache hint
318+ // /
319+ // / Formal parameters:
320+ // / @param acc - the accessor to scatter to
321+ // / @param vals - values to write
322+ // / @param offsets - per-element offsets
323+ // / @param glob_offset - offset added to each individual element's offset to
324+ // / compute actual memory access offset for that element
325+ // / @param pred - per-element predicates; elements with zero corresponding
326+ // / predicates are not written
327+ // /
328+ template <typename T, int N, typename AccessorTy,
329+ CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
330+ ESIMD_INLINE ESIMD_NODEBUG
331+ typename std::enable_if<(sizeof (T) <= 4 ) && (N == 1 || N == 8 || N == 16 ) &&
332+ !std::is_pointer<AccessorTy>::value,
333+ void >::type
334+ scatter (AccessorTy acc, simd<T, N> vals, simd<uint32_t , N> offsets,
335+ uint32_t glob_offset = 0 , simd<uint16_t , N> pred = 1 ) {
336+
337+ constexpr int TypeSizeLog2 =
338+ sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof (T)>();
339+ // TODO (performance) use hardware-supported scale once BE supports it
340+ constexpr uint32_t scale = 0 ;
341+ constexpr uint32_t t_scale = sizeof (T);
342+ if constexpr (t_scale > 1 ) {
343+ glob_offset *= t_scale;
344+ offsets *= t_scale;
345+ }
346+
347+ if constexpr (sizeof (T) < 4 ) {
348+ static_assert (std::is_integral<T>::value,
349+ " only integral 1- & 2-byte types are supported" );
350+ using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t ,
351+ uint32_t >::type;
352+ const simd<PromoT, N> promo_vals = sycl::INTEL::gpu::convert<PromoT>(vals);
353+ #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
354+ const auto surf_ind = AccessorPrivateProxy::getNativeImageObj (acc);
355+ __esimd_surf_write<PromoT, N, decltype (surf_ind), TypeSizeLog2, L1H, L3H>(
356+ pred, scale, surf_ind, glob_offset, offsets, promo_vals);
357+ #else
358+ __esimd_surf_write<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
359+ pred, scale, acc, glob_offset, offsets, promo_vals);
360+ #endif
361+ } else {
362+ #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
363+ const auto surf_ind = AccessorPrivateProxy::getNativeImageObj (acc);
364+ __esimd_surf_write<T, N, decltype (surf_ind), TypeSizeLog2, L1H, L3H>(
365+ pred, scale, surf_ind, glob_offset, offsets, vals);
366+ #else
367+ __esimd_surf_write<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
368+ pred, scale, acc, glob_offset, offsets, vals);
369+ #endif
370+ }
371+ }
372+
373+ // / Load a scalar value from an accessor.
374+ template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
375+ CacheHint L3H = CacheHint::None>
376+ ESIMD_INLINE ESIMD_NODEBUG T scalar_load (AccessorTy acc, uint32_t offset) {
377+ const simd<T, 1 > Res = gather<T>(acc, simd<uint32_t , 1 >{offset});
378+ return Res[0 ];
379+ }
380+
381+ // / Store a scalar value into an accessor.
382+ template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
383+ CacheHint L3H = CacheHint::None>
384+ ESIMD_INLINE ESIMD_NODEBUG void scalar_store (AccessorTy acc, uint32_t offset,
385+ T val) {
386+ scatter<T>(acc, simd<T, 1 >{val}, simd<uint32_t , 1 >{offset});
387+ }
388+
241389// TODO @jasonsewall-intel
242390// Don't use '4' in the name - instead either make it a parameter or
243391// (if it must be constant) - try to deduce from other arguments.
0 commit comments