diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index d262586e1f077..8669f3ba0600f 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -246,6 +246,335 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal( } #endif // __SYCL_DEVICE_ONLY__ +#ifndef __SYCL_DEVICE_ONLY__ +// Shared utility/helper functions for LSC support under emulation +// (ESIMD_EMULATOR backend) + +// Raw-address increment function for u8u32 and u16u32 +template +constexpr uint32_t rawAddressIncrement() { + if constexpr (DS == __ESIMD_ENS::lsc_data_size::u8u32) { + return 1; + } else if constexpr (DS == __ESIMD_ENS::lsc_data_size::u16u32) { + return 2; + } else { + return (uint32_t)sizeof(Ty); + } +} + +// Vector index increment function for 'Transposed' 2D-surface access +template +constexpr int vectorIndexIncrement() { + if constexpr (_Transposed == __ESIMD_EDNS::lsc_data_order::transpose) { + return 1; + } else { + return N; + } +} + +// Load/Store align bitmask generator for 1-D vector load/store +// +// Not only generates address-align bitmask, but also checks +// legitimacy of load/store operation with respect to vector size, +// data size +/// @tparam Ty is element type. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements to load per address. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +template +constexpr unsigned loadstoreAlignMask() { + constexpr __ESIMD_ENS::lsc_data_size _DS = + __ESIMD_EDNS::finalize_data_size(); // Actual data_size + + if constexpr (VS == __ESIMD_EDNS::lsc_vector_size::n1) { + static_assert(((_DS == __ESIMD_ENS::lsc_data_size::u32) || + (_DS == __ESIMD_ENS::lsc_data_size::u64) || + (_DS == __ESIMD_ENS::lsc_data_size::u8) || + (_DS == __ESIMD_ENS::lsc_data_size::u16) || + (_DS == __ESIMD_ENS::lsc_data_size::u8u32) || + (_DS == __ESIMD_ENS::lsc_data_size::u16u32)) && + "Wrong __ESIMD_EDNS::lsc_data_size for " + "__ESIMD_EDNS::lsc_vector_size == 1\n" + "(loadstoreAlignMask)"); + return 0x0; + } else if constexpr ((VS == __ESIMD_EDNS::lsc_vector_size::n2) || + (VS == __ESIMD_EDNS::lsc_vector_size::n3) || + (VS == __ESIMD_EDNS::lsc_vector_size::n4) || + (VS == __ESIMD_EDNS::lsc_vector_size::n8)) { + static_assert( + ((_DS == __ESIMD_ENS::lsc_data_size::u32) || + (_DS == __ESIMD_ENS::lsc_data_size::u64)) && + "Wrong Data Size for __ESIMD_EDNS::lsc_vector_size == 2/3/4/8\n" + "(loadstoreAlignMask)"); + // 0x3 for u32 / 0x7 for u64 + if constexpr (_DS == __ESIMD_ENS::lsc_data_size::u32) + return 0x3; + else + return 0x7; + } else if constexpr ((VS == __ESIMD_EDNS::lsc_vector_size::n16) || + (VS == __ESIMD_EDNS::lsc_vector_size::n32) || + (VS == __ESIMD_EDNS::lsc_vector_size::n64)) { + static_assert( + (N == 1) && + "Unsupported Size for __ESIMD_EDNS::lsc_vector_size = 16/32/64\n" + "(loadstoreAlignMask)"); + // 0x3 for u32 / 0x7 for u64 + if constexpr (_DS == __ESIMD_ENS::lsc_data_size::u32) + return 0x3; + else + return 0x7; + } else { + static_assert((N != N) && "Wrong Vector Size!!"); + } +} + +// Helper function for loading from indexed-surface and SLM +// INT_MAX is for SLM +template +auto __esimd_emu_lsc_offset_read( + __ESIMD_DNS::simd_mask_storage_t Pred, + __ESIMD_DNS::vector_type_t Offsets, char *ReadBase, + int BufByteWidth = INT_MAX) { + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + __ESIMD_DNS::vector_type_t()> Output = 0; + + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); + + for (int OffsetIdx = 0; OffsetIdx < N; OffsetIdx += 1) { + if (Pred[OffsetIdx] == 0) { + // Skip Output vector elements correpsonding to + // predicates whose value is zero + continue; + } + + assert(((Offsets[OffsetIdx] & MASK)) == 0 && "Offset Alignment Error!!"); + + // ByteDistance : byte-distance from buffer-read base + int ByteDistance = Offsets[OffsetIdx]; + + for (int ChanelIdx = 0, VecIdx = OffsetIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + + if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) { + Output[VecIdx] = *((Ty *)(ReadBase + ByteDistance)); + } + } + } + return Output; +} + +// Helper function for storing to indexed-surface and SLM. INT_MAX is +// for SLM +template +void __esimd_emu_lsc_offset_write( + __ESIMD_DNS::simd_mask_storage_t Pred, + __ESIMD_DNS::vector_type_t Offsets, + __ESIMD_DNS::vector_type_t()> vals, + char *WriteBase, int BufByteWidth = INT_MAX) { + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + using StoreType = typename std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8, uint8_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u16, uint16_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u32, uint32_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u64, uint64_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8u32, uint8_t, + std::conditional_t>>>>>; + + for (int OffsetIdx = 0; OffsetIdx < N; OffsetIdx += 1) { + if (Pred[OffsetIdx] == 0) { + // Skip input vector elements correpsonding to + // predicates whose value is zero + continue; + } + + assert(((Offsets[OffsetIdx] & MASK)) == 0 && "Offset Alignment Error!!"); + + // ByteDistance : byte-distance from buffer-write base + int ByteDistance = Offsets[OffsetIdx]; + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); + + for (int ChanelIdx = 0, VecIdx = OffsetIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + + if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) { + *((StoreType *)(WriteBase + ByteDistance)) = vals[VecIdx]; + } + } + } +} + +/// Generic helper function of 2D Block Read supporting both 2d-load +/// and raw_send +template +__ESIMD_DNS::vector_type_t +__esimd_emu_read_2d(__ESIMD_DNS::simd_mask_storage_t Pred, uintptr_t Ptr, + unsigned SurfaceWidth, unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y, int Width, int Height, + int NBlks, __ESIMD_EDNS::lsc_data_order _Transposed, + bool Transformed) { + assert(SurfaceHeight >= 0); + assert(SurfaceWidth >= 0); + assert(SurfaceWidth <= SurfacePitch); + + SurfaceHeight += 1; + SurfaceWidth += 1; + SurfacePitch += 1; + + constexpr unsigned sizeofTy = sizeof(Ty); + + __ESIMD_DNS::vector_type_t Output = 0; + + char *buff = (char *)Ptr; + assert(buff != NULL); + + int vecIdx = 0; + int blkCount = 0; + + for (int xBase = X * sizeofTy; blkCount < NBlks; xBase += sizeofTy * Width) { + if (Transformed == true) { + constexpr int elems_per_DW = (sizeofTy == 1) ? 4 : 2; /// VNNI_pack + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; + u += elems_per_DW, yRead += SurfacePitch * elems_per_DW) { + vecIdx = u * sycl::detail::getNextPowerOfTwo(Width) + + blkCount * Height * sycl::detail::getNextPowerOfTwo(Width); + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + /// Vertically out-of-bound, skip corresponding vector elements + vecIdx += Width * elems_per_DW; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofTy) { + if ((xRead < 0) || (xRead >= SurfaceWidth)) { + /// Horizontally out-of-bound, skip corresponding vector elements + vecIdx += elems_per_DW; + continue; + } + + char *base = buff + xRead; + int offset = yRead; + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + if (Pred[vecIdx] != 0) { + if (offset >= 0 && offset < SurfacePitch * SurfaceHeight) { + Output[vecIdx] = *((Ty *)(base + offset)); + } + } + // Increasing in Y-direction + offset += SurfacePitch; + } // k loop + } // v loop + } // u loop + } // (Transformed == true) + else if (_Transposed == __ESIMD_EDNS::lsc_data_order::transpose) { + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofTy) { + if ((xRead < 0) || (xRead >= SurfaceWidth)) { + // Horizontally out-of-bound, skip corresponding vector elements + vecIdx += Height; + continue; + } + + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; + u += 1, yRead += SurfacePitch, vecIdx += 1) { + if (Pred[vecIdx] != 0) { + if ((yRead >= 0) && (yRead < SurfacePitch * SurfaceHeight)) { + Output[vecIdx] = *((Ty *)(buff + yRead + xRead)); + } + } + } // u loop + } // v loop + } // (_Transposed == __ESIMD_EDNS::lsc_data_order::transpose) + else { + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; u += 1, yRead += SurfacePitch) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + // Vertically Out-of-bound, skip corresponding vector elements + vecIdx += Width; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofTy, vecIdx += 1) { + if (Pred[vecIdx] != 0) { + if ((xRead >= 0) && (xRead < SurfaceWidth)) { + Output[vecIdx] = *((Ty *)(buff + yRead + xRead)); + } + } + } // v loop + } // u loop + } // Linear loading + blkCount += 1; + vecIdx = blkCount * sycl::detail::getNextPowerOfTwo(Width) * Height; + } // xBase loop + + return Output; +} + +/// Generic helper function of 2D Block Write supporting both +/// 2d-write and raw_send +template +void __esimd_emu_write_2d(__ESIMD_DNS::simd_mask_storage_t Pred, + uintptr_t Ptr, unsigned SurfaceWidth, + unsigned SurfaceHeight, unsigned SurfacePitch, int X, + int Y, __ESIMD_DNS::vector_type_t vals, + int Width, int Height) { + assert(SurfaceHeight >= 0); + assert(SurfaceWidth >= 0); + assert(SurfaceWidth <= SurfacePitch); + + SurfaceHeight += 1; + SurfaceWidth += 1; + SurfacePitch += 1; + + constexpr unsigned sizeofTy = sizeof(Ty); + + char *buff = (char *)Ptr; + assert(buff != NULL); + + int vecIdx = 0; + int rowCount = 0; + for (int yWrite = Y * SurfacePitch; rowCount < Height; + yWrite += SurfacePitch) { + if (yWrite == SurfacePitch * SurfaceHeight) { + // Vertically Out-of-bound + break; + } + int writeCount = 0; + for (int xWrite = X * sizeofTy; writeCount < Width; + xWrite += sizeofTy, vecIdx += 1, writeCount += 1) { + if (xWrite >= 0 && xWrite < SurfaceWidth && Pred[vecIdx] != 0) { + *((Ty *)(buff + yWrite + xWrite)) = vals[vecIdx]; + } + } // xWrite loop + rowCount += 1; + } // yWrite loop +} + +#endif + /// SLM gather. /// Supported platforms: DG2, PVC /// @@ -260,7 +589,7 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal( /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets for SLM buffer in bytes. /// @return is a vector of type T and size N * to_int() @@ -275,8 +604,13 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + return __esimd_emu_lsc_offset_read()>( + pred, offsets, I->__cm_emu_get_slm_ptr()); } #endif // __SYCL_DEVICE_ONLY__ @@ -294,7 +628,7 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -313,8 +647,21 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + char *readBase; + uint32_t width; + std::mutex *mutexLock; + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->sycl_get_cm_buffer_params_ptr(surf_ind, &readBase, &width, &mutexLock); + + std::lock_guard lock(*mutexLock); + + return __esimd_emu_lsc_offset_read()>( + pred, offsets, readBase, width); } #endif // __SYCL_DEVICE_ONLY__ @@ -332,7 +679,7 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the load addresses. /// @return is a vector of type T and N * to_int() @@ -347,8 +694,36 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + __ESIMD_DNS::vector_type_t()> Output = 0; + + for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) { + if (pred[AddrIdx] == 0) { + // Skip Output vector elements correpsonding to + // predicates whose value is zero + continue; + } + + constexpr uint MASK = loadstoreAlignMask(); + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); + + int ByteDistance = 0; + uintptr_t BaseAddr = addrs[AddrIdx]; + + assert(((BaseAddr & MASK)) == 0 && "Address Alignment Error!!"); + + for (int ChanelIdx = 0, VecIdx = AddrIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + + Output[VecIdx] = *((Ty *)(BaseAddr + ByteDistance)); + } + } + return Output; } #endif // __SYCL_DEVICE_ONLY__ @@ -365,7 +740,7 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -383,7 +758,8 @@ __esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // Prefetch is NOP under ESIMD_EMULATOR + return; } #endif // __SYCL_DEVICE_ONLY__ @@ -400,7 +776,7 @@ __esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. template pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // Prefetch is NOP under ESIMD_EMULATOR + return; } #endif // __SYCL_DEVICE_ONLY__ @@ -431,7 +808,7 @@ __esimd_lsc_prefetch_stateless(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets for SLM buffer in bytes. /// @param vals is values to store. @@ -447,7 +824,12 @@ __ESIMD_INTRIN void __esimd_lsc_store_slm( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + __esimd_emu_lsc_offset_write()>( + pred, offsets, vals, I->__cm_emu_get_slm_ptr()); } #endif // __SYCL_DEVICE_ONLY__ @@ -464,7 +846,7 @@ __ESIMD_INTRIN void __esimd_lsc_store_slm( /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -484,7 +866,20 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + char *writeBase; + uint32_t width; + std::mutex *mutexLock; + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->sycl_get_cm_buffer_params_ptr(surf_ind, &writeBase, &width, &mutexLock); + + std::lock_guard lock(*mutexLock); + + __esimd_emu_lsc_offset_write()>( + pred, offsets, vals, writeBase, width); } #endif // __SYCL_DEVICE_ONLY__ @@ -501,7 +896,7 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param vals is values to store. @@ -517,7 +912,46 @@ __ESIMD_INTRIN void __esimd_lsc_store_stateless( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + using StoreType = typename std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8, uint8_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u16, uint16_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u32, uint32_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u64, uint64_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8u32, uint8_t, + std::conditional_t>>>>>; + + for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) { + if (pred[AddrIdx] == 0) { + // Skip Output vector elements correpsonding to + // predicates whose value is zero + continue; + } + + constexpr uint MASK = loadstoreAlignMask(); + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); + + int ByteDistance = 0; + uintptr_t BaseAddr = addrs[AddrIdx]; + + assert(((BaseAddr & MASK)) == 0 && "Address Alignment Error!!"); + + for (int ChanelIdx = 0, VecIdx = AddrIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + *((StoreType *)(BaseAddr + ByteDistance)) = vals[VecIdx]; + } + } } #endif // __SYCL_DEVICE_ONLY__ @@ -563,8 +997,11 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + // Template arguments are already checked by + // check_lsc_block_2d_restrictions() + return __esimd_emu_read_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, + SurfacePitch, X, Y, BlockWidth, BlockHeight, + NBlocks, _Transposed, Transformed); } #endif // __SYCL_DEVICE_ONLY__ @@ -603,7 +1040,8 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch2d_stateless( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // Prefetch is NOP under ESIMD_EMULATOR + return; } #endif // __SYCL_DEVICE_ONLY__ @@ -649,7 +1087,11 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // Template arguments are already checked by + // check_lsc_block_2d_restrictions() + __esimd_emu_write_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, + SurfacePitch, X, Y, vals, BlockWidth, + BlockHeight); } #endif // __SYCL_DEVICE_ONLY__ @@ -665,7 +1107,7 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -732,7 +1174,7 @@ __esimd_lsc_xatomic_slm_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -769,7 +1211,7 @@ __esimd_lsc_xatomic_slm_2( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -804,7 +1246,7 @@ __esimd_lsc_xatomic_bti_0(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -842,7 +1284,7 @@ __esimd_lsc_xatomic_bti_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -882,7 +1324,7 @@ __esimd_lsc_xatomic_bti_2( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) + /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param src0 is the first atomic operand. @@ -949,7 +1392,7 @@ __esimd_lsc_xatomic_stateless_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param src0 is the first atomic operand. @@ -980,7 +1423,7 @@ __esimd_lsc_xatomic_stateless_2( /// @tparam Kind is the Sfid shaded function. /// @tparam FenceOp is the fence operation. /// @tparam Scope is the operation scope. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. template <__ESIMD_ENS::lsc_memory_kind Kind, __ESIMD_ENS::lsc_fence_op FenceOp, __ESIMD_ENS::lsc_scope Scope, int N>