From 15f65f663504b0d2897b52a8c9a292859f43718f Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Tue, 26 Apr 2022 11:24:50 -0700 Subject: [PATCH 1/5] [SYCL][ESIMD][EMU] LSC support for ESIMD_EMULATOR backend 36 out of 40 LSC tests are passing --- .../esimd/detail/memory_intrin.hpp | 533 +++++++++++++++++- 1 file changed, 518 insertions(+), 15 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index d262586e1f077..379e202d14755 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -246,6 +246,395 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal( } #endif // __SYCL_DEVICE_ONLY__ +#ifndef __SYCL_DEVICE_ONLY__ +// Shared utility/helper functions for LSC support under emulation +// (ESIMD_EMULATOR backend) + +// Raw-address increment function for u8u32 and u16u32 +template +constexpr uint32_t rawAddressIncrement() { + if constexpr (DS == __ESIMD_ENS::lsc_data_size::u8u32) { + return 1; + } else if constexpr (DS == __ESIMD_ENS::lsc_data_size::u16u32) { + return 2; + } else { + return (uint32_t)sizeof(Ty); + } +} + +// Vector index increment function for 'Transposed' 2D-surface access +template +constexpr int vectorIndexIncrement() { + if constexpr (_Transposed == __ESIMD_EDNS::lsc_data_order::transpose) { + return 1; + } else { + return N; + } +} + +// Load/Store align bitmask generator for 1-D vector load/store +// +// Not only generates address-align bitmask, but also checks +// legitimacy of load/store operation with respect to vector size, +// data size, and SIMT +template +constexpr unsigned loadstoreAlignMask() { + constexpr __ESIMD_ENS::lsc_data_size _DS = + __ESIMD_EDNS::finalize_data_size(); // Actual data_size + + if constexpr (VS == __ESIMD_EDNS::lsc_vector_size::n1) { + static_assert(((_DS == __ESIMD_ENS::lsc_data_size::u32) || + (_DS == __ESIMD_ENS::lsc_data_size::u64) || + (_DS == __ESIMD_ENS::lsc_data_size::u8) || + (_DS == __ESIMD_ENS::lsc_data_size::u16) || + (_DS == __ESIMD_ENS::lsc_data_size::u8u32) || + (_DS == __ESIMD_ENS::lsc_data_size::u16u32)) && + "Wrong __ESIMD_EDNS::lsc_data_size for " + "__ESIMD_EDNS::lsc_vector_size == 1\n" + "(loadstoreAlignMask)"); + return 0x0; + } else if constexpr ((VS == __ESIMD_EDNS::lsc_vector_size::n2) || + (VS == __ESIMD_EDNS::lsc_vector_size::n3) || + (VS == __ESIMD_EDNS::lsc_vector_size::n4) || + (VS == __ESIMD_EDNS::lsc_vector_size::n8)) { + static_assert( + ((_DS == __ESIMD_ENS::lsc_data_size::u32) || + (_DS == __ESIMD_ENS::lsc_data_size::u64)) && + "Wrong Data Size for __ESIMD_EDNS::lsc_vector_size == 2/3/4/8\n" + "(loadstoreAlignMask)"); + // 0x3 for u32 / 0x7 for u64 + if constexpr (_DS == __ESIMD_ENS::lsc_data_size::u32) + return 0x3; + else + return 0x7; + } else if constexpr ((VS == __ESIMD_EDNS::lsc_vector_size::n16) || + (VS == __ESIMD_EDNS::lsc_vector_size::n32) || + (VS == __ESIMD_EDNS::lsc_vector_size::n64)) { + static_assert( + (N == 1) && + "Unsupported SIMT Size for __ESIMD_EDNS::lsc_vector_size = 16/32/64\n" + "(loadstoreAlignMask)"); + // 0x3 for u32 / 0x7 for u64 + if constexpr (_DS == __ESIMD_ENS::lsc_data_size::u32) + return 0x3; + else + return 0x7; + } else { + static_assert((N != N) && "Wrong Vector Size!!"); + } +} + +// Helper function for loading from indexed-surface and SLM +// INT_MAX is for SLM +template +auto __esimd_emu_lsc_offset_read( + __ESIMD_DNS::simd_mask_storage_t Pred, + __ESIMD_DNS::vector_type_t Offsets, char *ReadBase, + int BufByteWidth = INT_MAX) { + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + __ESIMD_DNS::vector_type_t()> Output = 0; + + constexpr int ElemCount = __ESIMD_EDNS::to_int(); + + for (int OffsetIdx = 0; OffsetIdx < N; OffsetIdx += 1) { + if (Pred[OffsetIdx] == 0) { + // Skip Output vector elements correpsonding to + // predicates whose value is zero + continue; + } + + assert(((Offsets[OffsetIdx] & MASK)) == 0 && "Offset Alignment Error!!"); + + // ByteDistance : byte-distance from buffer-read base + int ByteDistance = Offsets[OffsetIdx]; + + for (int ElemIdx = 0, VecIdx = OffsetIdx; ElemIdx < ElemCount; ElemIdx += 1, + ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + + if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) { + Output[VecIdx] = *((Ty *)(ReadBase + ByteDistance)); + } + } + } + return Output; +} + +// Helper function for storing to indexed-surface and SLM. INT_MAX is +// for SLM +template +void __esimd_emu_lsc_offset_write( + __ESIMD_DNS::simd_mask_storage_t Pred, + __ESIMD_DNS::vector_type_t Offsets, + __ESIMD_DNS::vector_type_t()> vals, + char *WriteBase, int BufByteWidth = INT_MAX) { + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + using StoreType = typename std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8, uint8_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u16, uint16_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u32, uint32_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u64, uint64_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8u32, uint8_t, + std::conditional_t>>>>>; + + constexpr int ElemCount = __ESIMD_EDNS::to_int(); + + for (int OffsetIdx = 0; OffsetIdx < N; OffsetIdx += 1) { + if (Pred[OffsetIdx] == 0) { + // Skip input vector elements correpsonding to + // predicates whose value is zero + continue; + } + + assert(((Offsets[OffsetIdx] & MASK)) == 0 && "Offset Alignment Error!!"); + + // ByteDistance : byte-distance from buffer-write base + int ByteDistance = Offsets[OffsetIdx]; + + for (int ElemIdx = 0, VecIdx = OffsetIdx; ElemIdx < ElemCount; ElemIdx += 1, + ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + + if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) { + *((StoreType *)(WriteBase + ByteDistance)) = vals[VecIdx]; + } + } + } +} + +/// Stateless-2d operations +/// Template argument check for 2D-load/store +template +constexpr void loadstore2DArgumentCheck() { + const __ESIMD_ENS::lsc_data_size _DS = __ESIMD_EDNS::finalize_data_size< + T, __ESIMD_ENS::lsc_data_size::default_size>(); + static_assert(__ESIMD_DNS::isPowerOf2(NBlks) && (NBlks * sizeof(T) <= 8), + "NBlks must be power of 2 and less than or equal to 4!!"); + + if constexpr (isStore == true) { + static_assert(NBlks == 1, "Mutliple Blocks are not allowed for 2D store!!"); + static_assert(Transposed == __ESIMD_EDNS::lsc_data_order::nontranspose, + "No Transposed 2D store!!"); + static_assert(Transformed == false, "No Transformed 2D store!!"); + static_assert((Height >= 1) && (Height <= 32), + "Invalid Height for 2D store!! H > 32 or H == 0"); + + static_assert((Width * sizeof(T) >= 4) && (Width * sizeof(T) <= 64), + "Invalid Width for 2D store!!"); + + static_assert(sycl::detail::getNextPowerOfTwo(Width * sizeof(T)) * Height <= + 512, + "Invalid Width * Height combination!!"); + } else // isStore == false + { + // Restriction : Width * NBlks + static_assert( + Width * NBlks * sizeof(T) <= 64, + "Invalid Width/NBlks combination!! (W * NBlks * sizeof(T) > 64)"); + + static_assert( + ((Transposed == __ESIMD_EDNS::lsc_data_order::transpose) & + Transformed) != true, + "Transpose and Transform cannot be used together for 2D-load!!"); + + if constexpr (Transformed == false) { + if constexpr (Transposed == __ESIMD_EDNS::lsc_data_order::transpose) { + static_assert(NBlks == 1, + "Invalid NBlks for Transposed 2D load!! NBlks != 1"); + } + + static_assert((Height >= 1) && (Height <= 32), + "Invalid Height for Non-transform 2D load!!"); + + static_assert((Width * sizeof(T) >= 4) && (Width * sizeof(T) <= 64), + "Invalid Width for Non-transform 2D load!!"); + } else // Transformed == true + { + static_assert( + (_DS == __ESIMD_ENS::lsc_data_size::u8) || + (_DS == __ESIMD_ENS::lsc_data_size::u16), + "For Transformed 2D read, DataSize must be either U8 or U16"); + + static_assert((Width * sizeof(T) >= 4) && (Width <= 16), + "Invalid Width for Transformed/Non-Transposed 2D load!!"); + static_assert((Height * sizeof(T) >= 4) && (Height <= 32), + "Invalid Height for Transformed/Non-Transposed 2D load!!"); + } + } +} + +/// Generic helper function of 2D Block Read supporting both 2d-load +/// and raw_send +template +__ESIMD_DNS::vector_type_t +__esimd_emu_read_2d(__ESIMD_DNS::simd_mask_storage_t Pred, uintptr_t Ptr, + unsigned SurfaceWidth, unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y, int Width, int Height, + int NBlks, __ESIMD_EDNS::lsc_data_order _Transposed, + bool Transformed) { + assert(SurfaceHeight >= 0); + assert(SurfaceWidth >= 0); + assert(SurfaceWidth <= SurfacePitch); + + SurfaceHeight += 1; + SurfaceWidth += 1; + SurfacePitch += 1; + + constexpr unsigned sizeofTy = sizeof(Ty); + + __ESIMD_DNS::vector_type_t Output = 0; + + char *buff = (char *)Ptr; + assert(buff != NULL); + + int vecIdx = 0; + int blkCount = 0; + + for (int xBase = X * sizeofTy; blkCount < NBlks; xBase += sizeofTy * Width) { + if (Transformed == true) { + constexpr int elems_per_DW = (sizeofTy == 1) ? 4 : 2; /// VNNI_pack + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; + u += elems_per_DW, yRead += SurfacePitch * elems_per_DW) { + vecIdx = u * sycl::detail::getNextPowerOfTwo(Width) + + blkCount * Height * sycl::detail::getNextPowerOfTwo(Width); + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + /// Vertically out-of-bound, skip corresponding vector elements + vecIdx += Width * elems_per_DW;; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofTy) { + if ((xRead < 0) || (xRead >= SurfaceWidth)) { + /// Horizontally out-of-bound, skip corresponding vector elements + vecIdx += elems_per_DW; + continue; + } + + char *base = buff + xRead; + int offset = yRead; + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + if (Pred[vecIdx] != 0) { + if (offset >= 0 && offset < SurfacePitch * SurfaceHeight) { + Output[vecIdx] = *((Ty *)(base + offset)); + } + } + // Increasing in Y-direction + offset += SurfacePitch; + } // k loop + } // v loop + } // u loop + } // (Transformed == true) + else if (_Transposed == __ESIMD_EDNS::lsc_data_order::transpose) { + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofTy) { + if ((xRead < 0) || (xRead >= SurfaceWidth)) { + // Horizontally out-of-bound, skip corresponding vector elements + vecIdx += Height; + continue; + } + + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; + u += 1, yRead += SurfacePitch, vecIdx += 1) { + if (Pred[vecIdx] != 0) { + if ((yRead >= 0) && (yRead < SurfacePitch * SurfaceHeight)) { + Output[vecIdx] = *((Ty *)(buff + yRead + xRead)); + } + } + } // u loop + } // v loop + } // (_Transposed == __ESIMD_EDNS::lsc_data_order::transpose) + else { + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; u += 1, yRead += SurfacePitch) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + // Vertically Out-of-bound, skip corresponding vector elements + vecIdx += Width; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofTy, vecIdx += 1) { + if (Pred[vecIdx] != 0) { + if ((xRead >= 0) && (xRead < SurfaceWidth)) { + Output[vecIdx] = *((Ty *)(buff + yRead + xRead)); + } + } + } // v loop + } // u loop + } // Linear loading + blkCount += 1; + vecIdx = blkCount * sycl::detail::getNextPowerOfTwo(Width) * Height; + } // xBase loop + + return Output; +} + +/// Generic helper function of 2D Block Write supporting both +/// 2d-write and raw_send +template +void __esimd_emu_write_2d(__ESIMD_DNS::simd_mask_storage_t Pred, + uintptr_t Ptr, unsigned SurfaceWidth, + unsigned SurfaceHeight, unsigned SurfacePitch, int X, + int Y, __ESIMD_DNS::vector_type_t vals, + int Width, int Height) { + assert(SurfaceHeight >= 0); + assert(SurfaceWidth >= 0); + assert(SurfaceWidth <= SurfacePitch); + + SurfaceHeight += 1; + SurfaceWidth += 1; + SurfacePitch += 1; + + constexpr unsigned sizeofTy = sizeof(Ty); + + char *buff = (char *)Ptr; + assert(buff != NULL); + + int vecIdx = 0; + int rowCount = 0; + for (int yWrite = Y * SurfacePitch; rowCount < Height; + yWrite += SurfacePitch) { + if (yWrite == SurfacePitch * SurfaceHeight) { + // Vertically Out-of-bound + break; + } + int writeCount = 0; + for (int xWrite = X * sizeofTy; writeCount < Width; + xWrite += sizeofTy, vecIdx += 1, writeCount += 1) { + if (xWrite >= 0 && xWrite < SurfaceWidth && Pred[vecIdx] != 0) { + *((Ty *)(buff + yWrite + xWrite)) = vals[vecIdx]; + } + } // xWrite loop + rowCount += 1; + } // yWrite loop +} + +#endif + /// SLM gather. /// Supported platforms: DG2, PVC /// @@ -275,8 +664,14 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + constexpr uint MASK = loadstoreAlignMask(); + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + return __esimd_emu_lsc_offset_read( + pred, offsets, I->__cm_emu_get_slm_ptr()); } #endif // __SYCL_DEVICE_ONLY__ @@ -313,8 +708,21 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + constexpr uint MASK = loadstoreAlignMask(); + char *readBase; + uint32_t width; + std::mutex *mutexLock; + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->sycl_get_cm_buffer_params_ptr(surf_ind, &readBase, &width, &mutexLock); + + std::lock_guard lock(*mutexLock); + + return __esimd_emu_lsc_offset_read(pred, offsets, + readBase, width); } #endif // __SYCL_DEVICE_ONLY__ @@ -347,8 +755,35 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + constexpr uint MASK = loadstoreAlignMask(); + __ESIMD_DNS::vector_type_t()> Output = 0; + + constexpr int ElemCount = __ESIMD_EDNS::to_int(); + + for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) { + if (pred[AddrIdx] == 0) { + // Skip Output vector elements correpsonding to + // predicates whose value is zero + continue; + } + + int ByteDistance = 0; + + assert(((addrs[AddrIdx] & MASK)) == 0 && "Address Alignment Error!!"); + + for (int ElemIdx = 0, VecIdx = AddrIdx; ElemIdx < ElemCount; ElemIdx += 1, + ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + + Output[VecIdx] = *((Ty *)(addrs[AddrIdx] + ByteDistance)); + } + } + return Output; } #endif // __SYCL_DEVICE_ONLY__ @@ -383,7 +818,8 @@ __esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // Prefetch is NOP under ESIMD_EMULATOR + return; } #endif // __SYCL_DEVICE_ONLY__ @@ -414,7 +850,8 @@ __esimd_lsc_prefetch_stateless(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // Prefetch is NOP under ESIMD_EMULATOR + return; } #endif // __SYCL_DEVICE_ONLY__ @@ -447,7 +884,14 @@ __ESIMD_INTRIN void __esimd_lsc_store_slm( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + constexpr uint MASK = loadstoreAlignMask(); + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + __esimd_emu_lsc_offset_write(pred, offsets, vals, + I->__cm_emu_get_slm_ptr()); } #endif // __SYCL_DEVICE_ONLY__ @@ -484,7 +928,20 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + constexpr uint MASK = loadstoreAlignMask(); + char *writeBase; + uint32_t width; + std::mutex *mutexLock; + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->sycl_get_cm_buffer_params_ptr(surf_ind, &writeBase, &width, &mutexLock); + + std::lock_guard lock(*mutexLock); + + __esimd_emu_lsc_offset_write(pred, offsets, vals, writeBase, width); } #endif // __SYCL_DEVICE_ONLY__ @@ -517,7 +974,45 @@ __ESIMD_INTRIN void __esimd_lsc_store_stateless( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // TODO : Support AddressScale, ImmOffset + static_assert(AddressScale == 1); + static_assert(ImmOffset == 0); + static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); + + using StoreType = typename std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8, uint8_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u16, uint16_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u32, uint32_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u64, uint64_t, + std::conditional_t< + DS == __ESIMD_ENS::lsc_data_size::u8u32, uint8_t, + std::conditional_t>>>>>; + + constexpr int ElemCount = __ESIMD_EDNS::to_int(); + constexpr uint MASK = loadstoreAlignMask(); + + for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) { + if (pred[AddrIdx] == 0) { + // Skip Output vector elements correpsonding to + // predicates whose value is zero + continue; + } + + int ByteDistance = 0; + + assert(((addrs[AddrIdx] & MASK)) == 0 && "Address Alignment Error!!"); + + for (int ElemIdx = 0, VecIdx = AddrIdx; ElemIdx < ElemCount; ElemIdx += 1, + ByteDistance += rawAddressIncrement(), + VecIdx += vectorIndexIncrement()) { + *((StoreType *)(addrs[AddrIdx] + ByteDistance)) = vals[VecIdx]; + } + } } #endif // __SYCL_DEVICE_ONLY__ @@ -563,8 +1058,11 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; - return 0; + loadstore2DArgumentCheck(); + return __esimd_emu_read_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, + SurfacePitch, X, Y, BlockWidth, BlockHeight, + NBlocks, _Transposed, Transformed); } #endif // __SYCL_DEVICE_ONLY__ @@ -603,7 +1101,8 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch2d_stateless( ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + // Prefetch is NOP under ESIMD_EMULATOR + return; } #endif // __SYCL_DEVICE_ONLY__ @@ -649,7 +1148,11 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, ; #else // __SYCL_DEVICE_ONLY__ { - __ESIMD_UNSUPPORTED_ON_HOST; + loadstore2DArgumentCheck(); + __esimd_emu_write_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, + SurfacePitch, X, Y, vals, BlockWidth, + BlockHeight); } #endif // __SYCL_DEVICE_ONLY__ From b0f0b10c18e7045eeeb00e1b28767a9e79b8b7f6 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Wed, 4 May 2022 14:21:21 -0700 Subject: [PATCH 2/5] clang-format fix --- .../sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 379e202d14755..d50297b8c8392 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -521,7 +521,7 @@ __esimd_emu_read_2d(__ESIMD_DNS::simd_mask_storage_t Pred, uintptr_t Ptr, blkCount * Height * sycl::detail::getNextPowerOfTwo(Width); if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { /// Vertically out-of-bound, skip corresponding vector elements - vecIdx += Width * elems_per_DW;; + vecIdx += Width * elems_per_DW; continue; } From 3ba3cfa238c6e4e127303116f92c41afc65ed0ca Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Thu, 5 May 2022 10:52:49 -0700 Subject: [PATCH 3/5] Addressing Konst's comments - Updating comments for definition of 'N' : SIMD operationsize - addrs[] to BaseAddr - Relocating variables close to the place of its use - Elem* to Chanl* --- .../esimd/detail/memory_intrin.hpp | 135 +++++++++++------- 1 file changed, 80 insertions(+), 55 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index d50297b8c8392..423915d2139a6 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -276,7 +276,12 @@ constexpr int vectorIndexIncrement() { // // Not only generates address-align bitmask, but also checks // legitimacy of load/store operation with respect to vector size, -// data size, and SIMT +// data size +/// @tparam Ty is element type. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements to load per address. +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). template constexpr unsigned loadstoreAlignMask() { @@ -313,7 +318,7 @@ constexpr unsigned loadstoreAlignMask() { (VS == __ESIMD_EDNS::lsc_vector_size::n64)) { static_assert( (N == 1) && - "Unsupported SIMT Size for __ESIMD_EDNS::lsc_vector_size = 16/32/64\n" + "Unsupported Size for __ESIMD_EDNS::lsc_vector_size = 16/32/64\n" "(loadstoreAlignMask)"); // 0x3 for u32 / 0x7 for u64 if constexpr (_DS == __ESIMD_ENS::lsc_data_size::u32) @@ -341,7 +346,7 @@ auto __esimd_emu_lsc_offset_read( __ESIMD_DNS::vector_type_t()> Output = 0; - constexpr int ElemCount = __ESIMD_EDNS::to_int(); + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); for (int OffsetIdx = 0; OffsetIdx < N; OffsetIdx += 1) { if (Pred[OffsetIdx] == 0) { @@ -355,8 +360,8 @@ auto __esimd_emu_lsc_offset_read( // ByteDistance : byte-distance from buffer-read base int ByteDistance = Offsets[OffsetIdx]; - for (int ElemIdx = 0, VecIdx = OffsetIdx; ElemIdx < ElemCount; ElemIdx += 1, - ByteDistance += rawAddressIncrement(), + for (int ChanelIdx = 0, VecIdx = OffsetIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), VecIdx += vectorIndexIncrement()) { if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) { @@ -396,8 +401,6 @@ void __esimd_emu_lsc_offset_write( __ESIMD_ENS::lsc_data_size::u16u32, uint16_t, void>>>>>>; - constexpr int ElemCount = __ESIMD_EDNS::to_int(); - for (int OffsetIdx = 0; OffsetIdx < N; OffsetIdx += 1) { if (Pred[OffsetIdx] == 0) { // Skip input vector elements correpsonding to @@ -409,9 +412,10 @@ void __esimd_emu_lsc_offset_write( // ByteDistance : byte-distance from buffer-write base int ByteDistance = Offsets[OffsetIdx]; + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); - for (int ElemIdx = 0, VecIdx = OffsetIdx; ElemIdx < ElemCount; ElemIdx += 1, - ByteDistance += rawAddressIncrement(), + for (int ChanelIdx = 0, VecIdx = OffsetIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), VecIdx += vectorIndexIncrement()) { if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) { @@ -423,6 +427,10 @@ void __esimd_emu_lsc_offset_write( /// Stateless-2d operations /// Template argument check for 2D-load/store +/// @tparam T is element type. +/// @tparam Width is width of block +/// @tparam Height is height of block +/// @tparam NBlks is Number of blocks template @@ -649,7 +657,8 @@ void __esimd_emu_write_2d(__ESIMD_DNS::simd_mask_storage_t Pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param offsets is the zero-based offsets for SLM buffer in bytes. /// @return is a vector of type T and size N * to_int() @@ -664,13 +673,12 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - constexpr uint MASK = loadstoreAlignMask(); - sycl::detail::ESIMDDeviceInterface *I = sycl::detail::getESIMDDeviceInterface(); return __esimd_emu_lsc_offset_read( + _Transposed, N, + loadstoreAlignMask()>( pred, offsets, I->__cm_emu_get_slm_ptr()); } #endif // __SYCL_DEVICE_ONLY__ @@ -689,7 +697,8 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -708,7 +717,6 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t pred, ; #else // __SYCL_DEVICE_ONLY__ { - constexpr uint MASK = loadstoreAlignMask(); char *readBase; uint32_t width; std::mutex *mutexLock; @@ -721,8 +729,9 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t pred, std::lock_guard lock(*mutexLock); return __esimd_emu_lsc_offset_read(pred, offsets, - readBase, width); + _Transposed, N, + loadstoreAlignMask()>( + pred, offsets, readBase, width); } #endif // __SYCL_DEVICE_ONLY__ @@ -740,7 +749,8 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param addrs is the load addresses. /// @return is a vector of type T and N * to_int() @@ -760,11 +770,8 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, static_assert(ImmOffset == 0); static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h); - constexpr uint MASK = loadstoreAlignMask(); __ESIMD_DNS::vector_type_t()> Output = 0; - constexpr int ElemCount = __ESIMD_EDNS::to_int(); - for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) { if (pred[AddrIdx] == 0) { // Skip Output vector elements correpsonding to @@ -772,15 +779,19 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, continue; } + constexpr uint MASK = loadstoreAlignMask(); + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); + int ByteDistance = 0; + uintptr_t BaseAddr = addrs[AddrIdx]; - assert(((addrs[AddrIdx] & MASK)) == 0 && "Address Alignment Error!!"); + assert(((BaseAddr & MASK)) == 0 && "Address Alignment Error!!"); - for (int ElemIdx = 0, VecIdx = AddrIdx; ElemIdx < ElemCount; ElemIdx += 1, - ByteDistance += rawAddressIncrement(), + for (int ChanelIdx = 0, VecIdx = AddrIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), VecIdx += vectorIndexIncrement()) { - Output[VecIdx] = *((Ty *)(addrs[AddrIdx] + ByteDistance)); + Output[VecIdx] = *((Ty *)(BaseAddr + ByteDistance)); } } return Output; @@ -800,7 +811,8 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -836,7 +848,8 @@ __esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param addrs is the prefetch addresses. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param offsets is the zero-based offsets for SLM buffer in bytes. /// @param vals is values to store. @@ -884,14 +898,12 @@ __ESIMD_INTRIN void __esimd_lsc_store_slm( ; #else // __SYCL_DEVICE_ONLY__ { - constexpr uint MASK = loadstoreAlignMask(); - sycl::detail::ESIMDDeviceInterface *I = sycl::detail::getESIMDDeviceInterface(); __esimd_emu_lsc_offset_write(pred, offsets, vals, - I->__cm_emu_get_slm_ptr()); + N, loadstoreAlignMask()>( + pred, offsets, vals, I->__cm_emu_get_slm_ptr()); } #endif // __SYCL_DEVICE_ONLY__ @@ -908,7 +920,8 @@ __ESIMD_INTRIN void __esimd_lsc_store_slm( /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -928,7 +941,6 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( ; #else // __SYCL_DEVICE_ONLY__ { - constexpr uint MASK = loadstoreAlignMask(); char *writeBase; uint32_t width; std::mutex *mutexLock; @@ -941,7 +953,8 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( std::lock_guard lock(*mutexLock); __esimd_emu_lsc_offset_write(pred, offsets, vals, writeBase, width); + N, loadstoreAlignMask()>( + pred, offsets, vals, writeBase, width); } #endif // __SYCL_DEVICE_ONLY__ @@ -958,7 +971,8 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param vals is values to store. @@ -993,9 +1007,6 @@ __ESIMD_INTRIN void __esimd_lsc_store_stateless( __ESIMD_ENS::lsc_data_size::u16u32, uint16_t, void>>>>>>; - constexpr int ElemCount = __ESIMD_EDNS::to_int(); - constexpr uint MASK = loadstoreAlignMask(); - for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) { if (pred[AddrIdx] == 0) { // Skip Output vector elements correpsonding to @@ -1003,14 +1014,18 @@ __ESIMD_INTRIN void __esimd_lsc_store_stateless( continue; } + constexpr uint MASK = loadstoreAlignMask(); + constexpr int ChanlCount = __ESIMD_EDNS::to_int(); + int ByteDistance = 0; + uintptr_t BaseAddr = addrs[AddrIdx]; - assert(((addrs[AddrIdx] & MASK)) == 0 && "Address Alignment Error!!"); + assert(((BaseAddr & MASK)) == 0 && "Address Alignment Error!!"); - for (int ElemIdx = 0, VecIdx = AddrIdx; ElemIdx < ElemCount; ElemIdx += 1, - ByteDistance += rawAddressIncrement(), + for (int ChanelIdx = 0, VecIdx = AddrIdx; ChanelIdx < ChanlCount; + ChanelIdx += 1, ByteDistance += rawAddressIncrement(), VecIdx += vectorIndexIncrement()) { - *((StoreType *)(addrs[AddrIdx] + ByteDistance)) = vals[VecIdx]; + *((StoreType *)(BaseAddr + ByteDistance)) = vals[VecIdx]; } } } @@ -1059,7 +1074,7 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, #else // __SYCL_DEVICE_ONLY__ { loadstore2DArgumentCheck(); + Transformed, NBlocks, false /* isStore*/>(); return __esimd_emu_read_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, BlockWidth, BlockHeight, NBlocks, _Transposed, Transformed); @@ -1149,7 +1164,7 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, #else // __SYCL_DEVICE_ONLY__ { loadstore2DArgumentCheck(); + Transformed, NBlocks, true /* isStore */>(); __esimd_emu_write_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, vals, BlockWidth, BlockHeight); @@ -1168,7 +1183,8 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param offsets is the zero-based offsets. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -1235,7 +1252,8 @@ __esimd_lsc_xatomic_slm_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -1272,7 +1290,8 @@ __esimd_lsc_xatomic_slm_2( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -1307,7 +1326,8 @@ __esimd_lsc_xatomic_bti_0(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -1345,7 +1365,8 @@ __esimd_lsc_xatomic_bti_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -1385,7 +1406,8 @@ __esimd_lsc_xatomic_bti_2( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param addrs is the prefetch addresses. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param src0 is the first atomic operand. @@ -1452,7 +1475,8 @@ __esimd_lsc_xatomic_stateless_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param src0 is the first atomic operand. @@ -1483,7 +1507,8 @@ __esimd_lsc_xatomic_stateless_2( /// @tparam Kind is the Sfid shaded function. /// @tparam FenceOp is the fence operation. /// @tparam Scope is the operation scope. -/// @tparam N is the number of channels (platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access, +/// platform dependent). /// @param pred is predicates. template <__ESIMD_ENS::lsc_memory_kind Kind, __ESIMD_ENS::lsc_fence_op FenceOp, __ESIMD_ENS::lsc_scope Scope, int N> From 10ac7428860eb00dae9e2235457310380ee9a278 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Thu, 12 May 2022 22:52:34 -0700 Subject: [PATCH 4/5] Updating comment on 'N', number of addresses to access --- .../esimd/detail/memory_intrin.hpp | 58 +++++++------------ 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 423915d2139a6..b9127cdd96426 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -280,8 +280,7 @@ constexpr int vectorIndexIncrement() { /// @tparam Ty is element type. /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) template constexpr unsigned loadstoreAlignMask() { @@ -657,8 +656,7 @@ void __esimd_emu_write_2d(__ESIMD_DNS::simd_mask_storage_t Pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets for SLM buffer in bytes. /// @return is a vector of type T and size N * to_int() @@ -697,8 +695,7 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -749,8 +746,7 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the load addresses. /// @return is a vector of type T and N * to_int() @@ -811,8 +807,7 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -848,8 +843,7 @@ __esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets for SLM buffer in bytes. /// @param vals is values to store. @@ -920,8 +913,7 @@ __ESIMD_INTRIN void __esimd_lsc_store_slm( /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. @@ -971,8 +963,7 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( /// @tparam DS is the data size. /// @tparam VS is the number of elements to load per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param vals is values to store. @@ -1183,8 +1174,7 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -1252,8 +1241,7 @@ __esimd_lsc_xatomic_slm_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -1290,8 +1278,7 @@ __esimd_lsc_xatomic_slm_2( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -1326,8 +1313,7 @@ __esimd_lsc_xatomic_bti_0(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -1365,8 +1351,7 @@ __esimd_lsc_xatomic_bti_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @tparam SurfIndAliasTy is the \ref sycl::accessor type. /// @param pred is predicates. /// @param offsets is the zero-based offsets. @@ -1406,8 +1391,7 @@ __esimd_lsc_xatomic_bti_2( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. template pred, /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) + /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param src0 is the first atomic operand. @@ -1475,8 +1459,7 @@ __esimd_lsc_xatomic_stateless_1( /// @tparam DS is the data size. /// @tparam VS is the number of elements per address. /// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. /// @param src0 is the first atomic operand. @@ -1507,8 +1490,7 @@ __esimd_lsc_xatomic_stateless_2( /// @tparam Kind is the Sfid shaded function. /// @tparam FenceOp is the fence operation. /// @tparam Scope is the operation scope. -/// @tparam N is the SIMD size of operation (the number of addresses to access, -/// platform dependent). +/// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. template <__ESIMD_ENS::lsc_memory_kind Kind, __ESIMD_ENS::lsc_fence_op FenceOp, __ESIMD_ENS::lsc_scope Scope, int N> From c6b1191c1f7b4a7e67ce10cc703cb91ab90f0284 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Mon, 23 May 2022 12:47:44 -0700 Subject: [PATCH 5/5] Removing duplicated 2D-load/store template argument check --- .../esimd/detail/memory_intrin.hpp | 75 +------------------ 1 file changed, 4 insertions(+), 71 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index b9127cdd96426..8669f3ba0600f 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -424,73 +424,6 @@ void __esimd_emu_lsc_offset_write( } } -/// Stateless-2d operations -/// Template argument check for 2D-load/store -/// @tparam T is element type. -/// @tparam Width is width of block -/// @tparam Height is height of block -/// @tparam NBlks is Number of blocks -template -constexpr void loadstore2DArgumentCheck() { - const __ESIMD_ENS::lsc_data_size _DS = __ESIMD_EDNS::finalize_data_size< - T, __ESIMD_ENS::lsc_data_size::default_size>(); - static_assert(__ESIMD_DNS::isPowerOf2(NBlks) && (NBlks * sizeof(T) <= 8), - "NBlks must be power of 2 and less than or equal to 4!!"); - - if constexpr (isStore == true) { - static_assert(NBlks == 1, "Mutliple Blocks are not allowed for 2D store!!"); - static_assert(Transposed == __ESIMD_EDNS::lsc_data_order::nontranspose, - "No Transposed 2D store!!"); - static_assert(Transformed == false, "No Transformed 2D store!!"); - static_assert((Height >= 1) && (Height <= 32), - "Invalid Height for 2D store!! H > 32 or H == 0"); - - static_assert((Width * sizeof(T) >= 4) && (Width * sizeof(T) <= 64), - "Invalid Width for 2D store!!"); - - static_assert(sycl::detail::getNextPowerOfTwo(Width * sizeof(T)) * Height <= - 512, - "Invalid Width * Height combination!!"); - } else // isStore == false - { - // Restriction : Width * NBlks - static_assert( - Width * NBlks * sizeof(T) <= 64, - "Invalid Width/NBlks combination!! (W * NBlks * sizeof(T) > 64)"); - - static_assert( - ((Transposed == __ESIMD_EDNS::lsc_data_order::transpose) & - Transformed) != true, - "Transpose and Transform cannot be used together for 2D-load!!"); - - if constexpr (Transformed == false) { - if constexpr (Transposed == __ESIMD_EDNS::lsc_data_order::transpose) { - static_assert(NBlks == 1, - "Invalid NBlks for Transposed 2D load!! NBlks != 1"); - } - - static_assert((Height >= 1) && (Height <= 32), - "Invalid Height for Non-transform 2D load!!"); - - static_assert((Width * sizeof(T) >= 4) && (Width * sizeof(T) <= 64), - "Invalid Width for Non-transform 2D load!!"); - } else // Transformed == true - { - static_assert( - (_DS == __ESIMD_ENS::lsc_data_size::u8) || - (_DS == __ESIMD_ENS::lsc_data_size::u16), - "For Transformed 2D read, DataSize must be either U8 or U16"); - - static_assert((Width * sizeof(T) >= 4) && (Width <= 16), - "Invalid Width for Transformed/Non-Transposed 2D load!!"); - static_assert((Height * sizeof(T) >= 4) && (Height <= 32), - "Invalid Height for Transformed/Non-Transposed 2D load!!"); - } - } -} - /// Generic helper function of 2D Block Read supporting both 2d-load /// and raw_send template @@ -1064,8 +997,8 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, ; #else // __SYCL_DEVICE_ONLY__ { - loadstore2DArgumentCheck(); + // Template arguments are already checked by + // check_lsc_block_2d_restrictions() return __esimd_emu_read_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, BlockWidth, BlockHeight, NBlocks, _Transposed, Transformed); @@ -1154,8 +1087,8 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, ; #else // __SYCL_DEVICE_ONLY__ { - loadstore2DArgumentCheck(); + // Template arguments are already checked by + // check_lsc_block_2d_restrictions() __esimd_emu_write_2d(Pred, Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, vals, BlockWidth, BlockHeight);