diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 9e8ace042fbe6..3db6e27266e3b 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -1353,9 +1353,34 @@ void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
     if constexpr (RemN == 1) {
       Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
     } else if constexpr (RemN == 8 || RemN == 16) {
-      simd<uint32_t, RemN> Offsets(0u, sizeof(T));
-      scatter<T, RemN>(Addr + (NumChunks * ChunkSize), Offsets,
-                       Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
+      // TODO: GPU runtime may handle scatter of 16 byte elements incorrectly.
+      // The code below is a workaround which must be deleted once GPU runtime
+      // is fixed.
+      if constexpr (sizeof(T) == 1 && RemN == 16) {
+        if constexpr (Align % OperandSize::DWORD > 0) {
+          ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
+            Addr[Index + NumChunks * ChunkSize] =
+                Tmp[Index + NumChunks * ChunkSize];
+          });
+        } else {
+          simd_mask_type<8> Pred(0);
+          simd<int32_t, 8> Vals;
+          Pred.template select<4, 1>() = 1;
+          Vals.template select<4, 1>() =
+              Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
+                  NumChunks * ChunkSize);
+
+          simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
+          scatter<int32_t, 8>(
+              reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
+              Offsets, Vals, Pred);
+        }
+      } else {
+        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
+        scatter<T, RemN>(
+            Addr + (NumChunks * ChunkSize), Offsets,
+            Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
+      }
     } else {
       constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
       simd_mask_type<N1> Pred(0);
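For context on the aligned branch of the workaround: the 16 remaining 1-byte elements are reinterpreted as four DWORDs and written with an 8-lane scatter whose predicate enables only the first 4 lanes, which sidesteps the byte-granular scatter the GPU runtime may mishandle. Below is a minimal host-side sketch of the same semantics in plain C++ (not ESIMD); the helper name `copy_tail16` and everything in it are illustrative only and not part of this patch:

```cpp
// Hypothetical host-side model of the 16 x 1-byte tail copy; mirrors the
// two branches of the workaround lane-for-lane.
#include <cstdint>
#include <cstring>

void copy_tail16(unsigned char *Dst, const unsigned char *Src) {
  if (reinterpret_cast<std::uintptr_t>(Dst) % sizeof(std::int32_t) != 0) {
    // Destination not DWORD-aligned: equivalent of the
    // ForHelper<RemN>::unroll branch -- 16 scalar byte stores.
    for (unsigned Index = 0; Index < 16; ++Index)
      Dst[Index] = Src[Index];
    return;
  }
  // DWORD-aligned destination: equivalent of the predicated 8-lane scatter.
  std::int32_t Vals[8] = {};
  bool Pred[8] = {};
  for (unsigned Lane = 0; Lane < 4; ++Lane) {
    // Stands in for Vals.template select<4, 1>() =
    //   Tmp.template bit_cast_view<int32_t>()... (a reinterpretation of the
    // tail bytes as DWORDs, not a value conversion).
    std::memcpy(&Vals[Lane], Src + Lane * sizeof(std::int32_t),
                sizeof(std::int32_t));
    Pred[Lane] = true; // Pred.template select<4, 1>() = 1;
  }
  // scatter<int32_t, 8> with Offsets(0u, sizeof(int32_t)): each enabled lane
  // writes one DWORD at Dst + Lane * sizeof(int32_t); lanes 4..7 stay masked
  // off, so exactly 16 bytes are written.
  for (unsigned Lane = 0; Lane < 8; ++Lane)
    if (Pred[Lane])
      std::memcpy(Dst + Lane * sizeof(std::int32_t), &Vals[Lane],
                  sizeof(std::int32_t));
}
```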