// RUN: %clangxx -O0 -fsycl -fsycl-device-only -fno-sycl-esimd-force-stateless-mem -D__ESIMD_GATHER_SCATTER_LLVM_IR -Xclang -emit-llvm %s -o %t
// RUN: sycl-post-link -split-esimd -lower-esimd -lower-esimd-force-stateless-mem=false -O0 -S %t -o %t.table
// RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes=CHECK,CHECK-STATEFUL

// RUN: %clangxx -O0 -fsycl -fsycl-device-only -fsycl-esimd-force-stateless-mem -D__ESIMD_GATHER_SCATTER_LLVM_IR -Xclang -emit-llvm %s -o %t
// RUN: sycl-post-link -split-esimd -lower-esimd -lower-esimd-force-stateless-mem -O0 -S %t -o %t.table
// RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes=CHECK,CHECK-STATELESS

// Checks ESIMD memory functions accepting compile time properties for prefetch
// and 2D APIs. NOTE: must be run in -O0, as optimizer optimizes away some of
// the code.
12+
13+ #include < sycl/ext/intel/esimd.hpp>
14+
15+ using namespace sycl ::ext::intel::esimd;
16+
17+ using AccType = sycl::accessor<uint8_t , 1 , sycl::access::mode::read_write>;
18+ using LocalAccType = sycl::local_accessor<double , 1 >;
19+ using LocalAccTypeInt = sycl::local_accessor<int , 1 >;
20+
21+ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void
22+ test_prefetch (AccType &, float *, int byte_offset32, size_t byte_offset64);
23+ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_2d (float *);
24+
25+ class EsimdFunctor {
26+ public:
27+ AccType acc;
28+ LocalAccType local_acc;
29+ LocalAccTypeInt local_acc_int;
30+ float *ptr;
31+ int byte_offset32;
32+ size_t byte_offset64;
33+ void operator ()() __attribute__((sycl_explicit_simd)) {
34+ test_prefetch (acc, ptr, byte_offset32, byte_offset64);
35+ test_2d (ptr);
36+ }
37+ };
38+
// Minimal SYCL kernel launcher stub used to instantiate the functor as a
// device kernel.
template <typename name, typename Func>
__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
  kernelFunc();
}
43+
44+ void bar (AccType &acc, LocalAccType &local_acc, LocalAccTypeInt &local_acc_int,
45+ float *ptr, int byte_offset32, size_t byte_offset64) {
46+ EsimdFunctor esimdf{acc, local_acc, local_acc_int,
47+ ptr, byte_offset32, byte_offset64};
48+ kernel<class kernel_esimd >(esimdf);
49+ }
50+
51+ // CHECK-LABEL: define {{.*}} @_Z13test_prefetch{{.*}}
52+ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_prefetch (AccType &acc, float *ptrf,
53+ int byte_offset32,
54+ size_t byte_offset64) {
55+ properties props_cache_load{cache_hint_L1<cache_hint::cached>,
56+ cache_hint_L2<cache_hint::uncached>};
57+ properties props_cache_load_align{cache_hint_L1<cache_hint::cached>,
58+ cache_hint_L2<cache_hint::uncached>,
59+ alignment<8 >};
60+
61+ uint8_t *ptrb = reinterpret_cast <uint8_t *>(ptrf);
62+
63+ simd<uint32_t , 32 > ioffset_n32 (byte_offset32, 8 );
64+ simd<uint64_t , 32 > loffset_n32 (byte_offset64, 16 );
65+ auto ioffset_n32_view = ioffset_n32.select <32 , 1 >();
66+ auto loffset_n32_view = loffset_n32.select <32 , 1 >();
67+
68+ simd<uint32_t , 16 > ioffset_n16 (byte_offset32, 8 );
69+ simd<uint64_t , 16 > loffset_n16 (byte_offset64, 16 );
70+ auto ioffset_n16_view = ioffset_n16.select <16 , 1 >();
71+ auto loffset_n16_view = loffset_n16.select <16 , 1 >();
72+
73+ simd_mask<32 > mask_n32 = 1 ;
74+ simd_mask<16 > mask_n16 = 1 ;
75+ simd_mask<1 > mask_n1 = 1 ;
76+
77+ // Test USM prefetch using this plan:
78+ // 1) prefetch(usm, offsets): offsets is simd or simd_view
79+ // 2) prefetch(usm, offsets, mask): offsets is simd or simd_view
80+ // 3) prefetch(usm, offset): same as (1) above, but with offset as a scalar.
81+ // 4) prefetch(usm, offset): same as (1) and (2) above, but with VS > 1.
82+
83+ // 1) prefetch(usm, offsets): offsets is simd or simd_view
84+
85+ // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
86+ prefetch (ptrf, ioffset_n32, props_cache_load);
87+ prefetch<float , 32 >(ptrf, ioffset_n32_view, props_cache_load);
88+ prefetch<float , 32 >(ptrf, ioffset_n32_view.select <32 , 1 >(), props_cache_load);
89+
90+ prefetch (ptrf, loffset_n32, props_cache_load);
91+ prefetch<float , 32 >(ptrf, loffset_n32_view, props_cache_load);
92+ prefetch<float , 32 >(ptrf, loffset_n32_view.select <32 , 1 >(), props_cache_load);
93+
94+ // 2) prefetch(usm, offsets, mask): offsets is simd or simd_view
95+ // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
96+ prefetch (ptrf, ioffset_n32, mask_n32, props_cache_load);
97+ prefetch<float , 32 >(ptrf, ioffset_n32_view, mask_n32, props_cache_load);
98+ prefetch<float , 32 >(ptrf, ioffset_n32_view.select <32 , 1 >(), mask_n32,
99+ props_cache_load);
100+
101+ prefetch (ptrf, loffset_n32, mask_n32, props_cache_load);
102+ prefetch<float , 32 >(ptrf, loffset_n32_view, mask_n32, props_cache_load);
103+ prefetch<float , 32 >(ptrf, loffset_n32_view.select <32 , 1 >(), mask_n32,
104+ props_cache_load);
105+
106+ // 3) prefetch(usm, offset): offset is scalar
107+ // CHECK-COUNT-16: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
108+ __ESIMD_NS::prefetch (ptrf, byte_offset32, props_cache_load);
109+ __ESIMD_NS::prefetch (ptrf, byte_offset64, props_cache_load);
110+ __ESIMD_NS::prefetch (ptrf, props_cache_load);
111+ __ESIMD_NS::prefetch (ptrf, mask_n1, props_cache_load);
112+ __ESIMD_NS::prefetch (ptrf, byte_offset32, mask_n1, props_cache_load);
113+ __ESIMD_NS::prefetch (ptrf, byte_offset64, mask_n1, props_cache_load);
114+ __ESIMD_NS::prefetch (ptrf, byte_offset32, mask_n1, props_cache_load);
115+ __ESIMD_NS::prefetch (ptrf, byte_offset64, mask_n1, props_cache_load);
116+
117+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, byte_offset32, props_cache_load_align);
118+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, byte_offset64, props_cache_load_align);
119+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, props_cache_load_align);
120+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, mask_n1, props_cache_load_align);
121+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, byte_offset32, mask_n1,
122+ props_cache_load_align);
123+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, byte_offset64, mask_n1,
124+ props_cache_load_align);
125+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, byte_offset32, mask_n1,
126+ props_cache_load_align);
127+ __ESIMD_NS::prefetch<uint8_t , 4 >(ptrb, byte_offset64, mask_n1,
128+ props_cache_load_align);
129+
130+ // 4) prefetch(usm, ...): same as (1), (2) above, but with VS > 1.
131+ // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
132+ prefetch<float , 32 , 2 >(ptrf, ioffset_n16, props_cache_load);
133+ prefetch<float , 32 , 2 >(ptrf, ioffset_n16_view, props_cache_load);
134+ prefetch<float , 32 , 2 >(ptrf, ioffset_n16_view.select <16 , 1 >(),
135+ props_cache_load);
136+
137+ prefetch<float , 32 , 2 >(ptrf, loffset_n16, props_cache_load);
138+ prefetch<float , 32 , 2 >(ptrf, loffset_n16_view, props_cache_load);
139+ prefetch<float , 32 , 2 >(ptrf, loffset_n16_view.select <16 , 1 >(),
140+ props_cache_load);
141+
142+ // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
143+ prefetch<float , 32 , 2 >(ptrf, ioffset_n16, mask_n16, props_cache_load);
144+ prefetch<float , 32 , 2 >(ptrf, ioffset_n16_view, mask_n16, props_cache_load);
145+ prefetch<float , 32 , 2 >(ptrf, ioffset_n16_view.select <16 , 1 >(), mask_n16,
146+ props_cache_load);
147+
148+ prefetch<float , 32 , 2 >(ptrf, loffset_n16, mask_n16, props_cache_load);
149+ prefetch<float , 32 , 2 >(ptrf, loffset_n16_view, mask_n16, props_cache_load);
150+ prefetch<float , 32 , 2 >(ptrf, loffset_n16_view.select <16 , 1 >(), mask_n16,
151+ props_cache_load);
152+
153+ // CHECK-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
154+ __ESIMD_NS::prefetch<float , 32 >(ptrf, 0 , props_cache_load);
155+ __ESIMD_NS::prefetch<float , 32 >(ptrf, 0 , 1 , props_cache_load);
156+
157+ // Test Acc prefetch using this plan:
158+ // 1) prefetch(acc, offsets): offsets is simd or simd_view
159+ // 2) prefetch(acc, offsets, mask): offsets is simd or simd_view
160+ // 3) prefetch(acc, offset): same as (1) above, but with offset as a scalar.
161+ // 4) prefetch(acc, offset): same as (1) and (2) above, but with VS > 1.
162+
163+ // 1) prefetch(acc, offsets): offsets is simd or simd_view
164+ // CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
165+ // CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
166+ prefetch<float >(acc, ioffset_n32, props_cache_load);
167+ prefetch<float , 32 >(acc, ioffset_n32_view, props_cache_load);
168+ prefetch<float , 32 >(acc, ioffset_n32_view.select <32 , 1 >(), props_cache_load);
169+
170+ // 2) prefetch(acc, offsets, mask): offsets is simd or simd_view
171+ // CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
172+ // CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
173+ prefetch<float >(acc, ioffset_n32, mask_n32, props_cache_load);
174+ prefetch<float , 32 >(acc, ioffset_n32_view, mask_n32, props_cache_load);
175+ prefetch<float , 32 >(acc, ioffset_n32_view.select <32 , 1 >(), mask_n32,
176+ props_cache_load);
177+
178+ // 3) prefetch(acc, offset): offset is scalar
179+ // CHECK-STATEFUL-COUNT-10: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}})
180+ // CHECK-STATELESS-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
181+ prefetch<float >(acc, byte_offset32, props_cache_load);
182+ prefetch<float >(acc, props_cache_load);
183+ prefetch<float >(acc, mask_n1, props_cache_load);
184+ prefetch<float >(acc, byte_offset32, mask_n1, props_cache_load);
185+ prefetch<float >(acc, byte_offset32, mask_n1, props_cache_load);
186+
187+ prefetch<uint8_t , 4 >(acc, byte_offset32, props_cache_load_align);
188+ prefetch<uint8_t , 4 >(acc, props_cache_load_align);
189+ prefetch<uint8_t , 4 >(acc, mask_n1, props_cache_load_align);
190+ prefetch<uint8_t , 4 >(acc, byte_offset32, mask_n1, props_cache_load_align);
191+ prefetch<uint8_t , 4 >(acc, byte_offset32, mask_n1, props_cache_load_align);
192+
193+ // 4) prefetch(usm, ...): same as (1), (2) above, but with VS > 1.
194+ // CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
195+ // CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
196+ prefetch<float , 32 , 2 >(acc, ioffset_n16, props_cache_load);
197+ prefetch<float , 32 , 2 >(acc, ioffset_n16_view, props_cache_load);
198+ prefetch<float , 32 , 2 >(acc, ioffset_n16_view.select <16 , 1 >(),
199+ props_cache_load);
200+
201+ // CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
202+ // CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
203+ prefetch<float , 32 , 2 >(acc, ioffset_n16, mask_n16, props_cache_load);
204+ prefetch<float , 32 , 2 >(acc, ioffset_n16_view, mask_n16, props_cache_load);
205+ prefetch<float , 32 , 2 >(acc, ioffset_n16_view.select <16 , 1 >(), mask_n16,
206+ props_cache_load);
207+
208+ // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}})
209+ // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
210+ prefetch<float , 32 >(acc, 0 , props_cache_load);
211+ prefetch<float , 32 >(acc, 0 , 1 , props_cache_load);
212+ }
213+
214+ // CHECK-LABEL: define {{.*}} @_Z7test_2d{{.*}}
215+ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_2d (float *ptr) {
216+ properties props_cache_load{cache_hint_L1<cache_hint::streaming>,
217+ cache_hint_L2<cache_hint::uncached>};
218+ simd<float , 16 > Vals;
219+ auto Vals_view = Vals.select <16 , 1 >();
220+
221+ constexpr int BlockWidth = 16 ;
222+ constexpr int BlockHeight = 1 ;
223+ constexpr int NBlocks = 1 ;
224+ constexpr bool Transposed = false ;
225+ constexpr bool Transformed = false ;
226+
227+ unsigned SurfaceWidth;
228+ unsigned SurfaceHeight;
229+ unsigned SurfacePitch;
230+ int X;
231+ int Y;
232+ // Test USM 2d API using this plan:
233+ // 1) prefetch_2d(): combinations of explicit and default template parameters
234+ // 2) load_2d(): combinations of explicit and default template parameters
235+ // 3) same as (2) but without compile time properties
236+ // 4) store_2d(): combinations of explicit and default template parameters
237+ // 5) same as (4) but without compile time properties
238+
239+ // CHECK-COUNT-3: call void @llvm.genx.lsc.prefetch2d.stateless.v1i1.i64(<1 x i1> {{[^)]+}}, i8 5, i8 1, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}})
240+ prefetch_2d<float , BlockWidth, BlockHeight, NBlocks>(
241+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
242+ prefetch_2d<float , BlockWidth, BlockHeight>(
243+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
244+ prefetch_2d<float , BlockWidth>(ptr, SurfaceWidth, SurfaceHeight, SurfacePitch,
245+ X, Y, props_cache_load);
246+
247+ // CHECK-COUNT-5: call <16 x float> @llvm.genx.lsc.load2d.stateless.v16f32.v1i1.i64(<1 x i1> {{[^)]+}}, i8 5, i8 1, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}})
248+ Vals =
249+ load_2d<float , BlockWidth, BlockHeight, NBlocks, Transposed, Transformed>(
250+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
251+ props_cache_load);
252+ Vals = load_2d<float , BlockWidth, BlockHeight, NBlocks, Transposed>(
253+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
254+ Vals = load_2d<float , BlockWidth, BlockHeight, NBlocks>(
255+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
256+ Vals = load_2d<float , BlockWidth, BlockHeight>(
257+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
258+ Vals = load_2d<float , BlockWidth>(ptr, SurfaceWidth, SurfaceHeight,
259+ SurfacePitch, X, Y, props_cache_load);
260+
261+ // CHECK-COUNT-5: call <16 x float> @llvm.genx.lsc.load2d.stateless.v16f32.v1i1.i64(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}})
262+ Vals =
263+ load_2d<float , BlockWidth, BlockHeight, NBlocks, Transposed, Transformed>(
264+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
265+ Vals = load_2d<float , BlockWidth, BlockHeight, NBlocks, Transposed>(
266+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
267+ Vals = load_2d<float , BlockWidth, BlockHeight, NBlocks>(
268+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
269+ Vals = load_2d<float , BlockWidth, BlockHeight>(
270+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
271+ Vals = load_2d<float , BlockWidth>(ptr, SurfaceWidth, SurfaceHeight,
272+ SurfacePitch, X, Y);
273+
274+ // CHECK-COUNT-4: call void @llvm.genx.lsc.store2d.stateless.v1i1.i64.v16f32(<1 x i1> {{[^)]+}}, i8 5, i8 1, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, <16 x float> {{[^)]+}})
275+ store_2d<float , BlockWidth, BlockHeight>(ptr, SurfaceWidth, SurfaceHeight,
276+ SurfacePitch, X, Y, Vals,
277+ props_cache_load);
278+ store_2d<float , BlockWidth>(ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X,
279+ Y, Vals, props_cache_load);
280+ store_2d<float , BlockWidth, BlockHeight, 16 >(ptr, SurfaceWidth, SurfaceHeight,
281+ SurfacePitch, X, Y, Vals_view,
282+ props_cache_load);
283+ store_2d<float , BlockWidth, BlockHeight, 16 >(
284+ ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
285+ Vals_view.select <16 , 1 >(), props_cache_load);
286+
287+ // CHECK-COUNT-4: call void @llvm.genx.lsc.store2d.stateless.v1i1.i64.v16f32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, <16 x float> {{[^)]+}})
288+ store_2d<float , BlockWidth, BlockHeight>(ptr, SurfaceWidth, SurfaceHeight,
289+ SurfacePitch, X, Y, Vals);
290+ store_2d<float , BlockWidth>(ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X,
291+ Y, Vals);
292+ store_2d<float , BlockWidth, BlockHeight, 16 >(ptr, SurfaceWidth, SurfaceHeight,
293+ SurfacePitch, X, Y, Vals_view);
294+ store_2d<float , BlockWidth, BlockHeight, 16 >(ptr, SurfaceWidth, SurfaceHeight,
295+ SurfacePitch, X, Y,
296+ Vals_view.select <16 , 1 >());
297+ }