Skip to content

Commit 4b9819f

Browse files
authored
[ESIMD][NFC] Split memory_properties test to improve performance (#13718)
1 parent e36cb98 commit 4b9819f

File tree

6 files changed

+2811
-2573
lines changed

6 files changed

+2811
-2573
lines changed

sycl/test/esimd/memory_properties.cpp

Lines changed: 0 additions & 2573 deletions
This file was deleted.

sycl/test/esimd/memory_properties_atomic_update.cpp

Lines changed: 1005 additions & 0 deletions
Large diffs are not rendered by default.

sycl/test/esimd/memory_properties_gather.cpp

Lines changed: 616 additions & 0 deletions
Large diffs are not rendered by default.

sycl/test/esimd/memory_properties_load_store.cpp

Lines changed: 359 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
// RUN: %clangxx -O0 -fsycl -fsycl-device-only -fno-sycl-esimd-force-stateless-mem -D__ESIMD_GATHER_SCATTER_LLVM_IR -Xclang -emit-llvm %s -o %t
2+
// RUN: sycl-post-link -split-esimd -lower-esimd -lower-esimd-force-stateless-mem=false -O0 -S %t -o %t.table
3+
// RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes=CHECK,CHECK-STATEFUL
4+
5+
// RUN: %clangxx -O0 -fsycl -fsycl-device-only -fsycl-esimd-force-stateless-mem -D__ESIMD_GATHER_SCATTER_LLVM_IR -Xclang -emit-llvm %s -o %t
6+
// RUN: sycl-post-link -split-esimd -lower-esimd -lower-esimd-force-stateless-mem -O0 -S %t -o %t.table
7+
// RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes=CHECK,CHECK-STATELESS
8+
9+
// Checks ESIMD memory functions accepting compile time properties for prefetch
10+
// and 2D APIs. NOTE: must be run in -O0, as optimizer optimizes away some of
11+
// the code.
12+
13+
#include <sycl/ext/intel/esimd.hpp>
14+
15+
using namespace sycl::ext::intel::esimd;
16+
17+
using AccType = sycl::accessor<uint8_t, 1, sycl::access::mode::read_write>;
18+
using LocalAccType = sycl::local_accessor<double, 1>;
19+
using LocalAccTypeInt = sycl::local_accessor<int, 1>;
20+
21+
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void
22+
test_prefetch(AccType &, float *, int byte_offset32, size_t byte_offset64);
23+
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_2d(float *);
24+
25+
class EsimdFunctor {
26+
public:
27+
AccType acc;
28+
LocalAccType local_acc;
29+
LocalAccTypeInt local_acc_int;
30+
float *ptr;
31+
int byte_offset32;
32+
size_t byte_offset64;
33+
void operator()() __attribute__((sycl_explicit_simd)) {
34+
test_prefetch(acc, ptr, byte_offset32, byte_offset64);
35+
test_2d(ptr);
36+
}
37+
};
38+
39+
template <typename name, typename Func>
40+
__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
41+
kernelFunc();
42+
}
43+
44+
void bar(AccType &acc, LocalAccType &local_acc, LocalAccTypeInt &local_acc_int,
45+
float *ptr, int byte_offset32, size_t byte_offset64) {
46+
EsimdFunctor esimdf{acc, local_acc, local_acc_int,
47+
ptr, byte_offset32, byte_offset64};
48+
kernel<class kernel_esimd>(esimdf);
49+
}
50+
51+
// CHECK-LABEL: define {{.*}} @_Z13test_prefetch{{.*}}
52+
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_prefetch(AccType &acc, float *ptrf,
53+
int byte_offset32,
54+
size_t byte_offset64) {
55+
properties props_cache_load{cache_hint_L1<cache_hint::cached>,
56+
cache_hint_L2<cache_hint::uncached>};
57+
properties props_cache_load_align{cache_hint_L1<cache_hint::cached>,
58+
cache_hint_L2<cache_hint::uncached>,
59+
alignment<8>};
60+
61+
uint8_t *ptrb = reinterpret_cast<uint8_t *>(ptrf);
62+
63+
simd<uint32_t, 32> ioffset_n32(byte_offset32, 8);
64+
simd<uint64_t, 32> loffset_n32(byte_offset64, 16);
65+
auto ioffset_n32_view = ioffset_n32.select<32, 1>();
66+
auto loffset_n32_view = loffset_n32.select<32, 1>();
67+
68+
simd<uint32_t, 16> ioffset_n16(byte_offset32, 8);
69+
simd<uint64_t, 16> loffset_n16(byte_offset64, 16);
70+
auto ioffset_n16_view = ioffset_n16.select<16, 1>();
71+
auto loffset_n16_view = loffset_n16.select<16, 1>();
72+
73+
simd_mask<32> mask_n32 = 1;
74+
simd_mask<16> mask_n16 = 1;
75+
simd_mask<1> mask_n1 = 1;
76+
77+
// Test USM prefetch using this plan:
78+
// 1) prefetch(usm, offsets): offsets is simd or simd_view
79+
// 2) prefetch(usm, offsets, mask): offsets is simd or simd_view
80+
// 3) prefetch(usm, offset): same as (1) above, but with offset as a scalar.
81+
// 4) prefetch(usm, offset): same as (1) and (2) above, but with VS > 1.
82+
83+
// 1) prefetch(usm, offsets): offsets is simd or simd_view
84+
85+
// CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
86+
prefetch(ptrf, ioffset_n32, props_cache_load);
87+
prefetch<float, 32>(ptrf, ioffset_n32_view, props_cache_load);
88+
prefetch<float, 32>(ptrf, ioffset_n32_view.select<32, 1>(), props_cache_load);
89+
90+
prefetch(ptrf, loffset_n32, props_cache_load);
91+
prefetch<float, 32>(ptrf, loffset_n32_view, props_cache_load);
92+
prefetch<float, 32>(ptrf, loffset_n32_view.select<32, 1>(), props_cache_load);
93+
94+
// 2) prefetch(usm, offsets, mask): offsets is simd or simd_view
95+
// CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
96+
prefetch(ptrf, ioffset_n32, mask_n32, props_cache_load);
97+
prefetch<float, 32>(ptrf, ioffset_n32_view, mask_n32, props_cache_load);
98+
prefetch<float, 32>(ptrf, ioffset_n32_view.select<32, 1>(), mask_n32,
99+
props_cache_load);
100+
101+
prefetch(ptrf, loffset_n32, mask_n32, props_cache_load);
102+
prefetch<float, 32>(ptrf, loffset_n32_view, mask_n32, props_cache_load);
103+
prefetch<float, 32>(ptrf, loffset_n32_view.select<32, 1>(), mask_n32,
104+
props_cache_load);
105+
106+
// 3) prefetch(usm, offset): offset is scalar
107+
// CHECK-COUNT-16: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
108+
__ESIMD_NS::prefetch(ptrf, byte_offset32, props_cache_load);
109+
__ESIMD_NS::prefetch(ptrf, byte_offset64, props_cache_load);
110+
__ESIMD_NS::prefetch(ptrf, props_cache_load);
111+
__ESIMD_NS::prefetch(ptrf, mask_n1, props_cache_load);
112+
__ESIMD_NS::prefetch(ptrf, byte_offset32, mask_n1, props_cache_load);
113+
__ESIMD_NS::prefetch(ptrf, byte_offset64, mask_n1, props_cache_load);
114+
__ESIMD_NS::prefetch(ptrf, byte_offset32, mask_n1, props_cache_load);
115+
__ESIMD_NS::prefetch(ptrf, byte_offset64, mask_n1, props_cache_load);
116+
117+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, byte_offset32, props_cache_load_align);
118+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, byte_offset64, props_cache_load_align);
119+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, props_cache_load_align);
120+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, mask_n1, props_cache_load_align);
121+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, byte_offset32, mask_n1,
122+
props_cache_load_align);
123+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, byte_offset64, mask_n1,
124+
props_cache_load_align);
125+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, byte_offset32, mask_n1,
126+
props_cache_load_align);
127+
__ESIMD_NS::prefetch<uint8_t, 4>(ptrb, byte_offset64, mask_n1,
128+
props_cache_load_align);
129+
130+
// 4) prefetch(usm, ...): same as (1), (2) above, but with VS > 1.
131+
// CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
132+
prefetch<float, 32, 2>(ptrf, ioffset_n16, props_cache_load);
133+
prefetch<float, 32, 2>(ptrf, ioffset_n16_view, props_cache_load);
134+
prefetch<float, 32, 2>(ptrf, ioffset_n16_view.select<16, 1>(),
135+
props_cache_load);
136+
137+
prefetch<float, 32, 2>(ptrf, loffset_n16, props_cache_load);
138+
prefetch<float, 32, 2>(ptrf, loffset_n16_view, props_cache_load);
139+
prefetch<float, 32, 2>(ptrf, loffset_n16_view.select<16, 1>(),
140+
props_cache_load);
141+
142+
// CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
143+
prefetch<float, 32, 2>(ptrf, ioffset_n16, mask_n16, props_cache_load);
144+
prefetch<float, 32, 2>(ptrf, ioffset_n16_view, mask_n16, props_cache_load);
145+
prefetch<float, 32, 2>(ptrf, ioffset_n16_view.select<16, 1>(), mask_n16,
146+
props_cache_load);
147+
148+
prefetch<float, 32, 2>(ptrf, loffset_n16, mask_n16, props_cache_load);
149+
prefetch<float, 32, 2>(ptrf, loffset_n16_view, mask_n16, props_cache_load);
150+
prefetch<float, 32, 2>(ptrf, loffset_n16_view.select<16, 1>(), mask_n16,
151+
props_cache_load);
152+
153+
// CHECK-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
154+
__ESIMD_NS::prefetch<float, 32>(ptrf, 0, props_cache_load);
155+
__ESIMD_NS::prefetch<float, 32>(ptrf, 0, 1, props_cache_load);
156+
157+
// Test Acc prefetch using this plan:
158+
// 1) prefetch(acc, offsets): offsets is simd or simd_view
159+
// 2) prefetch(acc, offsets, mask): offsets is simd or simd_view
160+
// 3) prefetch(acc, offset): same as (1) above, but with offset as a scalar.
161+
// 4) prefetch(acc, offset): same as (1) and (2) above, but with VS > 1.
162+
163+
// 1) prefetch(acc, offsets): offsets is simd or simd_view
164+
// CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
165+
// CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
166+
prefetch<float>(acc, ioffset_n32, props_cache_load);
167+
prefetch<float, 32>(acc, ioffset_n32_view, props_cache_load);
168+
prefetch<float, 32>(acc, ioffset_n32_view.select<32, 1>(), props_cache_load);
169+
170+
// 2) prefetch(acc, offsets, mask): offsets is simd or simd_view
171+
// CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
172+
// CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
173+
prefetch<float>(acc, ioffset_n32, mask_n32, props_cache_load);
174+
prefetch<float, 32>(acc, ioffset_n32_view, mask_n32, props_cache_load);
175+
prefetch<float, 32>(acc, ioffset_n32_view.select<32, 1>(), mask_n32,
176+
props_cache_load);
177+
178+
// 3) prefetch(acc, offset): offset is scalar
179+
// CHECK-STATEFUL-COUNT-10: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}})
180+
// CHECK-STATELESS-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
181+
prefetch<float>(acc, byte_offset32, props_cache_load);
182+
prefetch<float>(acc, props_cache_load);
183+
prefetch<float>(acc, mask_n1, props_cache_load);
184+
prefetch<float>(acc, byte_offset32, mask_n1, props_cache_load);
185+
prefetch<float>(acc, byte_offset32, mask_n1, props_cache_load);
186+
187+
prefetch<uint8_t, 4>(acc, byte_offset32, props_cache_load_align);
188+
prefetch<uint8_t, 4>(acc, props_cache_load_align);
189+
prefetch<uint8_t, 4>(acc, mask_n1, props_cache_load_align);
190+
prefetch<uint8_t, 4>(acc, byte_offset32, mask_n1, props_cache_load_align);
191+
prefetch<uint8_t, 4>(acc, byte_offset32, mask_n1, props_cache_load_align);
192+
193+
// 4) prefetch(usm, ...): same as (1), (2) above, but with VS > 1.
194+
// CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
195+
// CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
196+
prefetch<float, 32, 2>(acc, ioffset_n16, props_cache_load);
197+
prefetch<float, 32, 2>(acc, ioffset_n16_view, props_cache_load);
198+
prefetch<float, 32, 2>(acc, ioffset_n16_view.select<16, 1>(),
199+
props_cache_load);
200+
201+
// CHECK-STATEFUL-COUNT-3: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
202+
// CHECK-STATELESS-COUNT-3: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
203+
prefetch<float, 32, 2>(acc, ioffset_n16, mask_n16, props_cache_load);
204+
prefetch<float, 32, 2>(acc, ioffset_n16_view, mask_n16, props_cache_load);
205+
prefetch<float, 32, 2>(acc, ioffset_n16_view.select<16, 1>(), mask_n16,
206+
props_cache_load);
207+
208+
// CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}})
209+
// CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
210+
prefetch<float, 32>(acc, 0, props_cache_load);
211+
prefetch<float, 32>(acc, 0, 1, props_cache_load);
212+
}
213+
214+
// CHECK-LABEL: define {{.*}} @_Z7test_2d{{.*}}
215+
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_2d(float *ptr) {
216+
properties props_cache_load{cache_hint_L1<cache_hint::streaming>,
217+
cache_hint_L2<cache_hint::uncached>};
218+
simd<float, 16> Vals;
219+
auto Vals_view = Vals.select<16, 1>();
220+
221+
constexpr int BlockWidth = 16;
222+
constexpr int BlockHeight = 1;
223+
constexpr int NBlocks = 1;
224+
constexpr bool Transposed = false;
225+
constexpr bool Transformed = false;
226+
227+
unsigned SurfaceWidth;
228+
unsigned SurfaceHeight;
229+
unsigned SurfacePitch;
230+
int X;
231+
int Y;
232+
// Test USM 2d API using this plan:
233+
// 1) prefetch_2d(): combinations of explicit and default template parameters
234+
// 2) load_2d(): combinations of explicit and default template parameters
235+
// 3) same as (2) but without compile time properties
236+
// 4) store_2d(): combinations of explicit and default template parameters
237+
// 5) same as (4) but without compile time properties
238+
239+
// CHECK-COUNT-3: call void @llvm.genx.lsc.prefetch2d.stateless.v1i1.i64(<1 x i1> {{[^)]+}}, i8 5, i8 1, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}})
240+
prefetch_2d<float, BlockWidth, BlockHeight, NBlocks>(
241+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
242+
prefetch_2d<float, BlockWidth, BlockHeight>(
243+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
244+
prefetch_2d<float, BlockWidth>(ptr, SurfaceWidth, SurfaceHeight, SurfacePitch,
245+
X, Y, props_cache_load);
246+
247+
// CHECK-COUNT-5: call <16 x float> @llvm.genx.lsc.load2d.stateless.v16f32.v1i1.i64(<1 x i1> {{[^)]+}}, i8 5, i8 1, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}})
248+
Vals =
249+
load_2d<float, BlockWidth, BlockHeight, NBlocks, Transposed, Transformed>(
250+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
251+
props_cache_load);
252+
Vals = load_2d<float, BlockWidth, BlockHeight, NBlocks, Transposed>(
253+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
254+
Vals = load_2d<float, BlockWidth, BlockHeight, NBlocks>(
255+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
256+
Vals = load_2d<float, BlockWidth, BlockHeight>(
257+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, props_cache_load);
258+
Vals = load_2d<float, BlockWidth>(ptr, SurfaceWidth, SurfaceHeight,
259+
SurfacePitch, X, Y, props_cache_load);
260+
261+
// CHECK-COUNT-5: call <16 x float> @llvm.genx.lsc.load2d.stateless.v16f32.v1i1.i64(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}})
262+
Vals =
263+
load_2d<float, BlockWidth, BlockHeight, NBlocks, Transposed, Transformed>(
264+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
265+
Vals = load_2d<float, BlockWidth, BlockHeight, NBlocks, Transposed>(
266+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
267+
Vals = load_2d<float, BlockWidth, BlockHeight, NBlocks>(
268+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
269+
Vals = load_2d<float, BlockWidth, BlockHeight>(
270+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
271+
Vals = load_2d<float, BlockWidth>(ptr, SurfaceWidth, SurfaceHeight,
272+
SurfacePitch, X, Y);
273+
274+
// CHECK-COUNT-4: call void @llvm.genx.lsc.store2d.stateless.v1i1.i64.v16f32(<1 x i1> {{[^)]+}}, i8 5, i8 1, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, <16 x float> {{[^)]+}})
275+
store_2d<float, BlockWidth, BlockHeight>(ptr, SurfaceWidth, SurfaceHeight,
276+
SurfacePitch, X, Y, Vals,
277+
props_cache_load);
278+
store_2d<float, BlockWidth>(ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X,
279+
Y, Vals, props_cache_load);
280+
store_2d<float, BlockWidth, BlockHeight, 16>(ptr, SurfaceWidth, SurfaceHeight,
281+
SurfacePitch, X, Y, Vals_view,
282+
props_cache_load);
283+
store_2d<float, BlockWidth, BlockHeight, 16>(
284+
ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
285+
Vals_view.select<16, 1>(), props_cache_load);
286+
287+
// CHECK-COUNT-4: call void @llvm.genx.lsc.store2d.stateless.v1i1.i64.v16f32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 3, i8 1, i8 1, i16 16, i16 1, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, <16 x float> {{[^)]+}})
288+
store_2d<float, BlockWidth, BlockHeight>(ptr, SurfaceWidth, SurfaceHeight,
289+
SurfacePitch, X, Y, Vals);
290+
store_2d<float, BlockWidth>(ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X,
291+
Y, Vals);
292+
store_2d<float, BlockWidth, BlockHeight, 16>(ptr, SurfaceWidth, SurfaceHeight,
293+
SurfacePitch, X, Y, Vals_view);
294+
store_2d<float, BlockWidth, BlockHeight, 16>(ptr, SurfaceWidth, SurfaceHeight,
295+
SurfacePitch, X, Y,
296+
Vals_view.select<16, 1>());
297+
}

0 commit comments

Comments
 (0)