@@ -5,11 +5,22 @@ target triple = "amdgcn-amd-amdhsa"
 
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
 
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
-; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA-DAG: ds_write_b32 [[PTR]], [[K]]
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 
 ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -28,8 +39,22 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
 
 ; Test handling inside a non-kernel
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA-DAG: ds_write_b32 v0, [[K]]
+
+; GFX9-DAG: v_mov_b32_e32 v[[VREG_HIBASE:[0-9]+]], s[[HIBASE]]
+; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
+; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, v[[VREG_HIBASE]], vcc
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 7, ptr %stof
@@ -38,16 +63,23 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 
 ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
 
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
-; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
+
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 
 ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -65,12 +97,10 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
 
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
-; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
-; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
-; GFX9-DAG: v_mov_b32_e32 [[ADDR:v[0-9]+]], 0
-; GFX9: global_store_dword [[ADDR]], [[K]], s[[[PTRLO]]:[[PTRHI]]]
+; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
 
 ; HSA: .amdhsa_user_sgpr_queue_ptr 0
 define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #0 {
@@ -82,7 +112,9 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 ; no-op
 ; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
-; HSA-DAG: s_load_dword s0, s[[[PTRLO]]:[[PTRHI]]], 0x0
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]
 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #0 {
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   %ld = load volatile i32, ptr %stof
@@ -183,9 +215,11 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #0 {
 }
 
 ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+
 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: ds_write_b32 v[[LO]], v[[K]]
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast ptr addrspace(3) null to ptr
   store volatile i32 7, ptr %cast
@@ -203,9 +237,10 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
 }
 
 ; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1
-; HSA: ds_write_b32 v[[LO]], v[[K]]
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast ptr addrspace(3) inttoptr (i32 -1 to ptr addrspace(3)) to ptr
   store volatile i32 7, ptr %cast
@@ -224,13 +259,10 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 
 ; FIXME: Shouldn't need to enable queue ptr
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
   %cast = addrspacecast ptr addrspace(5) null to ptr
   store volatile i32 7, ptr %cast
@@ -249,14 +281,10 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 
 ; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
 
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword v[[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
 
 ; CI: .amdhsa_user_sgpr_queue_ptr 1
 ; GFX9: .amdhsa_user_sgpr_queue_ptr 0
@@ -306,18 +334,16 @@ end:
 
 ; Check for prologue initializing special SGPRs pointing to scratch.
 ; HSA-LABEL: {{^}}store_flat_scratch:
+; CI-DAG: s_mov_b32 flat_scratch_lo, s9
 ; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11
 ; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
-; HSA: buffer_store_dword
-; HSA: s_barrier
-; HSA: buffer_load_dword [[K:v[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen glc
-; HSA-DAG: s_load_dwordx2
-; CI-DAG: s_mov_b32 flat_scratch_lo, s9
-; CI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GFX9-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0
-; CI: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
-; GFX9: global_store_dword [[PTR]], [[K]]
+
+; GFX9: s_add_u32 flat_scratch_lo, s6, s9
+; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
+
+; HSA: {{flat|global}}_store_dword
+; HSA: s_barrier
+; HSA: {{flat|global}}_load_dword
 define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32) #0 {
   %alloca = alloca i32, i32 9, align 4, addrspace(5)
   %x = call i32 @llvm.amdgcn.workitem.id.x() #2