Skip to content

Commit f6b9afa

Browse files
committed
[AMDGPU] Extend and reorganize memory legalizer tests
* Rename some tests to move toward a naming convention, in which every component is optional: `<addrspace>_<syncscope>_<memory-orders>_<operation>`.
* Split the tests up at a level of granularity that is appropriate for the different RUN lines (i.e. split on addrspace so GFX6 can avoid FLAT) and that keeps the wall time of running a specific test reasonable. This also means that, when run as part of the test suite, the testing is no longer one serial bottleneck.
* Auto-generate check lines with `update_llc_test_checks.py` to make future maintenance more tractable.

Reviewed By: rampitec, t-tye

Differential Revision: https://reviews.llvm.org/D91545
1 parent c282b7d commit f6b9afa

27 files changed

+74831
-8120
lines changed

llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll

Lines changed: 0 additions & 526 deletions
This file was deleted.

llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll

Lines changed: 0 additions & 3292 deletions
This file was deleted.

llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll

Lines changed: 0 additions & 719 deletions
This file was deleted.

llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll

Lines changed: 0 additions & 1370 deletions
This file was deleted.

llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll

Lines changed: 1229 additions & 0 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll

Lines changed: 5098 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6+
7+
; Kernel under test: loads an i32 from %in with !nontemporal metadata attached
; to the load, then writes the value to %out with an ordinary store.
; The autogenerated assertions below expect the flat_load_dword to carry
; `glc slc` under the GFX7 and SKIP-CACHE-INV prefixes and `slc` alone under
; both gfx1010 prefixes; the flat_store_dword carries no cache bits anywhere.
define amdgpu_kernel void @flat_nontemporal_load_0(
8+
; GFX7-LABEL: flat_nontemporal_load_0:
9+
; GFX7: ; %bb.0: ; %entry
10+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
11+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
12+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
13+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
14+
; GFX7-NEXT: flat_load_dword v0, v[0:1] glc slc
15+
; GFX7-NEXT: v_mov_b32_e32 v2, s2
16+
; GFX7-NEXT: v_mov_b32_e32 v3, s3
17+
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18+
; GFX7-NEXT: flat_store_dword v[2:3], v0
19+
; GFX7-NEXT: s_endpgm
20+
;
21+
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
22+
; GFX10-WGP: ; %bb.0: ; %entry
23+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
24+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
25+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
26+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
27+
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
28+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
29+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
30+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
31+
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
32+
; GFX10-WGP-NEXT: s_endpgm
33+
;
34+
; GFX10-CU-LABEL: flat_nontemporal_load_0:
35+
; GFX10-CU: ; %bb.0: ; %entry
36+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
37+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
38+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
39+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
40+
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
41+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
42+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
43+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
44+
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
45+
; GFX10-CU-NEXT: s_endpgm
46+
;
47+
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
48+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
49+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
50+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
51+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
52+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
53+
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc slc
54+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
55+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
56+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
57+
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
58+
; SKIP-CACHE-INV-NEXT: s_endpgm
59+
i32* %in, i32* %out) {
60+
entry:
61+
%val = load i32, i32* %in, align 4, !nontemporal !0
62+
store i32 %val, i32* %out
63+
ret void
64+
}
65+
66+
; Kernel under test: computes %in + workitem.id.x (as an i32 element index),
; loads an i32 from that address with !nontemporal metadata attached to the
; load, then writes the value to %out with an ordinary store.
; The autogenerated assertions below expect the flat_load_dword to carry
; `glc slc` under the GFX7 and SKIP-CACHE-INV prefixes and `slc` alone under
; both gfx1010 prefixes; the flat_store_dword carries no cache bits anywhere.
define amdgpu_kernel void @flat_nontemporal_load_1(
67+
; GFX7-LABEL: flat_nontemporal_load_1:
68+
; GFX7: ; %bb.0: ; %entry
69+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
70+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
71+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
72+
; GFX7-NEXT: v_mov_b32_e32 v3, s1
73+
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
74+
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
75+
; GFX7-NEXT: flat_load_dword v2, v[2:3] glc slc
76+
; GFX7-NEXT: v_mov_b32_e32 v0, s2
77+
; GFX7-NEXT: v_mov_b32_e32 v1, s3
78+
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
79+
; GFX7-NEXT: flat_store_dword v[0:1], v2
80+
; GFX7-NEXT: s_endpgm
81+
;
82+
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
83+
; GFX10-WGP: ; %bb.0: ; %entry
84+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
85+
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
86+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
87+
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
88+
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
89+
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
90+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
91+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
92+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
93+
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
94+
; GFX10-WGP-NEXT: s_endpgm
95+
;
96+
; GFX10-CU-LABEL: flat_nontemporal_load_1:
97+
; GFX10-CU: ; %bb.0: ; %entry
98+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
99+
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
100+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
101+
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
102+
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
103+
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
104+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
105+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
106+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
107+
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
108+
; GFX10-CU-NEXT: s_endpgm
109+
;
110+
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
111+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
112+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
113+
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v2, 2, v0
114+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
115+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
116+
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v2, vcc, s0, v2
117+
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
118+
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[2:3] glc slc
119+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
120+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
121+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
122+
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
123+
; SKIP-CACHE-INV-NEXT: s_endpgm
124+
i32* %in, i32* %out) {
125+
entry:
126+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
127+
%val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
128+
%val = load i32, i32* %val.gep, align 4, !nontemporal !0
129+
store i32 %val, i32* %out
130+
ret void
131+
}
132+
133+
; Kernel under test: loads an i32 from %in with an ordinary load, then writes
; it to %out with !nontemporal metadata attached to the store.
; The autogenerated assertions below expect the flat_store_dword to carry
; `glc slc` under the GFX7 and SKIP-CACHE-INV prefixes and `slc` alone under
; both gfx1010 prefixes; the flat_load_dword carries no cache bits anywhere.
define amdgpu_kernel void @flat_nontemporal_store_0(
134+
; GFX7-LABEL: flat_nontemporal_store_0:
135+
; GFX7: ; %bb.0: ; %entry
136+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
137+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
138+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
139+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
140+
; GFX7-NEXT: flat_load_dword v0, v[0:1]
141+
; GFX7-NEXT: v_mov_b32_e32 v2, s2
142+
; GFX7-NEXT: v_mov_b32_e32 v3, s3
143+
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
144+
; GFX7-NEXT: flat_store_dword v[2:3], v0 glc slc
145+
; GFX7-NEXT: s_endpgm
146+
;
147+
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
148+
; GFX10-WGP: ; %bb.0: ; %entry
149+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
150+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
151+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
152+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
153+
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
154+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
155+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
156+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
157+
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc
158+
; GFX10-WGP-NEXT: s_endpgm
159+
;
160+
; GFX10-CU-LABEL: flat_nontemporal_store_0:
161+
; GFX10-CU: ; %bb.0: ; %entry
162+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
163+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
164+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
165+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
166+
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
167+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
168+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
169+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
170+
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc
171+
; GFX10-CU-NEXT: s_endpgm
172+
;
173+
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
174+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
175+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
176+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
177+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
178+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
179+
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
180+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
181+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
182+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
183+
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 glc slc
184+
; SKIP-CACHE-INV-NEXT: s_endpgm
185+
i32* %in, i32* %out) {
186+
entry:
187+
%val = load i32, i32* %in, align 4
188+
store i32 %val, i32* %out, !nontemporal !0
189+
ret void
190+
}
191+
192+
; Kernel under test: loads an i32 from %in with an ordinary load, computes
; %out + workitem.id.x (as an i32 element index), and writes the value there
; with !nontemporal metadata attached to the store.
; The autogenerated assertions below expect the flat_store_dword to carry
; `glc slc` under the GFX7 and SKIP-CACHE-INV prefixes and `slc` alone under
; both gfx1010 prefixes; the flat_load_dword carries no cache bits anywhere.
define amdgpu_kernel void @flat_nontemporal_store_1(
193+
; GFX7-LABEL: flat_nontemporal_store_1:
194+
; GFX7: ; %bb.0: ; %entry
195+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
196+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
197+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
198+
; GFX7-NEXT: v_mov_b32_e32 v1, s0
199+
; GFX7-NEXT: v_mov_b32_e32 v2, s1
200+
; GFX7-NEXT: flat_load_dword v2, v[1:2]
201+
; GFX7-NEXT: v_mov_b32_e32 v1, s3
202+
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
203+
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
204+
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
205+
; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
206+
; GFX7-NEXT: s_endpgm
207+
;
208+
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
209+
; GFX10-WGP: ; %bb.0: ; %entry
210+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
211+
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
212+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
213+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
214+
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
215+
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
216+
; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
217+
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
218+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
219+
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc
220+
; GFX10-WGP-NEXT: s_endpgm
221+
;
222+
; GFX10-CU-LABEL: flat_nontemporal_store_1:
223+
; GFX10-CU: ; %bb.0: ; %entry
224+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
225+
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
226+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
227+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
228+
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
229+
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
230+
; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
231+
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
232+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
233+
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc
234+
; GFX10-CU-NEXT: s_endpgm
235+
;
236+
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
237+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
238+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
239+
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
240+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
241+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
242+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
243+
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2]
244+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
245+
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
246+
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
247+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
248+
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc
249+
; SKIP-CACHE-INV-NEXT: s_endpgm
250+
i32* %in, i32* %out) {
251+
entry:
252+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
253+
%val = load i32, i32* %in, align 4
254+
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
255+
store i32 %val, i32* %out.gep, !nontemporal !0
256+
ret void
257+
}
258+
259+
; Metadata node referenced as `!nontemporal !0` by the loads and stores in the
; kernels of this file.
!0 = !{i32 1}
260+
; Intrinsic called by the kernels that index their pointer with %tid.
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)