Skip to content

Commit 89ec940

Browse files
authored
[AMDGPU] Insert spill codes for the SGPRs used for EXEC copy (#79428)
The SGPRs used to preserve the EXEC mask while lowering whole-wave register spills and copies should be saved and restored in the prolog and epilog if they are in the CSR (callee-saved register) range. This was not happening when only whole-wave (WWM) copies were lowered and there were no WWM spills. This patch addresses that problem.
1 parent b31fffb commit 89ec940

File tree

4 files changed

+135
-18
lines changed

4 files changed

+135
-18
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

+12-5
Original file line numberDiff line numberDiff line change
@@ -1504,22 +1504,29 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
15041504

15051505
const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
15061506

1507-
if (NeedExecCopyReservedReg) {
1508-
Register ReservedReg = MFI->getSGPRForEXECCopy();
1509-
assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
1507+
Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
1508+
if (NeedExecCopyReservedReg ||
1509+
(ReservedRegForExecCopy &&
1510+
MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
1511+
MRI.reserveReg(ReservedRegForExecCopy, TRI);
15101512
Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
15111513
if (UnusedScratchReg) {
15121514
// If an unused scratch SGPR is found, reserve that register itself for the EXEC
15131515
// copy and there is no need for any spills in that case.
15141516
MFI->setSGPRForEXECCopy(UnusedScratchReg);
1517+
MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
15151518
LiveUnits.addReg(UnusedScratchReg);
15161519
} else {
15171520
// Needs spill.
1518-
assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
1521+
assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
15191522
"Re-reserving spill slot for EXEC copy register");
1520-
getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
1523+
getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
15211524
/*IncludeScratchCopy=*/false);
15221525
}
1526+
} else if (ReservedRegForExecCopy) {
1527+
// Reset it at this point. There are no whole-wave copies and spills
1528+
// encountered.
1529+
MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
15231530
}
15241531

15251532
// hasFP only knows about stack objects that already exist. We're now

llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll

+25-11
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
3232
; GFX906-NEXT: v_writelane_b32 v2, s24, 5
3333
; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11]
3434
; GFX906-NEXT: v_writelane_b32 v2, s26, 6
35+
; GFX906-NEXT: v_writelane_b32 v41, s34, 2
3536
; GFX906-NEXT: v_writelane_b32 v2, s27, 7
37+
; GFX906-NEXT: v_writelane_b32 v41, s35, 3
3638
; GFX906-NEXT: v_writelane_b32 v2, s8, 8
37-
; GFX906-NEXT: v_writelane_b32 v41, s16, 2
39+
; GFX906-NEXT: v_writelane_b32 v41, s16, 4
3840
; GFX906-NEXT: v_writelane_b32 v2, s9, 9
3941
; GFX906-NEXT: v_writelane_b32 v41, s30, 0
4042
; GFX906-NEXT: v_writelane_b32 v2, s4, 10
@@ -338,7 +340,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
338340
; GFX906-NEXT: v_readlane_b32 s31, v41, 1
339341
; GFX906-NEXT: v_readlane_b32 s30, v41, 0
340342
; GFX906-NEXT: ; kill: killed $vgpr40
341-
; GFX906-NEXT: v_readlane_b32 s4, v41, 2
343+
; GFX906-NEXT: v_readlane_b32 s34, v41, 2
344+
; GFX906-NEXT: v_readlane_b32 s35, v41, 3
345+
; GFX906-NEXT: v_readlane_b32 s4, v41, 4
342346
; GFX906-NEXT: s_waitcnt vmcnt(0)
343347
; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112
344348
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -379,23 +383,27 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
379383
; GFX908-NEXT: s_mov_b64 exec, -1
380384
; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
381385
; GFX908-NEXT: s_mov_b64 exec, s[18:19]
382-
; GFX908-NEXT: v_mov_b32_e32 v3, s16
386+
; GFX908-NEXT: v_mov_b32_e32 v3, s34
383387
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
388+
; GFX908-NEXT: v_mov_b32_e32 v3, s35
389+
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
390+
; GFX908-NEXT: v_mov_b32_e32 v3, s16
391+
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
384392
; GFX908-NEXT: s_addk_i32 s32, 0x2c00
385393
; GFX908-NEXT: s_mov_b64 s[16:17], exec
386394
; GFX908-NEXT: s_mov_b64 exec, 1
387-
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164
395+
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172
388396
; GFX908-NEXT: v_writelane_b32 v2, s30, 0
389397
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
390-
; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164
398+
; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172
391399
; GFX908-NEXT: s_waitcnt vmcnt(0)
392400
; GFX908-NEXT: s_mov_b64 exec, s[16:17]
393401
; GFX908-NEXT: s_mov_b64 s[16:17], exec
394402
; GFX908-NEXT: s_mov_b64 exec, 1
395-
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164
403+
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172
396404
; GFX908-NEXT: v_writelane_b32 v2, s31, 0
397405
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
398-
; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164
406+
; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172
399407
; GFX908-NEXT: s_waitcnt vmcnt(0)
400408
; GFX908-NEXT: s_mov_b64 exec, s[16:17]
401409
; GFX908-NEXT: ; implicit-def: $vgpr2
@@ -729,25 +737,31 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
729737
; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
730738
; GFX908-NEXT: s_waitcnt vmcnt(0)
731739
; GFX908-NEXT: s_mov_b64 exec, 1
732-
; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164
740+
; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172
733741
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
734742
; GFX908-NEXT: s_waitcnt vmcnt(0)
735743
; GFX908-NEXT: v_readlane_b32 s31, v0, 0
736-
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164
744+
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
737745
; GFX908-NEXT: s_waitcnt vmcnt(0)
738746
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
739747
; GFX908-NEXT: s_mov_b64 s[4:5], exec
740748
; GFX908-NEXT: s_mov_b64 exec, 1
741-
; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164
749+
; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172
742750
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
743751
; GFX908-NEXT: s_waitcnt vmcnt(0)
744752
; GFX908-NEXT: v_readlane_b32 s30, v0, 0
745-
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164
753+
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
746754
; GFX908-NEXT: s_waitcnt vmcnt(0)
747755
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
748756
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
749757
; GFX908-NEXT: ; kill: killed $vgpr40
750758
; GFX908-NEXT: s_waitcnt vmcnt(0)
759+
; GFX908-NEXT: v_readfirstlane_b32 s34, v0
760+
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
761+
; GFX908-NEXT: s_waitcnt vmcnt(0)
762+
; GFX908-NEXT: v_readfirstlane_b32 s35, v0
763+
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
764+
; GFX908-NEXT: s_waitcnt vmcnt(0)
751765
; GFX908-NEXT: v_readfirstlane_b32 s4, v0
752766
; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1
753767
; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -run-pass=prologepilog -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
3+
4+
# Free scratch SGPRs are available, so the smallest available pair is used for the EXEC copy
5+
# instead of the pair in the CSR range already given.
6+
7+
---
8+
name: shift_back_exec_copy_reserved_reg
9+
tracksRegLiveness: true
10+
frameInfo:
11+
maxAlignment: 4
12+
machineFunctionInfo:
13+
isEntryFunction: false
14+
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
15+
stackPtrOffsetReg: '$sgpr32'
16+
frameOffsetReg: '$sgpr33'
17+
sgprForEXECCopy: '$sgpr34_sgpr35'
18+
body: |
19+
bb.0:
20+
liveins: $sgpr30_sgpr31, $vgpr0
21+
; GCN-LABEL: name: shift_back_exec_copy_reserved_reg
22+
; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
23+
; GCN-NEXT: {{ $}}
24+
; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
25+
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
26+
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
27+
; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
28+
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0
29+
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0
30+
; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
31+
; GCN-NEXT: $vgpr1 = COPY $vgpr0
32+
; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
33+
; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
34+
; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
35+
; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31
36+
$vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0
37+
$vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0
38+
$sgpr34_sgpr35 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
39+
$vgpr1 = COPY $vgpr0
40+
$exec = S_MOV_B64 killed $sgpr34_sgpr35
41+
$sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
42+
$sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
43+
S_SETPC_B64 $sgpr30_sgpr31
44+
...
45+
46+
# All scratch registers are made unavailable so that the CSR SGPR pair used for the EXEC copy needs spills at the prolog and epilog.
47+
48+
---
49+
name: spill_exec_copy_reserved_reg
50+
tracksRegLiveness: true
51+
frameInfo:
52+
maxAlignment: 4
53+
machineFunctionInfo:
54+
isEntryFunction: false
55+
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
56+
stackPtrOffsetReg: '$sgpr32'
57+
frameOffsetReg: '$sgpr33'
58+
sgprForEXECCopy: '$sgpr34_sgpr35'
59+
body: |
60+
bb.0:
61+
liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0
62+
; GCN-LABEL: name: spill_exec_copy_reserved_reg
63+
; GCN: liveins: $vcc, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
64+
; GCN-NEXT: {{ $}}
65+
; GCN-NEXT: $sgpr28_sgpr29 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
66+
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
67+
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
68+
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
69+
; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr28_sgpr29
70+
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2
71+
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr35, 1, undef $vgpr2
72+
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0
73+
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0
74+
; GCN-NEXT: $sgpr34_sgpr35 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
75+
; GCN-NEXT: $sgpr28_sgpr29 = IMPLICIT_DEF
76+
; GCN-NEXT: $vgpr1 = COPY $vgpr0
77+
; GCN-NEXT: S_NOP 0, implicit $sgpr28_sgpr29
78+
; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr34_sgpr35
79+
; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
80+
; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
81+
; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8_sgpr9_sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr15, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $vcc
82+
$vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0
83+
$vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0
84+
$sgpr34_sgpr35 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
85+
$sgpr28_sgpr29 = IMPLICIT_DEF
86+
$vgpr1 = COPY $vgpr0
87+
S_NOP 0, implicit $sgpr28_sgpr29
88+
$exec = S_MOV_B64 killed $sgpr34_sgpr35
89+
$sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
90+
$sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
91+
S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8_sgpr9_sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr15, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $vcc
92+
...

llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll

+6-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ define void @vector_reg_liverange_split() #0 {
1818
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1919
; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
2020
; GFX90A-NEXT: s_mov_b64 exec, s[18:19]
21-
; GFX90A-NEXT: v_writelane_b32 v40, s16, 2
21+
; GFX90A-NEXT: v_writelane_b32 v40, s28, 2
22+
; GFX90A-NEXT: v_writelane_b32 v40, s29, 3
23+
; GFX90A-NEXT: v_writelane_b32 v40, s16, 4
2224
; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
2325
; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
2426
; GFX90A-NEXT: s_addk_i32 s32, 0x400
@@ -46,7 +48,9 @@ define void @vector_reg_liverange_split() #0 {
4648
; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
4749
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
4850
; GFX90A-NEXT: ; kill: killed $vgpr0
49-
; GFX90A-NEXT: v_readlane_b32 s4, v40, 2
51+
; GFX90A-NEXT: v_readlane_b32 s28, v40, 2
52+
; GFX90A-NEXT: v_readlane_b32 s29, v40, 3
53+
; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
5054
; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1
5155
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
5256
; GFX90A-NEXT: s_mov_b64 exec, -1

0 commit comments

Comments
 (0)