[AMDGPU] Scavenge a VGPR to eliminate a frame index #166979
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Anshil Gandhi (gandhi56)

Changes

Co-authored by Matt Arsenault

Patch is 34.61 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/166979.diff

2 Files Affected:
- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
- llvm/test/CodeGen/AMDGPU/flat-scratch-ss-to-sv-scavenge.ll
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a6c1af24e13e9..0c7bb95432fe4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2981,8 +2981,42 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
MI, false, 0, !UseSGPR);
- // TODO: for flat scratch another attempt can be made with a VGPR index
- // if no SGPRs can be scavenged.
+ // Fallback: If we need an SGPR but cannot scavenge one and there is no
+ // frame register, try to convert the flat-scratch instruction to use a
+ // VGPR index (SS -> SV) and materialize the offset in a VGPR.
+ if (!TmpSReg && !FrameReg && TII->isFLATScratch(*MI)) {
+ // Reuse an existing VGPR temp if available, otherwise scavenge one.
+ Register VTmp = (!UseSGPR && TmpReg)
+ ? TmpReg
+ : RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, false, 0);
+ if (VTmp) {
+ // Put the large offset into a VGPR and zero the immediate offset.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), VTmp)
+ .addImm(Offset);
+
+ unsigned Opc = MI->getOpcode();
+ int NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
+ if (NewOpc != -1) {
+ int OldSAddrIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
+ int NewVAddrIdx =
+ AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
+ if (OldSAddrIdx == NewVAddrIdx && OldSAddrIdx >= 0) {
+ MI->setDesc(TII->get(NewOpc));
+ // Replace former saddr (now vaddr) with the VGPR index.
+ MI->getOperand(NewVAddrIdx).ChangeToRegister(VTmp, false);
+ // Reset the immediate offset to 0 as it is now in vaddr.
+ MachineOperand *OffOp =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+ assert(OffOp && "Flat scratch SV form must have offset operand");
+ OffOp->setImm(0);
+ return false;
+ }
+ }
+ }
+ }
+
if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
report_fatal_error("Cannot scavenge register in FI elimination!");
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-ss-to-sv-scavenge.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-ss-to-sv-scavenge.ll
new file mode 100644
index 0000000000000..9d8bbc198afa0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-ss-to-sv-scavenge.ll
@@ -0,0 +1,630 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; Ensure we don't crash with: "Cannot scavenge register in FI elimination!"
+; RUN: llc < %s -verify-machineinstrs -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a | FileCheck %s --check-prefix=GFX90A
+; RUN: llc < %s -verify-machineinstrs -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 | FileCheck %s --check-prefix=GFX950
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @issue155902(i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg1, i64 %arg35, i64 %arg34, i64 %arg, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg37, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49, i64 %arg45, i64 %arg36) {
+; GFX90A-LABEL: issue155902:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4008
+; GFX90A-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_writelane_b32 v2, s6, 0
+; GFX90A-NEXT: v_writelane_b32 v2, s7, 1
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_writelane_b32 v2, s6, 2
+; GFX90A-NEXT: v_writelane_b32 v2, s7, 3
+; GFX90A-NEXT: s_load_dwordx2 vcc, s[4:5], 0x10
+; GFX90A-NEXT: s_load_dwordx2 s[98:99], s[4:5], 0x18
+; GFX90A-NEXT: s_load_dwordx2 s[96:97], s[4:5], 0x20
+; GFX90A-NEXT: s_load_dwordx2 s[94:95], s[4:5], 0x28
+; GFX90A-NEXT: s_load_dwordx2 s[92:93], s[4:5], 0x30
+; GFX90A-NEXT: s_load_dwordx2 s[90:91], s[4:5], 0x38
+; GFX90A-NEXT: s_load_dwordx2 s[88:89], s[4:5], 0x40
+; GFX90A-NEXT: s_load_dwordx2 s[86:87], s[4:5], 0x48
+; GFX90A-NEXT: s_load_dwordx2 s[84:85], s[4:5], 0x50
+; GFX90A-NEXT: s_load_dwordx2 s[82:83], s[4:5], 0x58
+; GFX90A-NEXT: s_load_dwordx2 s[80:81], s[4:5], 0x60
+; GFX90A-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x68
+; GFX90A-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x70
+; GFX90A-NEXT: s_load_dwordx2 s[74:75], s[4:5], 0x78
+; GFX90A-NEXT: s_load_dwordx2 s[72:73], s[4:5], 0x80
+; GFX90A-NEXT: s_load_dwordx2 s[70:71], s[4:5], 0x88
+; GFX90A-NEXT: s_load_dwordx2 s[68:69], s[4:5], 0x90
+; GFX90A-NEXT: s_load_dwordx2 s[66:67], s[4:5], 0x98
+; GFX90A-NEXT: s_load_dwordx2 s[64:65], s[4:5], 0xa0
+; GFX90A-NEXT: s_load_dwordx2 s[62:63], s[4:5], 0xa8
+; GFX90A-NEXT: s_load_dwordx2 s[60:61], s[4:5], 0xb0
+; GFX90A-NEXT: s_load_dwordx2 s[58:59], s[4:5], 0xb8
+; GFX90A-NEXT: s_load_dwordx2 s[56:57], s[4:5], 0xc0
+; GFX90A-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0xc8
+; GFX90A-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0xd0
+; GFX90A-NEXT: s_load_dwordx2 s[50:51], s[4:5], 0xd8
+; GFX90A-NEXT: s_load_dwordx2 s[48:49], s[4:5], 0xe0
+; GFX90A-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0xe8
+; GFX90A-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0xf0
+; GFX90A-NEXT: s_load_dwordx2 s[42:43], s[4:5], 0xf8
+; GFX90A-NEXT: s_load_dwordx2 s[40:41], s[4:5], 0x100
+; GFX90A-NEXT: s_load_dwordx2 s[38:39], s[4:5], 0x108
+; GFX90A-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x110
+; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x118
+; GFX90A-NEXT: s_load_dwordx2 s[30:31], s[4:5], 0x120
+; GFX90A-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x128
+; GFX90A-NEXT: s_load_dwordx2 s[26:27], s[4:5], 0x130
+; GFX90A-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x138
+; GFX90A-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x140
+; GFX90A-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x148
+; GFX90A-NEXT: s_load_dwordx2 s[18:19], s[4:5], 0x150
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x158
+; GFX90A-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x160
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x168
+; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x170
+; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x178
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x180
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x188
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_writelane_b32 v2, s4, 4
+; GFX90A-NEXT: v_writelane_b32 v2, s5, 5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_mov_b32 s33, s5
+; GFX90A-NEXT: v_writelane_b32 v2, s33, 6
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x4008
+; GFX90A-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen offset:12
+; GFX90A-NEXT: s_mov_b32 s33, s4
+; GFX90A-NEXT: v_readlane_b32 s4, v2, 6
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x4008
+; GFX90A-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen offset:8
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x4008
+; GFX90A-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_readlane_b32 s4, v2, 0
+; GFX90A-NEXT: v_readlane_b32 s5, v2, 1
+; GFX90A-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NEXT: s_mov_b32 s33, s5
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s4
+; GFX90A-NEXT: v_readlane_b32 s4, v2, 2
+; GFX90A-NEXT: v_readlane_b32 s5, v2, 3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s5
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s4
+; GFX90A-NEXT: v_readlane_b32 s4, v2, 4
+; GFX90A-NEXT: v_readlane_b32 s5, v2, 5
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, vcc_hi
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, vcc_lo
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s99
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s98
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s97
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s96
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s95
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s94
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s93
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s92
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s91
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s90
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s89
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s88
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s87
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s86
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s85
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s84
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s83
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s82
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s81
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s80
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s79
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s78
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s77
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s76
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s75
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s74
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s73
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s72
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s71
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s70
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s69
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s68
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s67
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s66
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s65
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s64
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s63
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s62
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s61
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s60
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s59
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s58
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s57
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s56
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s55
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s54
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s53
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s52
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s51
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s50
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s49
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s48
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s47
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s46
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s45
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s44
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s43
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s42
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s41
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s40
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s39
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s38
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s37
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s36
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s35
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: s_mov_b32 s33, s34
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s33, s31
+; GFX90A-NEXT: v_mov_b32_e32 v0, s33
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: ; kill: def $sgpr30 killed $sgpr30 killed $sgpr30_sgpr31
+; GFX90A-NEXT: v_mov_b32_e32 v0, s30
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s30, s29
+; GFX90A-NEXT: v_mov_b32_e32 v0, s30
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: ; kill: def $sgpr28 killed $sgpr28 killed $sgpr28_sgpr29
+; GFX90A-NEXT: v_mov_b32_e32 v0, s28
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s28, s27
+; GFX90A-NEXT: v_mov_b32_e32 v0, s28
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: ; kill: def $sgpr26 killed $sgpr26 killed $sgpr26_sgpr27
+; GFX90A-NEXT: v_mov_b32_e32 v0, s26
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s26, s25
+; GFX90A-NEXT: v_mov_b32_e32 v0, s26
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX90A-NEXT: ; kill: def $sgpr24 killed $sgpr24 killed $sgpr24_sgpr25
+; GFX90A-NEXT: v_mov_b32_e32 v0, s24
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX90A-NEXT: s_mov_b32 s24, s23
+; GFX90A-NEXT: ...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
arsenm left a comment
Thanks for the feedback. I decided to take a safer route by enhancing the LocalStackSlotAllocation pass instead of removing the feature. I corrected the newly added test as well.
This still does not resolve the issue. It may make the problem less likely to occur, but the actual fix requires something like the patch I posted.
I see. How about I create a new PR for the FI elimination changes? It might still be worth merging this patch.
Yes, should do both |
I pivoted this PR to solve #155902. I will create a new PR to implement the LocalStackSlotAllocation changes.
If the subtarget supports flat scratch SVS mode and there is no SGPR available to replace a frame index, convert a scratch instruction in SS form into SV form by scavenging a VGPR. Co-authored by Matt Arsenault
If the subtarget supports flat scratch SVS mode and there is no SGPR available to replace a frame index, convert a scratch instruction in SS form into SV form and replace the frame index with a scavenged VGPR. Resolves #155902
Co-authored-by: Matt Arsenault matthew.arsenault@amd.com
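
For readers unfamiliar with the flat-scratch addressing forms, here is a minimal sketch of the rewrite the new fallback performs, written in gfx9-style flat-scratch assembly. The registers and the 0x4008 offset are illustrative only, not literal output of the test above:

; Preferred path: materialize the frame offset in a scavenged SGPR and keep the SS (saddr) form.
s_mov_b32 s59, 0x4008
scratch_store_dword off, v0, s59
; Fallback when no SGPR can be scavenged: put the offset in a scavenged VGPR and
; switch the instruction to its SV (vaddr) form with a zero immediate offset.
v_mov_b32_e32 v1, 0x4008
scratch_store_dword v1, v0, off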