-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU] Do not fold an immediate into instructions with frame indexes #151263
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Do not fold an immediate into an instruction that already has a frame index operand. The frame index may itself be resolved to an immediate later, which would leave the instruction with two immediate operands.
@llvm/pr-subscribers-backend-amdgpu Author: Changpeng Fang (changpeng) Changes: Do not fold an immediate into an instruction that already has a frame index operand. Patch is 24.34 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/151263.diff 8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c2da937552240..49b1683bf1abe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6120,10 +6120,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
!Op.isIdenticalTo(*MO))
return false;
- // Do not fold a frame index into an instruction that already has a frame
- // index. The frame index handling code doesn't handle fixing up operand
- // constraints if there are multiple indexes.
- if (Op.isFI() && MO->isFI())
+ // Do not fold a frame index or an immediate into an instruction that
+ // already has a frame index. The frame index handling code doesn't handle
+ // fixing up operand constraints if there are multiple indexes, and a
+ // frame index could possibly turn out to be another immediate.
+ if (Op.isFI() && (MO->isFI() || MO->isImm()))
return false;
}
} else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a066b15f84d6b..e6a8baceee020 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1917,8 +1917,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1933,7 +1934,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1945,10 +1947,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX942-LABEL: store_load_large_imm_offset_kernel:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s0, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1958,7 +1961,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -1986,8 +1991,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2002,7 +2008,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2014,10 +2021,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2027,7 +2035,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2061,11 +2071,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2076,8 +2088,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2089,11 +2103,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX942-LABEL: store_load_large_imm_offset_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_add_i32 s1, s32, s0
; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s1, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2104,7 +2120,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s32, s0
+; GFX11-NEXT: s_add_i32 s0, s1, 4
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2133,11 +2152,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX9: ; %bb.0: ; %bb
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2148,8 +2169,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX10: ; %bb.0: ; %bb
; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2161,11 +2184,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2176,7 +2201,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index b25d9b245f5f6..fc8883924dfbc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -3621,7 +3621,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3637,7 +3638,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3804
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3682,7 +3684,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3716,8 +3719,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3739,7 +3743,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3785,10 +3790,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3800,8 +3807,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3843,10 +3852,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3872,8 +3883,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0
+; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 7fad2f466bc9f..a88b1ecc40cc9 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -75,7 +75,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0
- ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
index cc4314263bcba..2f2d727ee2c59 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
@@ -46,7 +46,8 @@ body: |
%2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc
...
# GCN-LABEL: name: test_frameindex{{$}}
-# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70
+# GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 70
+# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]]
---
name: test_frameindex
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 15cda622b902d..f2fe61f5376e4 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -360,7 +360,8 @@ entry:
; s_add_i32.
; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error:
-; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010
+; GCN: s_movk_i32 [[S_MOVK_I32_:s[0-9]+]], 0x1000
+; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0x1010, [[S_MOVK_I32_]]
; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014e33e7..300124848c1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -6,16 +6,24 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
-; CHECK-NEXT: s_add_i32 s1, s32, 0xf4
-; CHECK-NEXT: s_add_i32 s2, s32, 0xf8
-; CHECK-NEXT: s_add_i32 s3, s32, 0xfc
+; CHECK-NEXT: s_movk_i32 s1, 0xf4
+; CHECK-NEXT: s_movk_i32 s2, 0xf8
+; CHECK-NEXT: s_movk_i32 s3, 0xfc
+; CHECK-NEXT: s_movk_i32 s34, 0x100
; CHECK-NEXT: v_mov_b32_e32 v1, v0
-; CHECK-NEXT: s_add_i32 s34, s32, 0x100
-; CHECK-NEXT: s_add_i32 s35, s32, 0x104
-; CHECK-NEXT: s_add_i32 s36, s32, 0x108
-; CHECK-NEXT: s_add_i32 s37, s32, 0x110
-; CHECK-NEXT: s_add_i32 s38, s32, 0x120
+; CHECK-NEXT: s_movk_i3...
[truncated]
|
@llvm/pr-subscribers-llvm-globalisel Author: Changpeng Fang (changpeng) Changes: Do not fold an immediate into an instruction that already has a frame index operand. Patch is 24.34 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/151263.diff 8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c2da937552240..49b1683bf1abe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6120,10 +6120,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
!Op.isIdenticalTo(*MO))
return false;
- // Do not fold a frame index into an instruction that already has a frame
- // index. The frame index handling code doesn't handle fixing up operand
- // constraints if there are multiple indexes.
- if (Op.isFI() && MO->isFI())
+ // Do not fold a frame index or an immediate into an instruction that
+ // already has a frame index. The frame index handling code doesn't handle
+ // fixing up operand constraints if there are multiple indexes, and a
+ // frame index could possibly turn out to be another immediate.
+ if (Op.isFI() && (MO->isFI() || MO->isImm()))
return false;
}
} else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a066b15f84d6b..e6a8baceee020 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1917,8 +1917,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1933,7 +1934,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1945,10 +1947,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX942-LABEL: store_load_large_imm_offset_kernel:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s0, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1958,7 +1961,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -1986,8 +1991,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2002,7 +2008,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2014,10 +2021,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2027,7 +2035,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2061,11 +2071,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2076,8 +2088,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2089,11 +2103,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX942-LABEL: store_load_large_imm_offset_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_add_i32 s1, s32, s0
; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s1, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2104,7 +2120,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s32, s0
+; GFX11-NEXT: s_add_i32 s0, s1, 4
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2133,11 +2152,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX9: ; %bb.0: ; %bb
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2148,8 +2169,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX10: ; %bb.0: ; %bb
; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2161,11 +2184,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2176,7 +2201,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index b25d9b245f5f6..fc8883924dfbc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -3621,7 +3621,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3637,7 +3638,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3804
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3682,7 +3684,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3716,8 +3719,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3739,7 +3743,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3785,10 +3790,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3800,8 +3807,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3843,10 +3852,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3872,8 +3883,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0
+; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 7fad2f466bc9f..a88b1ecc40cc9 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -75,7 +75,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0
- ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
index cc4314263bcba..2f2d727ee2c59 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
@@ -46,7 +46,8 @@ body: |
%2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc
...
# GCN-LABEL: name: test_frameindex{{$}}
-# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70
+# GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 70
+# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]]
---
name: test_frameindex
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 15cda622b902d..f2fe61f5376e4 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -360,7 +360,8 @@ entry:
; s_add_i32.
; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error:
-; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010
+; GCN: s_movk_i32 [[S_MOVK_I32_:s[0-9]+]], 0x1000
+; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0x1010, [[S_MOVK_I32_]]
; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014e33e7..300124848c1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -6,16 +6,24 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
-; CHECK-NEXT: s_add_i32 s1, s32, 0xf4
-; CHECK-NEXT: s_add_i32 s2, s32, 0xf8
-; CHECK-NEXT: s_add_i32 s3, s32, 0xfc
+; CHECK-NEXT: s_movk_i32 s1, 0xf4
+; CHECK-NEXT: s_movk_i32 s2, 0xf8
+; CHECK-NEXT: s_movk_i32 s3, 0xfc
+; CHECK-NEXT: s_movk_i32 s34, 0x100
; CHECK-NEXT: v_mov_b32_e32 v1, v0
-; CHECK-NEXT: s_add_i32 s34, s32, 0x100
-; CHECK-NEXT: s_add_i32 s35, s32, 0x104
-; CHECK-NEXT: s_add_i32 s36, s32, 0x108
-; CHECK-NEXT: s_add_i32 s37, s32, 0x110
-; CHECK-NEXT: s_add_i32 s38, s32, 0x120
+; CHECK-NEXT: s_movk_i3...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing a new test that would break. This looks too conservative; it will always be OK if the immediate is an inline immediate
It is filtered out under the condition (MO) !IsInlineConst. |
new test added |
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 | ||
; GFX10-NEXT: v_mov_b32_e32 v0, 13 | ||
; GFX10-NEXT: v_mov_b32_e32 v1, 15 | ||
; GFX10-NEXT: s_movk_i32 s0, 0x3e84 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The codegen looks better in the original case. It is true for most of the lit-test changes introduced in this patch.
Was this move originally incorrect? Or is this code bloat a side-effect of this fix?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The codegen looks better in the original case. It is true for most of the lit-test changes introduced in this patch. Was this move originally incorrect? Or is this code bloat a side-effect of this fix?
If an instruction has both FI and imm operands, it could possibly end up with two immediates. So this change fixes the correctness issue conservatively.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok.
Hi, @arsenm: A new test has been added. We already filter out the case of an inline immediate, i.e. we can still fold a frame index into an instruction with an inline immediate operand. Also, do you think it would be cheaper to just revert the offending patch, #140587? |
// already has a frame index. The frame index handling code doesn't handle | ||
// fixing up operand constraints if there are multiple indexes, and a | ||
// frame index could possibly turn out to be another immediate. | ||
if (Op.isFI() && (MO->isFI() || MO->isImm())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This probably should be !isReg, the same issue probably exists for any exotic operand types
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This probably should be !isReg, the same issue probably exists for any exotic operand types
We are already under the following condition, so that would mean completely giving up whenever the instruction has an FI operand:
if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
bb: | ||
%alloca = alloca <4 x i64>, align 32, addrspace(5) | ||
%alloca1 = alloca <16 x i64>, align 128, addrspace(5) | ||
%addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are these casts really necessary?
%alloca = alloca <4 x i64>, align 32, addrspace(5) | ||
%alloca1 = alloca <16 x i64>, align 128, addrspace(5) | ||
%addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr | ||
store volatile <4 x i64> poison, ptr %addrspacecast, align 32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Store a real value?
1) Extend the restriction to all !isReg: the same issue probably exists for any exotic operand types. 2) Update the newly added LIT test: -- remove unnecessary addrspacecasts -- store a real value instead of poison
Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
llvm#151263) Do not fold an immediate into an instruction that already has a frame index operand. A frame index could possibly turn out to be another immediate. Fixes: SWDEV-536263 --------- Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
Do not fold an immediate into an instruction that already has a frame index operand.
A frame index could possibly turn out to be another immediate.
Fixes: SWDEV-536263