[AMDGPU] Folding imm offset in more cases for scratch access #70634
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Ruiling, Song (ruiling)

Changes

For scratch load/store, our hardware only accepts non-negative values in SGPR/VGPR. Besides the case that we can prove from known bits, we can also prove that the value in […]

Patch is 105.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70634.diff

8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..691d644badd24b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1146,10 +1146,23 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ // For `nuw` addition, we should not have negative base address.
+ if (Addr->getFlags().hasNoUnsignedWrap())
+ return true;
+
+ auto *RHS = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ // If the immediate offset is negative, we should not have the base being
+ // negative as well.
+ if (RHS && RHS->getSExtValue() < 0)
+ return true;
+ }
+
// When value in 32-bit Base can be negative calculate scratch offset using
// 32-bit add instruction, otherwise use Base(unsigned) + offset.
return CurDAG->SignBitIsZero(Base);
@@ -1549,7 +1562,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
- isFlatScratchBaseLegal(N0, FlatVariant)) {
+ isFlatScratchBaseLegal(Addr, N0, FlatVariant)) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1782,7 +1795,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
int64_t COffsetVal = 0;
if (CurDAG->isBaseWithConstantOffset(Addr) &&
- isFlatScratchBaseLegal(Addr.getOperand(0))) {
+ isFlatScratchBaseLegal(Addr, Addr.getOperand(0))) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
@@ -1860,7 +1873,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr, SAddr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
@@ -1886,7 +1899,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr, SAddr) ||
+ !isFlatScratchBaseLegal(Addr, VAddr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..8a47757f70bbfbc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ SDValue Addr, SDValue Base,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index af023835c529776..329f0a2068cb072 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -704,11 +704,11 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
-; FLATSCR-NEXT: v_add_u32_e32 v2, 2, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR-NEXT: scratch_load_short_d16 v0, v2, off
+; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -726,22 +726,22 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
; FLATSCR_GFX10: ; %bb.0: ; %bb
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off
-; FLATSCR_GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v0
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v0, v2, off
+; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; GFX11-NEXT: scratch_load_d16_b16 v0, v2, off
+; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index c849cf08094e718..ad4d4a4a30fc6d0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -11,16 +11,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT: scratch_load_ubyte v0, v0, off
+; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT: scratch_load_u8 v0, v0, off
+; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -38,16 +36,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT: scratch_load_sbyte v0, v0, off
+; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT: scratch_load_i8 v0, v0, off
+; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -65,16 +61,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT: scratch_load_ushort v0, v0, off
+; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT: scratch_load_u16 v0, v0, off
+; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -92,16 +86,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT: scratch_load_sshort v0, v0, off
+; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT: scratch_load_i16 v0, v0, off
+; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -359,16 +351,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 1
-; GFX10-NEXT: scratch_load_ubyte v2, off, s2
+; GFX10-NEXT: scratch_load_ubyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 1
-; GFX11-NEXT: scratch_load_u8 v2, off, s0
+; GFX11-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -386,16 +376,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 1
-; GFX10-NEXT: scratch_load_sbyte v2, off, s2
+; GFX10-NEXT: scratch_load_sbyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 1
-; GFX11-NEXT: scratch_load_i8 v2, off, s0
+; GFX11-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -413,16 +401,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 2
-; GFX10-NEXT: scratch_load_ushort v2, off, s2
+; GFX10-NEXT: scratch_load_ushort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 2
-; GFX11-NEXT: scratch_load_u16 v2, off, s0
+; GFX11-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -440,16 +426,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 2
-; GFX10-NEXT: scratch_load_sshort v2, off, s2
+; GFX10-NEXT: scratch_load_sshort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 2
-; GFX11-NEXT: scratch_load_i16 v2, off, s0
+; GFX11-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -713,19 +697,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT: scratch_load_ubyte v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT: scratch_load_u8 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -745,19 +726,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT: scratch_load_sbyte v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT: scratch_load_i8 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -777,19 +755,16 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT: scratch_load_ushort v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT: scratch_load_u16 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -809,19 +784,16 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT: scratch_load_sshort v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT: scratch_load_i16 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 07b3df2a8520aae..fe984ffa653a5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -576,11 +576,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add_u32_e32 v1, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
-; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
+; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -592,24 +591,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 4, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT: scratch_store_dword v0, v2, off
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, 4, v0
-; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
@@ -628,8 +625,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT: scratch_lo...
[truncated]
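Since the patch text is truncated above, here is a condensed, commented restatement of the core SelectionDAG change. It mirrors the first hunk in AMDGPUISelDAGToDAG.cpp and is a sketch for orientation only, not a drop-in replacement:

```cpp
// Sketch of the new legality check: the whole Addr is now passed alongside
// the Base operand so the function can inspect the ADD that produced it.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
                                                uint64_t FlatVariant) const {
  if (FlatVariant != SIInstrFlags::FlatScratch)
    return true;

  if (Addr.getOpcode() == ISD::ADD) {
    // A no-unsigned-wrap add cannot start from a negative (huge unsigned)
    // base and still land on a valid scratch address.
    if (Addr->getFlags().hasNoUnsignedWrap())
      return true;

    // If the immediate offset is negative, the base must be non-negative,
    // since base + (negative imm) has to stay a valid scratch address.
    auto *RHS = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (RHS && RHS->getSExtValue() < 0)
      return true;
  }

  // Otherwise fall back to whatever known bits can prove about the sign bit.
  return CurDAG->SignBitIsZero(Base);
}
```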
Should refine this to specify this is for MUBUF; scratch instructions do have signed offsets (we also still need to implement stack support using negative offsetting).
I don't quite get your point here; the change is for scratch_load/scratch_store. The commit message is saying the address offset in SGPR/VGPR should be non-negative per hardware requirement. If I understand correctly, only the immediate offset in scratch_load/store can be negative, right?
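To make the constraint concrete, here is a toy model of flat-scratch addressing. The bit widths and the offset field size are illustrative assumptions, not taken from any spec; only the split into an unsigned register part plus a signed immediate reflects the discussion above.

```cpp
#include <cstdint>

// Toy model: the SGPR/VGPR part of a flat-scratch address is consumed as an
// unsigned quantity, so a value that is "negative" as int32_t would be read
// as a huge unsigned address. The immediate offset field, by contrast, is
// signed and may be negative.
constexpr int OffsetBits = 13; // illustrative; the real width varies by GPU
constexpr int32_t OffsetMin = -(1 << (OffsetBits - 1));
constexpr int32_t OffsetMax = (1 << (OffsetBits - 1)) - 1;

uint64_t effectiveScratchAddr(uint32_t RegPart, int32_t ImmOffset) {
  // RegPart is zero-extended (unsigned); ImmOffset is sign-extended. The
  // modular addition models how a wrapped sum lands outside valid scratch.
  return uint64_t(RegPart) + uint64_t(int64_t(ImmOffset));
}
```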
@@ -1146,10 +1146,23 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
   return CurDAG->SignBitIsZero(Base);
 }

-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
I don't think you can implement this in `isFlatScratchBaseLegal`. If the address is `saddr+ioffset` or `vaddr+ioffset` then this function is only called on the `saddr` or `vaddr` part, so this is not the right place to check for `nuw` addition.
I think the whole address will be passed to the `Addr` instead of the base in the case `saddr+ioffset` and `vaddr+ioffset`? See `AMDGPUDAGToDAGISel::SelectFlatOffsetImpl()` and `AMDGPUDAGToDAGISel::SelectScratchSAddr()`. I am trying to add some comment locally explaining the function:

// This is used to check whether the address of scratch_load/store in the
// form of `base + immediate offset` is legal with respect to the hardware's
// requirement that the SGPR/VGPR address offset in the flat scratch instruction
// should be unsigned.
BTW, the function was only passed the `base` part; I have added the whole address as well as the base.
OK, I see. I think this patch is probably OK as-is, but the code that it touches could really benefit from some refactoring. For example, `isFlatScratchBaseLegal` no longer just checks the "base" part of the address, and it has to look for ADD nodes that have already been discovered by its callers. In the case of SGPR+VGPR+offset addressing, we could check that both ADDs have the nuw flag set (see the sketch after this comment).

Maybe `selectScratchOffset`, `selectScratchSAddr` and `selectScratchSVAddr` could be combined into one function with flags saying whether it should match saddr or vaddr or both.
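A rough sketch of that suggestion, using a hypothetical helper that is not part of the patch: for saddr+vaddr+imm the address is matched from two nested ADDs, and requiring `nuw` on both would rule out wrapping at either step.

```cpp
// Hypothetical helper (not in the patch): checks that both ADDs in
// (add (add SAddr, VAddr), Imm) carry the no-unsigned-wrap flag, so no
// intermediate sum can wrap and neither register part can be "negative".
static bool bothAddsAreNUW(SDValue Addr) {
  if (Addr.getOpcode() != ISD::ADD || !Addr->getFlags().hasNoUnsignedWrap())
    return false;
  SDValue Inner = Addr.getOperand(0);
  return Inner.getOpcode() == ISD::ADD &&
         Inner->getFlags().hasNoUnsignedWrap();
}
```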
I would argue that the job of `isFlatScratchBaseLegal()` does not change too much. Its job is still to prove that the base part of the address can be put in SGPR/VGPR, but with the additional context we can now do things better. So far, the extra checking that happens inside it is trivial. Maybe there should be a better name for it :)

I feel we might need more code to check for both ADDs in the case of SGPR+VGPR+Imm, because passing in two bases does not mean that the SGPR and VGPR are in two different ADDs. So I just left a TODO in case someone comes up with a simple implementation later.

I am not sure what the ideal refactoring would be for the three `select*` functions. I would really prefer that someone else help with that later as a separate NFC change.
I have added the check for the two ADDs and refined the code to be more reasonable. Please help take a look. Thanks!
Force-pushed from d676da0 to b6c8367.
✅ With the latest revision this PR passed the C/C++ code formatter.
ping

ping
I don't know. The whole `CheckBothOperands` thing seems fragile to me, as if something wasn't quite right yet. See also the inline comments.
// If the immediate offset is negative, the base address cannot also be
// negative. Although in theory we could get positive value by adding two
// negative values (like in the case: 0x80000010 + 0x80000020 = 0x30), but
// isLegalFLATOffset() will help filter out such values later.
This reliance on interactions between multiple methods seems fragile :(
But it may not be as fragile as you think. Do you see a situation where this might break? Or do you think adding a check here would be helpful, like checking that `Imm > -0x40000000`? That should be enough to be helpful: the base address would then have to be non-negative; otherwise, the resulting address would be either negative or so large that it falls outside the region a thread can access.
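A quick arithmetic check of the two claims above; the `-0x40000000` bound is the reviewer's proposal, restated here as an assumption rather than something the patch enforces:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Two values that are negative as int32_t can wrap to a small positive
  // sum, exactly the case the quoted comment defers to isLegalFLATOffset().
  uint32_t Base = 0x80000010;
  uint32_t Imm = 0x80000020;
  assert(Base + Imm == 0x30); // 32-bit unsigned wrap-around

  // The proposed guard: an immediate above -0x40000000 cannot combine with a
  // negative base to reach a small positive address; an Imm like this one
  // fails the guard and would be rejected.
  int32_t SImm = int32_t(Imm);
  assert(!(SImm > -0x40000000));
  return 0;
}
```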
Really wish we could get Alive2 or something to prove this works.
if (FlatVariant != SIInstrFlags::FlatScratch)
  return true;
Pre-existing issue, but I'd expect this case not to be called in the first place?
Yes, agreed. I did not think carefully about this. I think the caller can write something like `FlatVariant != FlatScratch || isFlatScratchBaseLegal(...)` (see the sketch below).
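A sketch of that hoisted form, with the surrounding selection logic elided; names follow the existing code, but the exact call site is illustrative:

```cpp
// Illustrative caller-side early-out: skip the base-legality check entirely
// unless we are actually selecting a flat-scratch access.
if (FlatVariant != SIInstrFlags::FlatScratch ||
    isFlatScratchBaseLegal(Addr, N0, FlatVariant)) {
  // ... proceed to fold the immediate offset ...
}
```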
// When value in 32-bit Base can be negative calculate scratch offset using
// 32-bit add instruction, otherwise use Base(unsigned) + offset.
return KB->signBitIsZero(Base);
auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
I think `auto` hurts here.
Agreed. But it is also weird to have a type name take almost one extra line :( I have changed it to `getDefIgnoringCopies()`, which matches the need here exactly (see the sketch below).
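For context, a small sketch contrasting the two GlobalISel helpers mentioned; the opcode check is illustrative, not quoted from the patch:

```cpp
// getDefSrcRegIgnoringCopies() returns a def/source-register pair (hence the
// long type name that made `auto` tempting); getDefIgnoringCopies() returns
// the defining MachineInstr directly, which is all that is needed here.
MachineInstr *AddrDef = getDefIgnoringCopies(Addr, *MRI);
if (AddrDef && AddrDef->getOpcode() == TargetOpcode::G_PTR_ADD) {
  // ... inspect the add's operands and flags ...
}
```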
For flat scratch load/store, our hardware only accepts non-negative values in SGPR/VGPR. Besides the case that we can prove from known bits, we can also prove that the value in `base` will be non-negative: 1.) when the address calculation has the NoUnsignedWrap property (either an OR instruction or an ADD with the NoUnsignedWrap flag); 2.) when the immediate offset is already negative and within a specific range.
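A small self-contained check of why an OR can be treated as a no-unsigned-wrap ADD, which is the "either an OR instruction or ADD with NoUnsignedWrap flag" case above: when the operands share no set bits, `a | b` equals `a + b` and the addition cannot carry, so it cannot wrap.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Base = 0x1000; // low bits known zero, e.g. an aligned address
  uint32_t Off = 0x4;     // fits entirely within those known-zero bits
  assert((Base & Off) == 0);          // operands are disjoint
  assert((Base | Off) == Base + Off); // so OR and ADD agree, with no carry
  return 0;
}
```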
Force-pushed from 5f67952 to 1e8d9dd.
The failure " | Flang :: Driver/ctofortran.f90" is unrelated. I will merge the change. |
Register Base, uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
// Return whether the operation has NoUnsignedWrap property.
bool isNoUnsignedWrap(MachineInstr *Addr) {
I added `static` to make this internal linkage and changed a `};` to `}` in commit 9535e01 (a small sketch follows).
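The shape of that cleanup, with the body reconstructed as a plausible assumption rather than quoted from the commit:

```cpp
// `static` gives the helper internal linkage; note a function definition
// ends with `}`, not `};`. The body below is a guess at the real one.
static bool isNoUnsignedWrap(MachineInstr *Addr) {
  return Addr->getFlag(MachineInstr::NoUWrap) ||
         Addr->getOpcode() == TargetOpcode::G_OR;
}
```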
…vm#78193): llvm#70634 has disabled use of potentially negative scratch offsets, but we can use it on GFX12.

Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
For scratch load/store, our hardware only accepts non-negative values in SGPR/VGPR. Besides the case that we can prove from known bits, we can also prove that the value in `base` will be non-negative: 1.) when the ADD for the address calculation has the NoUnsignedWrap flag; 2.) when the immediate offset is already negative.