AMDGPU: Return legal addressmode correctly for flat scratch #71494
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Ruiling, Song (ruiling)

Changes: Depends on #70634 for test changes.

Patch is 145.44 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/71494.diff

16 Files Affected:
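For readers outside the backend: flat scratch instructions treat the 32-bit address held in an SGPR/VGPR as unsigned, so folding part of an address into the immediate offset field is only safe when the register part is provably non-negative. A minimal standalone sketch of the failure mode (illustrative numbers, not code from this patch):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative model only: the 32-bit base register is read as unsigned and
// the signed immediate offset field is added to it.
static uint64_t scratchAddr(uint32_t BaseReg, int32_t ImmOffset) {
  return uint64_t(BaseReg) + ImmOffset;
}

int main() {
  // IR-level address: base = -4 plus offset +8, i.e. address 4.
  int32_t Base = -4;
  // Unsplit: a 32-bit add computes 4, a valid scratch address.
  assert(uint32_t(Base + 8) == 4);
  // Split into reg + imm: the register holds 0xFFFFFFFC, which the hardware
  // reads as 4294967292 rather than -4, so the access goes out of bounds.
  assert(scratchAddr(uint32_t(Base), 8) != 4);
  return 0;
}
```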
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index cd810f0b43e50db..3ec526b2094c0ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1151,13 +1151,58 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+// Check that the address value of a flat scratch load/store being put into
+// an SGPR/VGPR is legal with respect to the hardware's requirement that the
+// address in an SGPR/VGPR should be unsigned. When \p CheckTwoInstrs is set,
+// we check against the last two instructions which calculate \p FullAddr.
+// When \p CheckTwoOperands is set, we check both operands (in the case of
+// two instructions, these are the operands of the instruction before the
+// last).
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr,
+ bool CheckTwoInstrs,
+ bool CheckTwoOperands,
uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return CurDAG->SignBitIsZero(Base);
+
+ // Whether we can prove the operands are non-negative from the operation.
+ auto HasOnlyNonNegativeOperands = [](SDValue Addr) -> bool {
+ return (Addr.getOpcode() == ISD::ADD &&
+ Addr->getFlags().hasNoUnsignedWrap()) ||
+ Addr->getOpcode() == ISD::OR;
+ };
+
+ if (CheckTwoInstrs) {
+ auto PartAddr = FullAddr.getOperand(0);
+ // Make sure we are doing SGPR + VGPR + Imm.
+ assert(isa<ConstantSDNode>(FullAddr.getOperand(1)));
+ if (HasOnlyNonNegativeOperands(FullAddr) &&
+ HasOnlyNonNegativeOperands(PartAddr))
+ return true;
+
+ auto LHS = PartAddr.getOperand(0);
+ auto RHS = PartAddr.getOperand(1);
+ return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
+ }
+
+ // Single instruction case
+ if (HasOnlyNonNegativeOperands(FullAddr))
+ return true;
+
+ auto LHS = FullAddr.getOperand(0);
+ auto RHS = FullAddr.getOperand(1);
+ if (CheckTwoOperands)
+ return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
+
+ // If the immediate offset is negative, the base address cannot also be
+ // negative.
+ ConstantSDNode *ImmOp = nullptr;
+ if (FullAddr.getOpcode() == ISD::ADD &&
+ (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
+ if (ImmOp->getSExtValue() < 0)
+ return true;
+ }
+
+ return CurDAG->SignBitIsZero(LHS);
}
// TODO: If offset is too big, put low 16-bit into offset.
@@ -1554,7 +1599,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
- isFlatScratchBaseLegal(N0, FlatVariant)) {
+ isFlatScratchBaseLegal(Addr, false, false, FlatVariant)) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1786,8 +1831,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
int64_t COffsetVal = 0;
- if (CurDAG->isBaseWithConstantOffset(Addr) &&
- isFlatScratchBaseLegal(Addr.getOperand(0))) {
+ if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
@@ -1844,6 +1888,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
int64_t ImmOffset = 0;
SDValue LHS, RHS;
+ SDValue FullAddr = Addr;
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1865,7 +1910,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
@@ -1891,7 +1936,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(FullAddr, FullAddr != Addr, true))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
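The HasOnlyNonNegativeOperands check above is structural: for an `or`, each operand's set bits are a subset of the result's, and for an `add` carrying the no-unsigned-wrap flag neither operand can exceed the sum. Either way both parts are unsigned-less-or-equal to the full address, so if the full address is a legal unsigned scratch address its parts are too. A small self-contained illustration (values chosen arbitrarily):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Disjoint bit patterns: or and add agree, as in the DAG's add-as-or case.
  uint32_t A = 0x100, B = 0x7C;
  assert((A | B) == A + B);             // 0x17C either way
  // Each or-operand is unsigned-<= the result...
  assert(A <= (A | B) && B <= (A | B));
  // ...and for a non-wrapping add, each addend is unsigned-<= the sum.
  uint64_t Sum = uint64_t(A) + B;       // cannot wrap in 64 bits
  assert(A <= Sum && B <= Sum);
  return 0;
}
```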
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..08f393ab9fae8d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ SDValue FullAddr, bool CheckTwoInstrs = false,
+ bool CheckTwoOperands = false,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2cf60f338105b1e..a0808e032d13f90 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
@@ -4103,7 +4104,10 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
int64_t ConstOffset;
std::tie(PtrBase, ConstOffset) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
- if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
+
+ auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
+ if (ConstOffset == 0 ||
+ !isFlatScratchBaseLegal(*AddrDef->MI, nullptr, false, FlatVariant))
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4265,15 +4269,16 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// Match the immediate offset first, which canonically is moved as low as
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
- if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
+ if (ConstOffset != 0 && isFlatScratchBaseLegal(*AddrDef->MI) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
+ AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
}
- auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = AddrDef->MI->getOperand(1).getIndex();
return {{
@@ -4343,6 +4348,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ Register FullAddr = Addr;
if (ConstOffset != 0 &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
Addr = PtrBase;
@@ -4360,7 +4366,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register LHS = AddrDef->MI->getOperand(1).getReg();
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
- if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
+ auto FullAddrDef = getDefSrcRegIgnoringCopies(FullAddr, *MRI);
+ if (!isFlatScratchBaseLegal(*FullAddrDef->MI,
+ FullAddr != Addr ? AddrDef->MI : nullptr, true))
return std::nullopt;
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
@@ -4494,14 +4502,52 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
return KB->signBitIsZero(Base);
}
+// Check that the address value of a flat scratch load/store being put into
+// an SGPR/VGPR is legal with respect to the hardware's requirement that the
+// address in an SGPR/VGPR should be unsigned. When \p PartAddr is set, we
+// check against both instructions to be sure the addresses are non-negative.
+// When \p CheckTwoOperands is set, we check both operands (in the case of
+// two instructions, these are the operands of the instruction \p PartAddr).
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant) const {
+ MachineInstr &FullAddr, MachineInstr *PartAddr, bool CheckTwoOperands,
+ uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return KB->signBitIsZero(Base);
+ // Whether we can prove the operands are non-negative from the operation.
+ auto HasOnlyNonNegativeOperands = [](MachineInstr *Addr) -> bool {
+ return Addr->getOpcode() == TargetOpcode::G_OR ||
+ (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
+ Addr->getFlag(MachineInstr::NoUWrap));
+ };
+
+ if (PartAddr) {
+ if (HasOnlyNonNegativeOperands(&FullAddr) &&
+ HasOnlyNonNegativeOperands(PartAddr))
+ return true;
+ Register LHS = PartAddr->getOperand(1).getReg();
+ Register RHS = PartAddr->getOperand(2).getReg();
+ return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
+ }
+
+ // Single instruction case
+ if (HasOnlyNonNegativeOperands(&FullAddr))
+ return true;
+
+ Register LHS = FullAddr.getOperand(1).getReg();
+ Register RHS = FullAddr.getOperand(2).getReg();
+ if (CheckTwoOperands)
+ return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
+
+ if (FullAddr.getOpcode() == TargetOpcode::G_PTR_ADD) {
+ auto RhsValReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
+ // If the immediate offset is negative, the base address cannot also be
+ // negative.
+ if (RhsValReg && RhsValReg->Value.getSExtValue() < 0)
+ return true;
+ }
+
+ return KB->signBitIsZero(LHS);
}
bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
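The asymmetric single-operand case above rests on a small lemma: for a valid scratch access the full address Base + Imm is non-negative, so when Imm < 0 the base must satisfy Base >= -Imm > 0, and its sign bit is known zero without a known-bits query. A toy check of that arithmetic (the 1024-byte bound is an arbitrary assumption for the demo):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const int32_t Imm = -124; // a negative immediate offset
  // For every valid (non-negative) full address, the base recovered by
  // subtracting the negative immediate is strictly positive.
  for (int32_t Full = 0; Full < 1024; Full += 4) {
    int32_t Base = Full - Imm; // Base + Imm == Full
    assert(Base > 0);
  }
  return 0;
}
```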
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..53e5fb995fc041e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -244,7 +244,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ MachineInstr &FullAddr, MachineInstr *PartAddr = nullptr,
+ bool CheckTwoOperands = false,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5c46d81f57af6a9..8be3d0460af4e96 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1357,6 +1357,13 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
return isLegalMUBUFAddressingMode(AM);
}
+bool SITargetLowering::isLegalFlatScratchAddressingMode(
+ const AddrMode &AM) const {
+ return AM.Scale == 0 &&
+ Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
+}
+
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -1448,7 +1455,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return isLegalMUBUFAddressingMode(AM);
+ return Subtarget->enableFlatScratch() ? isLegalFlatScratchAddressingMode(AM)
+ : isLegalMUBUFAddressingMode(AM);
if (AS == AMDGPUAS::LOCAL_ADDRESS ||
(AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
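With flat scratch enabled, private-address-space addressing is effectively reg + signed-imm only, so the new hook rejects any scaled-index mode and any offset outside the FLAT scratch immediate range (the exact range depends on the subtarget). A sketch of the rule a middle-end client such as LSR effectively sees; the struct is a stand-in for TargetLowering::AddrMode and the range is passed as parameters since the real bound is subtarget-specific (all names here are illustrative):

```cpp
#include <cstdint>

// Illustrative stand-in for TargetLowering::AddrMode (same field names).
struct AddrMode {
  int64_t BaseOffs = 0; // constant offset
  bool HasBaseReg = false;
  int64_t Scale = 0;    // scaled-index register factor
};

// Mirrors the patch's rule: no scaled index, and the immediate must fit the
// FLAT scratch offset range, which varies by subtarget (hence parameters).
static bool isLegalFlatScratchAddrModeSketch(const AddrMode &AM,
                                             int64_t MinImm, int64_t MaxImm) {
  return AM.Scale == 0 && AM.BaseOffs >= MinImm && AM.BaseOffs <= MaxImm;
}

int main() {
  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = 124;
  // E.g. with a signed 13-bit offset field this mode would be accepted.
  return isLegalFlatScratchAddrModeSketch(AM, -4096, 4095) ? 0 : 1;
}
```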
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 746a88c5ea13a30..90a67853e8011fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -222,6 +222,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalFlatScratchAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 45df3bc094f351e..ec2cd43e5fb5df3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -89,17 +89,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add_u32_e32 v1, 4, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 4
-; GFX9-NEXT: scratch_store_dword v1, v3, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: scratch_store_dword v1, v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -111,42 +109,39 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT: v_add3_u32 v1, 4, v1, v2
-; GFX10-NEXT: scratch_store_dword v0, v3, off
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: v_mov_b32_e32 v3, 15
; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT: v_mov_b32_e32 v2, 4
-; GFX940-NEXT: scratch_store_dword v1, v3, off offset:4 sc0 sc1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:4 dlc
+; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add3_u32 v1, 4, v1, v2
-; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
bb:
@@ -233,34 +228,31 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT: scratch_store_dword v0, v1, off
+; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT: scratch_store_dword v0, v1, off
+; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_ptr_foo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-NEXT: scratch_store_b32 v0, v1, off
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -366,16 +358,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x104
-; GFX9-NEXT: scratch_store_dword v1, v3, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: scratch_store_dword v1, v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NE...
[truncated]
LGTM.
@@ -1357,6 +1357,13 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
   return isLegalMUBUFAddressingMode(AM);
 }

+bool SITargetLowering::isLegalFlatScratchAddressingMode(
As an alternative, you could change isLegalFlatAddressingMode to take AS and Variant as arguments.
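A plausible shape for that merged hook (an assumption for illustration; the final commit is not shown in this excerpt) would thread the address space and variant through to isLegalFLATOffset, whose three-argument form already appears in the diff above:

```cpp
// Hypothetical merged form (assumption, not the actual commit); relies on the
// surrounding LLVM context (SITargetLowering, Subtarget) like the diff above.
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace,
                                                 uint64_t FlatVariant) const {
  return AM.Scale == 0 &&
         Subtarget->getInstrInfo()->isLegalFLATOffset(AM.BaseOffs, AddrSpace,
                                                      FlatVariant);
}
```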
Done in the latest commit.
LGTM, thanks.
Force-pushed from f07398a to e5fe483.
Force-pushed from e5fe483 to a81838e.