AMDGPU: Return legal addressmode correctly for flat scratch #71494
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Ruiling, Song (ruiling)

Changes: Depends on #70634 for test changes.

Patch is 145.44 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/71494.diff

16 Files Affected:
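For readers outside the backend: flat scratch instructions treat the 32-bit address held in an SGPR/VGPR as unsigned, so folding part of an address into the immediate offset field is only safe when the register part is provably non-negative. A minimal standalone sketch of the failure mode (illustrative numbers, not code from this patch):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative model only: the 32-bit base register is read as unsigned and
// the signed immediate offset field is added to it.
static uint64_t scratchAddr(uint32_t BaseReg, int32_t ImmOffset) {
  return uint64_t(BaseReg) + ImmOffset;
}

int main() {
  // IR-level address: base = -4 plus offset +8, i.e. address 4.
  int32_t Base = -4;
  // Unsplit: a 32-bit add computes 4, a valid scratch address.
  assert(uint32_t(Base + 8) == 4);
  // Split into reg + imm: the register holds 0xFFFFFFFC, which the hardware
  // reads as 4294967292 rather than -4, so the access goes out of bounds.
  assert(scratchAddr(uint32_t(Base), 8) != 4);
  return 0;
}
```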
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index cd810f0b43e50db..3ec526b2094c0ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1151,13 +1151,58 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+// Check that the address value of a flat scratch load/store being put into
+// an SGPR/VGPR is legal with respect to the hardware's requirement that the
+// address in an SGPR/VGPR should be unsigned. When \p CheckTwoInstrs is set,
+// we check against the last two instructions which calculate \p FullAddr.
+// When \p CheckTwoOperands is set, we check both operands (in the case of
+// two instructions, these are the operands of the instruction before the
+// last).
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr,
+ bool CheckTwoInstrs,
+ bool CheckTwoOperands,
uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return CurDAG->SignBitIsZero(Base);
+
+ // Whether we can prove the operands are non-negative from the operation.
+ auto HasOnlyNonNegativeOperands = [](SDValue Addr) -> bool {
+ return (Addr.getOpcode() == ISD::ADD &&
+ Addr->getFlags().hasNoUnsignedWrap()) ||
+ Addr->getOpcode() == ISD::OR;
+ };
+
+ if (CheckTwoInstrs) {
+ auto PartAddr = FullAddr.getOperand(0);
+ // Make sure we are doing SGPR + VGPR + Imm.
+ assert(isa<ConstantSDNode>(FullAddr.getOperand(1)));
+ if (HasOnlyNonNegativeOperands(FullAddr) &&
+ HasOnlyNonNegativeOperands(PartAddr))
+ return true;
+
+ auto LHS = PartAddr.getOperand(0);
+ auto RHS = PartAddr.getOperand(1);
+ return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
+ }
+
+ // Single instruction case
+ if (HasOnlyNonNegativeOperands(FullAddr))
+ return true;
+
+ auto LHS = FullAddr.getOperand(0);
+ auto RHS = FullAddr.getOperand(1);
+ if (CheckTwoOperands)
+ return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
+
+ // If the immediate offset is negative, the base address cannot also be
+ // negative.
+ ConstantSDNode *ImmOp = nullptr;
+ if (FullAddr.getOpcode() == ISD::ADD &&
+ (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
+ if (ImmOp->getSExtValue() < 0)
+ return true;
+ }
+
+ return CurDAG->SignBitIsZero(LHS);
}
// TODO: If offset is too big, put low 16-bit into offset.
@@ -1554,7 +1599,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
- isFlatScratchBaseLegal(N0, FlatVariant)) {
+ isFlatScratchBaseLegal(Addr, false, false, FlatVariant)) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1786,8 +1831,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
int64_t COffsetVal = 0;
- if (CurDAG->isBaseWithConstantOffset(Addr) &&
- isFlatScratchBaseLegal(Addr.getOperand(0))) {
+ if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
@@ -1844,6 +1888,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
int64_t ImmOffset = 0;
SDValue LHS, RHS;
+ SDValue FullAddr = Addr;
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1865,7 +1910,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
@@ -1891,7 +1936,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(FullAddr, FullAddr != Addr, true))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
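The HasOnlyNonNegativeOperands check above is structural: for an `or`, each operand's set bits are a subset of the result's, and for an `add` carrying the no-unsigned-wrap flag neither operand can exceed the sum. Either way both parts are unsigned-less-or-equal to the full address, so if the full address is a legal unsigned scratch address its parts are too. A small self-contained illustration (values chosen arbitrarily):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Disjoint bit patterns: or and add agree, as in the DAG's add-as-or case.
  uint32_t A = 0x100, B = 0x7C;
  assert((A | B) == A + B);             // 0x17C either way
  // Each or-operand is unsigned-<= the result...
  assert(A <= (A | B) && B <= (A | B));
  // ...and for a non-wrapping add, each addend is unsigned-<= the sum.
  uint64_t Sum = uint64_t(A) + B;       // cannot wrap in 64 bits
  assert(A <= Sum && B <= Sum);
  return 0;
}
```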
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..08f393ab9fae8d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ SDValue FullAddr, bool CheckTwoInstrs = false,
+ bool CheckTwoOperands = false,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2cf60f338105b1e..a0808e032d13f90 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
@@ -4103,7 +4104,10 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
int64_t ConstOffset;
std::tie(PtrBase, ConstOffset) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
- if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
+
+ auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
+ if (ConstOffset == 0 ||
+ !isFlatScratchBaseLegal(*AddrDef->MI, nullptr, false, FlatVariant))
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4265,15 +4269,16 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// Match the immediate offset first, which canonically is moved as low as
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
- if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
+ if (ConstOffset != 0 && isFlatScratchBaseLegal(*AddrDef->MI) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
+ AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
}
- auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = AddrDef->MI->getOperand(1).getIndex();
return {{
@@ -4343,6 +4348,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ Register FullAddr = Addr;
if (ConstOffset != 0 &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
Addr = PtrBase;
@@ -4360,7 +4366,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register LHS = AddrDef->MI->getOperand(1).getReg();
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
- if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
+ auto FullAddrDef = getDefSrcRegIgnoringCopies(FullAddr, *MRI);
+ if (!isFlatScratchBaseLegal(*FullAddrDef->MI,
+ FullAddr != Addr ? AddrDef->MI : nullptr, true))
return std::nullopt;
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
@@ -4494,14 +4502,52 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
return KB->signBitIsZero(Base);
}
+// Check that the address value of a flat scratch load/store being put into
+// an SGPR/VGPR is legal with respect to the hardware's requirement that the
+// address in an SGPR/VGPR should be unsigned. When \p PartAddr is set, we
+// check against both instructions to be sure the addresses are non-negative.
+// When \p CheckTwoOperands is set, we check both operands (in the case of
+// two instructions, these are the operands of the instruction \p PartAddr).
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant) const {
+ MachineInstr &FullAddr, MachineInstr *PartAddr, bool CheckTwoOperands,
+ uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return KB->signBitIsZero(Base);
+ // Whether we can prove the operands are non-negative from the operation.
+ auto HasOnlyNonNegativeOperands = [](MachineInstr *Addr) -> bool {
+ return Addr->getOpcode() == TargetOpcode::G_OR ||
+ (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
+ Addr->getFlag(MachineInstr::NoUWrap));
+ };
+
+ if (PartAddr) {
+ if (HasOnlyNonNegativeOperands(&FullAddr) &&
+ HasOnlyNonNegativeOperands(PartAddr))
+ return true;
+ Register LHS = PartAddr->getOperand(1).getReg();
+ Register RHS = PartAddr->getOperand(2).getReg();
+ return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
+ }
+
+ // Single instruction case
+ if (HasOnlyNonNegativeOperands(&FullAddr))
+ return true;
+
+ Register LHS = FullAddr.getOperand(1).getReg();
+ Register RHS = FullAddr.getOperand(2).getReg();
+ if (CheckTwoOperands)
+ return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
+
+ if (FullAddr.getOpcode() == TargetOpcode::G_PTR_ADD) {
+ auto RhsValReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
+ // If the immediate offset is negative, the base address cannot also be
+ // negative.
+ if (RhsValReg && RhsValReg->Value.getSExtValue() < 0)
+ return true;
+ }
+
+ return KB->signBitIsZero(LHS);
}
bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
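The asymmetric single-operand case above rests on a small lemma: for a valid scratch access the full address Base + Imm is non-negative, so when Imm < 0 the base must satisfy Base >= -Imm > 0, and its sign bit is known zero without a known-bits query. A toy check of that arithmetic (the 1024-byte bound is an arbitrary assumption for the demo):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const int32_t Imm = -124; // a negative immediate offset
  // For every valid (non-negative) full address, the base recovered by
  // subtracting the negative immediate is strictly positive.
  for (int32_t Full = 0; Full < 1024; Full += 4) {
    int32_t Base = Full - Imm; // Base + Imm == Full
    assert(Base > 0);
  }
  return 0;
}
```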
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..53e5fb995fc041e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -244,7 +244,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ MachineInstr &FullAddr, MachineInstr *PartAddr = nullptr,
+ bool CheckTwoOperands = false,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5c46d81f57af6a9..8be3d0460af4e96 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1357,6 +1357,13 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
return isLegalMUBUFAddressingMode(AM);
}
+bool SITargetLowering::isLegalFlatScratchAddressingMode(
+ const AddrMode &AM) const {
+ return AM.Scale == 0 &&
+ Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
+}
+
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -1448,7 +1455,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return isLegalMUBUFAddressingMode(AM);
+ return Subtarget->enableFlatScratch() ? isLegalFlatScratchAddressingMode(AM)
+ : isLegalMUBUFAddressingMode(AM);
if (AS == AMDGPUAS::LOCAL_ADDRESS ||
(AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
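With flat scratch enabled, private-address-space addressing is effectively reg + signed-imm only, so the new hook rejects any scaled-index mode and any offset outside the FLAT scratch immediate range (the exact range depends on the subtarget). A sketch of the rule a middle-end client such as LSR effectively sees; the struct is a stand-in for TargetLowering::AddrMode and the range is passed as parameters since the real bound is subtarget-specific (all names here are illustrative):

```cpp
#include <cstdint>

// Illustrative stand-in for TargetLowering::AddrMode (same field names).
struct AddrMode {
  int64_t BaseOffs = 0; // constant offset
  bool HasBaseReg = false;
  int64_t Scale = 0;    // scaled-index register factor
};

// Mirrors the patch's rule: no scaled index, and the immediate must fit the
// FLAT scratch offset range, which varies by subtarget (hence parameters).
static bool isLegalFlatScratchAddrModeSketch(const AddrMode &AM,
                                             int64_t MinImm, int64_t MaxImm) {
  return AM.Scale == 0 && AM.BaseOffs >= MinImm && AM.BaseOffs <= MaxImm;
}

int main() {
  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = 124;
  // E.g. with a signed 13-bit offset field this mode would be accepted.
  return isLegalFlatScratchAddrModeSketch(AM, -4096, 4095) ? 0 : 1;
}
```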
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 746a88c5ea13a30..90a67853e8011fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -222,6 +222,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalFlatScratchAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 45df3bc094f351e..ec2cd43e5fb5df3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -89,17 +89,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add_u32_e32 v1, 4, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 4
-; GFX9-NEXT: scratch_store_dword v1, v3, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: scratch_store_dword v1, v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -111,42 +109,39 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT: v_add3_u32 v1, 4, v1, v2
-; GFX10-NEXT: scratch_store_dword v0, v3, off
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: v_mov_b32_e32 v3, 15
; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT: v_mov_b32_e32 v2, 4
-; GFX940-NEXT: scratch_store_dword v1, v3, off offset:4 sc0 sc1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:4 dlc
+; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add3_u32 v1, 4, v1, v2
-; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
bb:
@@ -233,34 +228,31 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT: scratch_store_dword v0, v1, off
+; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT: scratch_store_dword v0, v1, off
+; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_ptr_foo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-NEXT: scratch_store_b32 v0, v1, off
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -366,16 +358,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x104
-; GFX9-NEXT: scratch_store_dword v1, v3, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: scratch_store_dword v1, v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NE...
[truncated]
LGTM.
@@ -1357,6 +1357,13 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
   return isLegalMUBUFAddressingMode(AM);
 }

+bool SITargetLowering::isLegalFlatScratchAddressingMode(
As an alternative, you could change isLegalFlatAddressingMode to take AS and Variant as arguments.
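A plausible shape for that merged hook (an assumption for illustration; the final commit is not shown in this excerpt) would thread the address space and variant through to isLegalFLATOffset, whose three-argument form already appears in the diff above:

```cpp
// Hypothetical merged form (assumption, not the actual commit); relies on the
// surrounding LLVM context (SITargetLowering, Subtarget) like the diff above.
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace,
                                                 uint64_t FlatVariant) const {
  return AM.Scale == 0 &&
         Subtarget->getInstrInfo()->isLegalFLATOffset(AM.BaseOffs, AddrSpace,
                                                      FlatVariant);
}
```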
Done in the latest commit.
LGTM, thanks.
Force-pushed from f07398a to e5fe483.
Force-pushed from e5fe483 to a81838e.