Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Folding imm offset in more cases for scratch access #70634

Merged
merged 1 commit into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 69 additions & 12 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1152,13 +1152,64 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
// Return whether the operation has NoUnsignedWrap property.
//
// \p Addr qualifies when it is an ISD::ADD carrying the no-unsigned-wrap flag,
// or when it is an ISD::OR (accepted without inspecting any flag).
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr.getOpcode() == ISD::OR;
}

// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
//
// \p Addr is the whole `base + offset` node. When the no-wrap shortcut does not
// apply, operands 0 and 1 of \p Addr are read, so it must have two operands.
// Returns true when the base (operand 0) may be used as-is.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  // An add flagged as non-wrapping (or an or) needs no further proof.
  if (isNoUnsignedWrap(Addr))
    return true;

  SDValue LHS = Addr.getOperand(0);
  SDValue RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (Addr.getOpcode() == ISD::ADD) {
    if (const auto *ImmOp = dyn_cast<ConstantSDNode>(RHS)) {
      const int64_t Imm = ImmOp->getSExtValue();
      if (Imm < 0 && Imm > -0x40000000)
        return true;
    }
  }

  // Otherwise the base itself must be provably non-negative.
  return CurDAG->SignBitIsZero(LHS);
}

// Legality check for a flat scratch address of the shape SGPR + VGPR.
//
// \p Addr is the node combining the two registers. The address is accepted
// when that node is known not to wrap in the unsigned sense, or when both of
// its operands have a known-zero sign bit.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (!isNoUnsignedWrap(Addr)) {
    const SDValue Op0 = Addr.getOperand(0);
    const SDValue Op1 = Addr.getOperand(1);

    // Both addends have to be provably non-negative.
    if (!CurDAG->SignBitIsZero(Op1))
      return false;
    return CurDAG->SignBitIsZero(Op0);
  }

  return true;
}

// Legality check for a flat scratch address of the shape SGPR + VGPR + Imm.
//
// \p Addr is the outer `(SGPR + VGPR) + Imm` node: operand 0 is the inner
// register sum and operand 1 must be a ConstantSDNode (enforced by cast<>).
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  const SDValue Inner = Addr.getOperand(0);
  auto *OffsetNode = cast<ConstantSDNode>(Addr.getOperand(1));

  if (isNoUnsignedWrap(Inner)) {
    // Neither the inner sum nor the outer add wraps.
    if (isNoUnsignedWrap(Addr))
      return true;

    // A negative immediate within this range implies the base is not
    // negative: were the base negative too, the sum would be either negative
    // or much larger than the valid range of scratch memory a thread can
    // access.
    const int64_t Imm = OffsetNode->getSExtValue();
    if (Imm < 0 && Imm > -0x40000000)
      return true;
  }

  // Fall back to requiring both register addends to be non-negative.
  const SDValue Op0 = Inner.getOperand(0);
  const SDValue Op1 = Inner.getOperand(1);
  if (!CurDAG->SignBitIsZero(Op1))
    return false;
  return CurDAG->SignBitIsZero(Op0);
}

// TODO: If offset is too big, put low 16-bit into offset.
Expand Down Expand Up @@ -1555,7 +1606,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
isFlatScratchBaseLegal(N0, FlatVariant)) {
(FlatVariant != SIInstrFlags::FlatScratch ||
isFlatScratchBaseLegal(Addr))) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

const SIInstrInfo *TII = Subtarget->getInstrInfo();
Expand Down Expand Up @@ -1787,8 +1839,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,

int64_t COffsetVal = 0;

if (CurDAG->isBaseWithConstantOffset(Addr) &&
isFlatScratchBaseLegal(Addr.getOperand(0))) {
if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
Expand Down Expand Up @@ -1845,6 +1896,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
int64_t ImmOffset = 0;

SDValue LHS, RHS;
SDValue OrigAddr = Addr;
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
Expand All @@ -1866,7 +1918,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
ruiling marked this conversation as resolved.
Show resolved Hide resolved
SAddr = LHS;
if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
if (!isFlatScratchBaseLegal(Addr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
Expand All @@ -1892,8 +1944,13 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}

if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
return false;
if (OrigAddr != Addr) {
if (!isFlatScratchBaseLegalSVImm(OrigAddr))
return false;
} else {
if (!isFlatScratchBaseLegalSV(OrigAddr))
return false;
}

if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
return false;
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;

bool isFlatScratchBaseLegal(SDValue Addr) const;
bool isFlatScratchBaseLegalSV(SDValue Addr) const;
bool isFlatScratchBaseLegalSVImm(SDValue Addr) const;

bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
Expand Down
92 changes: 82 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4103,7 +4103,9 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
int64_t ConstOffset;
std::tie(PtrBase, ConstOffset) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))

if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
!isFlatScratchBaseLegal(Root.getReg())))
return Default;

unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
Expand Down Expand Up @@ -4266,7 +4268,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
Expand Down Expand Up @@ -4343,6 +4345,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

Register OrigAddr = Addr;
if (ConstOffset != 0 &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
Addr = PtrBase;
Expand All @@ -4360,8 +4363,13 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register LHS = AddrDef->MI->getOperand(1).getReg();
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);

if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
return std::nullopt;
if (OrigAddr != Addr) {
if (!isFlatScratchBaseLegalSVImm(OrigAddr))
return std::nullopt;
} else {
if (!isFlatScratchBaseLegalSV(OrigAddr))
return std::nullopt;
}

if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
return std::nullopt;
Expand Down Expand Up @@ -4494,14 +4502,78 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
return KB->signBitIsZero(Base);
}

bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
Register Base, uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
// Return whether the operation has NoUnsignedWrap property.
//
// \p Addr qualifies when it is a G_OR, or a G_PTR_ADD carrying the NoUWrap
// flag. File-local helper: kept static so it has internal linkage.
static bool isNoUnsignedWrap(MachineInstr *Addr) {
  return Addr->getOpcode() == TargetOpcode::G_OR ||
         (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
          Addr->getFlag(MachineInstr::NoUWrap));
}

// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
//
// \p Addr is the register holding the whole `base + offset` value; its
// defining instruction is looked up ignoring copies. Returns true when the
// base (operand 1 of that instruction) may be used as-is.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);

  // A G_PTR_ADD flagged as non-wrapping (or a G_OR) needs no further proof.
  if (isNoUnsignedWrap(AddrMI))
    return true;

  Register LHS = AddrMI->getOperand(1).getReg();
  Register RHS = AddrMI->getOperand(2).getReg();

  if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
    std::optional<ValueAndVReg> RhsValReg =
        getIConstantVRegValWithLookThrough(RHS, *MRI);
    // If the immediate offset is negative and within certain range, the base
    // address cannot also be negative. If the base is also negative, the sum
    // would be either negative or much larger than the valid range of scratch
    // memory a thread can access.
    if (RhsValReg) {
      const int64_t Imm = RhsValReg->Value.getSExtValue();
      if (Imm < 0 && Imm > -0x40000000)
        return true;
    }
  }

  // Otherwise the base itself must be provably non-negative.
  return KB->signBitIsZero(LHS);
}

// Legality check for a flat scratch address of the shape SGPR + VGPR.
//
// \p Addr is the register holding the sum; its defining instruction is found
// ignoring copies. The address is accepted when that instruction is known not
// to wrap in the unsigned sense, or when both of its source registers have a
// known-zero sign bit.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
  MachineInstr *DefMI = getDefIgnoringCopies(Addr, *MRI);

  if (!isNoUnsignedWrap(DefMI)) {
    Register Op0 = DefMI->getOperand(1).getReg();
    Register Op1 = DefMI->getOperand(2).getReg();

    // Both addends have to be provably non-negative.
    if (!KB->signBitIsZero(Op1))
      return false;
    return KB->signBitIsZero(Op0);
  }

  return true;
}

// Legality check for a flat scratch address of the shape SGPR + VGPR + Imm.
//
// \p Addr is the register holding the outer `(SGPR + VGPR) + Imm` value.
// Operand 1 of its defining instruction is the inner register sum; operand 2
// is expected to resolve to an integer constant (asserted below).
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
    Register Addr) const {
  MachineInstr *OuterMI = getDefIgnoringCopies(Addr, *MRI);
  Register InnerReg = OuterMI->getOperand(1).getReg();
  std::optional<DefinitionAndSourceRegister> InnerDef =
      getDefSrcRegIgnoringCopies(InnerReg, *MRI);
  std::optional<ValueAndVReg> RHSOffset =
      getIConstantVRegValWithLookThrough(OuterMI->getOperand(2).getReg(), *MRI);
  assert(RHSOffset);

  MachineInstr *InnerMI = InnerDef->MI;
  if (isNoUnsignedWrap(InnerMI)) {
    // Neither the inner sum nor the outer add wraps.
    if (isNoUnsignedWrap(OuterMI))
      return true;

    // A negative immediate within this range implies the base is not
    // negative: were the base negative too, the sum would be either negative
    // or much larger than the valid range of scratch memory a thread can
    // access.
    const int64_t Imm = RHSOffset->Value.getSExtValue();
    if (Imm < 0 && Imm > -0x40000000)
      return true;
  }

  // Fall back to requiring both register addends to be non-negative.
  Register Op0 = InnerMI->getOperand(1).getReg();
  Register Op1 = InnerMI->getOperand(2).getReg();
  if (!KB->signBitIsZero(Op1))
    return false;
  return KB->signBitIsZero(Op0);
}

bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool isDSOffsetLegal(Register Base, int64_t Offset) const;
bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
bool isFlatScratchBaseLegal(Register Addr) const;
bool isFlatScratchBaseLegalSV(Register Addr) const;
bool isFlatScratchBaseLegalSVImm(Register Addr) const;

std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
Expand Down
Loading