[AMDGPU] Optimize rotate instruction selection patterns #143551

Open · wants to merge 2 commits into main

4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -95,6 +95,10 @@ def gi_vinterpmods_hi :
GIComplexOperandMatcher<s32, "selectVINTERPModsHi">,
GIComplexPatternEquiv<VINTERPModsHi>;

def gi_immsub :
GIComplexOperandMatcher<s32, "selectImmSub">,
GIComplexPatternEquiv<ImmSub>;

// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
def gi_vop3opsel :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3610,6 +3610,26 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
return true;
}

bool AMDGPUDAGToDAGISel::SelectImmSub(SDValue In, SDValue &Src,
                                      SDValue &InvSrc) const {
  Src = In;

  // For a constant shift amount, fold (32 - amount) into an immediate.
  if (ConstantSDNode *ImmVal = dyn_cast<ConstantSDNode>(In)) {
    InvSrc = CurDAG->getTargetConstant(32 - ImmVal->getZExtValue(), SDLoc(In),
                                       MVT::i32);
  } else {
    // Otherwise materialize (32 - amount) with an explicit S_SUB_U32.
    SDNode *Sub = CurDAG->getMachineNode(
        AMDGPU::S_SUB_U32, SDLoc(In), MVT::i32,
        {CurDAG->getTargetConstant(32, SDLoc(In), MVT::i32), In});
    InvSrc = SDValue(Sub, 0);
  }

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
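Note on the ImmSub helper above: selecting a rotate by a uniform amount needs both the amount and 32 minus the amount. A minimal C++ sketch of the identity being exploited (illustrative only; `rotr32` is a hypothetical helper, not part of the patch):

```cpp
#include <cstdint>

// rotr(x, s) == (x >> s) | (x << (32 - s)) for s in [1, 31]. SelectImmSub
// produces the (32 - s) operand (folded when s is a constant, otherwise via
// S_SUB_U32) so the SALU pattern can emit S_LSHR_B32 + S_LSHL_B32 + S_OR_B32.
uint32_t rotr32(uint32_t X, uint32_t S) {
  S &= 31;
  return S == 0 ? X : (X >> S) | (X << (32 - S));
}
```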
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -246,6 +246,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;

bool SelectImmSub(SDValue In, SDValue &Src, SDValue &InvSrc) const;

SDValue getHi16Elt(SDValue In) const;

SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
25 changes: 25 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5831,6 +5831,31 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectImmSub(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  Register SrcInv = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // For a constant shift amount, materialize (32 - amount) directly with
  // S_MOV_B32; otherwise compute it with an explicit S_SUB_U32.
  std::optional<uint64_t> Val = getConstantZext32Val(Root.getReg(), *MRI);
  if (!Val) {
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U32), SrcInv)
        .addImm(32)
        .add(Root);
  } else {
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SrcInv)
        .addImm(32 - *Val);
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SrcInv); },
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -224,6 +224,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns
selectImmSub(MachineOperand &Root) const;

bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
int64_t *Offset) const;
InstructionSelector::ComplexRendererFns
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2046,9 +2046,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();

getActionDefinitionsBuilder({G_ROTR, G_ROTL})
.scalarize(0)
.lower();
getActionDefinitionsBuilder(G_ROTR).legalFor({S32}).scalarize(0).lower();

getActionDefinitionsBuilder(G_ROTL).scalarize(0).lower();

// TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
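With this change G_ROTR becomes legal for s32 while G_ROTL is still lowered; the legalized MIR test below negates the amount and rotates right instead. A small C++ sketch of that rotl-to-rotr identity (illustrative, assumes 32-bit operands; `rotl32ViaRotr` is a hypothetical name):

```cpp
#include <cstdint>

// rotl(x, s) == rotr(x, (0 - s) & 31): negating the amount modulo 32 turns a
// left rotate into a right rotate, matching the G_SUB + G_ROTR sequence the
// lowering produces in the updated MIR test.
uint32_t rotl32ViaRotr(uint32_t X, uint32_t S) {
  uint32_t Neg = (0u - S) & 31u;
  return Neg == 0 ? X : (X >> Neg) | (X << (32 - Neg));
}
```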
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4073,6 +4073,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_ROTR:
case AMDGPU::G_ROTL: {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1630,6 +1630,8 @@ def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;

def ImmSub : ComplexPattern<untyped, 2, "SelectImmSub">;

//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2388,8 +2388,19 @@ def : AMDGPUPat <
$src1), sub1)
>;

// Uniform rotr: expand to SALU shifts plus OR, with ImmSub supplying (32 - shift).
def : AMDGPUPat <
(UniformBinFrag<rotr> i32:$src0, (i32 (ImmSub i32:$src1, i32:$src1_inv))),
(S_OR_B32 (S_LSHR_B32 i32:$src0, i32:$src1), (S_LSHL_B32 i32:$src0, i32:$src1_inv))
>;

let True16Predicate = NotHasTrue16BitInsts in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;

// Divergent rotr: select V_ALIGNBIT_B32 directly.
def : AMDGPUPat <
(DivergentBinFrag<rotr> i32:$src0, i32:$src1),
(V_ALIGNBIT_B32_e64 $src0, $src0, $src1)
>;

def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
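For a rough end-to-end view of what these patterns target, a hedged source-level example (the helper name and the use of the clang builtin are assumptions, not taken from the patch): a rotate whose amount is uniform across the wave should now select to the SALU S_LSHR_B32 + S_LSHL_B32 + S_OR_B32 sequence, while divergent rotates keep using V_ALIGNBIT_B32_e64.

```cpp
#include <cstdint>

// With clang, __builtin_rotateright32 lowers to llvm.fshr(x, x, s), which the
// backend treats as a 32-bit right rotate; under this patch a uniform amount
// is expected to take the SALU path and a divergent one the V_ALIGNBIT path.
uint32_t rotateRight(uint32_t Data, uint32_t Amt) {
  return __builtin_rotateright32(Data, Amt);
}
```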
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
@@ -181,8 +181,8 @@ body: |
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]]
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[SUB]](s32)
; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[SUB]](s32)
; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTL %0, %1(s32)
@@ -301,14 +301,14 @@
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]]
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[SUB]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[SUB]](s32)
; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]]
; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[SUB1]](s32)
; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[SUB1]](s32)
; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]]
; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[SUB2]](s32)
; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[SUB2]](s32)
; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]]
; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[SUB3]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[SUB3]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
@@ -391,8 +391,8 @@
; GFX-NEXT: {{ $}}
; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[COPY1]](s32)
; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[COPY1]](s32)
; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTR %0, %1(s32)
@@ -452,11 +452,11 @@
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[UV4]](s32)
; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[UV5]](s32)
; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[UV6]](s32)
; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[UV7]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[UV4]](s32)
; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[UV5]](s32)
; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[UV6]](s32)
; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[UV7]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7