[AMDGPU] Optimize rotate instruction selection patterns #143551

Conversation
This patch improves rotate instruction selection for AMDGPU by adding optimized patterns for the rotate right (rotr) operation. It now selects s_lshl + s_lshr + s_or (3 SALU instructions) instead of the previous v_alignbit + v_readfirstlane (2 VALU instructions).
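For reference, the underlying identity is the standard rotate-right expansion into two shifts and an OR, which is what the s_lshr_b32 + s_lshl_b32 + s_or_b32 sequence computes. Below is a minimal C++ sketch of that identity (illustration only, not code from the patch); the explicit masking and zero check stand in for the modulo-32 behaviour the hardware shift instructions provide implicitly:

#include <cstdint>

// rotr32(X, N) == (X >> N) | (X << (32 - N)) once N is reduced modulo 32;
// the N == 0 case is handled separately because a 32-bit shift by 32 is
// undefined in C++, while the hardware simply masks the shift amount.
static uint32_t rotr32(uint32_t X, uint32_t N) {
  N &= 31;
  if (N == 0)
    return X;
  return (X >> N) | (X << (32 - N));
}

For example, rotr32(0x80000001u, 1) yields 0xC0000000, matching what the selected SALU sequence would produce.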
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write permissions for the repository, in which case you can instead tag reviewers by name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment "Ping". The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.
@llvm/pr-subscribers-backend-amdgpu

Author: None (aleksandar-amd)

Changes

This patch improves rotate instruction selection for AMDGPU by adding optimized patterns for the rotate right (rotr) operation. It now selects s_lshl + s_lshr + s_or (3 SALU instructions) instead of the previous v_alignbit + v_readfirstlane (2 VALU instructions).

Patch is 41.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143551.diff

12 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc555..061764ff4d269 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -95,6 +95,10 @@ def gi_vinterpmods_hi :
GIComplexOperandMatcher<s32, "selectVINTERPModsHi">,
GIComplexPatternEquiv<VINTERPModsHi>;
+def gi_immsub :
+ GIComplexOperandMatcher<s32, "selectImmSub">,
+ GIComplexPatternEquiv<ImmSub>;
+
// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
def gi_vop3opsel :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e52c2d7fde436..22d3274d565f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3610,6 +3610,26 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectImmSub(SDValue In, SDValue &Src,
+ SDValue &InvSrc) const {
+ Src = In;
+
+ // Handle constant operands
+ ConstantSDNode *ImmVal = dyn_cast<ConstantSDNode>(In);
+ if (ImmVal)
+ InvSrc = CurDAG->getTargetConstant(32 - ImmVal->getZExtValue(), SDLoc(In),
+ MVT::i32);
+ else {
+ // Fallback: generate SUB instruction for non-constant, non-negation cases
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::S_SUB_U32, SDLoc(In), MVT::i32,
+ {CurDAG->getTargetConstant(32, SDLoc(In), MVT::i32), In});
+ InvSrc = SDValue(VMov, 0);
+ }
+
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f3b9364fdb92b..82586329a369c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -246,6 +246,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
+ bool SelectImmSub(SDValue In, SDValue &Src, SDValue &InvSrc) const;
+
SDValue getHi16Elt(SDValue In) const;
SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7e72f6ca478fd..b43e4c1093a16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5831,6 +5831,31 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectImmSub(MachineOperand &Root) const {
+
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register SrcInv = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ // Handle constant operands
+ std::optional<uint64_t> Val = getConstantZext32Val(Root.getReg(), *MRI);
+
+ if (!Val) {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U32), SrcInv)
+ .addImm(32)
+ .add(Root);
+ } else {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SrcInv)
+ .addImm(32 - *Val);
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SrcInv); },
+ }};
+}
+
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 6c3f3026e877a..6371b861ae55c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -224,6 +224,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectImmSub(MachineOperand &Root) const;
+
bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
int64_t *Offset) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e8dff85064383..6e74ea56b16ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2046,7 +2046,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();
- getActionDefinitionsBuilder({G_ROTR, G_ROTL})
+ getActionDefinitionsBuilder(G_ROTR)
+ .legalFor({S32})
+ .scalarize(0)
+ .lower();
+
+ getActionDefinitionsBuilder(G_ROTL)
.scalarize(0)
.lower();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..6242eefb15ad8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4073,6 +4073,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_ROTR:
+ case AMDGPU::G_ROTL: {
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2c20475726a48..56d2d99079acb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1630,6 +1630,8 @@ def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
+def ImmSub : ComplexPattern<untyped, 2, "SelectImmSub">;
+
//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 360fd05cb3d96..e9801f7ce6823 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2388,8 +2388,19 @@ def : AMDGPUPat <
$src1), sub1)
>;
+// rotr pattern
+def : AMDGPUPat <
+ (UniformBinFrag<rotr> i32:$src0, (i32 (ImmSub i32:$src1, i32:$src1_inv))),
+ (S_OR_B32 (S_LSHR_B32 i32:$src0, i32:$src1), (S_LSHL_B32 i32:$src0, i32:$src1_inv))
+>;
+
let True16Predicate = NotHasTrue16BitInsts in {
-def : ROTRPattern <V_ALIGNBIT_B32_e64>;
+
+// rotr pattern
+def : AMDGPUPat <
+ (DivergentBinFrag<rotr> i32:$src0, i32:$src1),
+ (V_ALIGNBIT_B32_e64 $src0, $src0, $src1)
+>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
index 7fdee12315754..9610caa1f2012 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
@@ -181,8 +181,8 @@ body: |
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]]
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[SUB]](s32)
- ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[SUB]](s32)
+ ; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTL %0, %1(s32)
@@ -301,14 +301,14 @@ body: |
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]]
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[SUB]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[SUB]](s32)
; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]]
- ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[SUB1]](s32)
+ ; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[SUB1]](s32)
; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]]
- ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[SUB2]](s32)
+ ; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[SUB2]](s32)
; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]]
- ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[SUB3]](s32)
- ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
+ ; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[SUB3]](s32)
+ ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
@@ -391,8 +391,8 @@ body: |
; GFX-NEXT: {{ $}}
; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[COPY1]](s32)
- ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[COPY1]](s32)
+ ; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTR %0, %1(s32)
@@ -452,11 +452,11 @@ body: |
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[UV4]](s32)
- ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[UV5]](s32)
- ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[UV6]](s32)
- ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[UV7]](s32)
- ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[UV4]](s32)
+ ; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[UV5]](s32)
+ ; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[UV6]](s32)
+ ; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[UV7]](s32)
+ ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 0a746b0a3f572..bd245c45025db 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -26,11 +26,14 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
+; SI-NEXT: s_sub_u32 s4, 32, s3
+; SI-NEXT: s_lshr_b32 s3, s2, s3
+; SI-NEXT: s_lshl_b32 s2, s2, s4
+; SI-NEXT: s_or_b32 s2, s3, s2
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -39,10 +42,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sub_i32 s3, 32, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0
+; GFX8-NEXT: s_sub_u32 s4, 32, s3
+; GFX8-NEXT: s_lshr_b32 s3, s2, s3
+; GFX8-NEXT: s_lshl_b32 s2, s2, s4
+; GFX8-NEXT: s_or_b32 s2, s3, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -52,18 +58,26 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_i32 s3, 32, s3
-; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT: s_sub_u32 s4, 32, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, s3
+; GFX10-NEXT: s_lshl_b32 s2, s2, s4
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotl_i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_u32 s4, 32, s3
+; GFX11-NEXT: s_lshr_b32 s3, s2, s3
+; GFX11-NEXT: s_lshl_b32 s2, s2, s4
+; GFX11-NEXT: s_or_b32 s2, s3, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -97,14 +111,20 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_sub_i32 s2, 32, s2
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: s_sub_i32 s3, 32, s3
+; SI-NEXT: s_sub_u32 s6, 32, s2
+; SI-NEXT: s_sub_u32 s8, 32, s3
+; SI-NEXT: s_lshr_b32 s3, s1, s3
+; SI-NEXT: s_lshr_b32 s2, s0, s2
+; SI-NEXT: s_lshl_b32 s1, s1, s8
+; SI-NEXT: s_lshl_b32 s0, s0, s6
+; SI-NEXT: s_or_b32 s1, s3, s1
+; SI-NEXT: s_or_b32 s0, s2, s0
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -115,11 +135,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sub_i32 s2, 32, s2
; GFX8-NEXT: s_sub_i32 s3, 32, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
+; GFX8-NEXT: s_sub_u32 s6, 32, s2
+; GFX8-NEXT: s_sub_u32 s7, 32, s3
+; GFX8-NEXT: s_lshr_b32 s3, s1, s3
+; GFX8-NEXT: s_lshl_b32 s1, s1, s7
+; GFX8-NEXT: s_lshr_b32 s2, s0, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s6
+; GFX8-NEXT: s_or_b32 s1, s3, s1
+; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -131,10 +157,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_i32 s3, 32, s3
; GFX10-NEXT: s_sub_i32 s2, 32, s2
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX10-NEXT: s_sub_i32 s3, 32, s3
+; GFX10-NEXT: s_sub_u32 s4, 32, s2
+; GFX10-NEXT: s_sub_u32 s5, 32, s3
+; GFX10-NEXT: s_lshr_b32 s3, s1, s3
+; GFX10-NEXT: s_lshr_b32 s2, s0, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_lshl_b32 s1, s1, s5
+; GFX10-NEXT: s_or_b32 s0, s2, s0
+; GFX10-NEXT: s_or_b32 s1, s3, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -143,12 +177,20 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s3, 32, s3
; GFX11-NEXT: s_sub_i32 s2, 32, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX11-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-NEXT: s_sub_u32 s6, 32, s2
+; GFX11-NEXT: s_sub_u32 s7, 32, s3
+; GFX11-NEXT: s_lshr_b32 s3, s1, s3
+; GFX11-NEXT: s_lshr_b32 s2, s0, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_lshl_b32 s1, s1, s7
+; GFX11-NEXT: s_or_b32 s0, s2, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
@@ -188,20 +230,32 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_i32 s4, 32, s12
-; SI-NEXT: s_sub_i32 s5, 32, s13
+; SI-NEXT: s_sub_i32 s2, 32, s12
+; SI-NEXT: s_sub_i32 s4, 32, s13
+; SI-NEXT: s_sub_i32 s5, 32, s14
; SI-NEXT: s_sub_i32 s6, 32, s15
-; SI-NEXT: s_sub_i32 s7, 32, s14
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; SI-NEXT: s_sub_u32 s7, 32, s2
+; SI-NEXT: s_sub_u32 s12, 32, s4
+; SI-NEXT: s_sub_u32 s13, 32, s5
+; SI-NEXT: s_sub_u32 s14, 32, s6
+; SI-NEXT: s_lshr_b32 s6, s11, s6
+; SI-NEXT: s_lshr_b32 s5, s10, s5
+; SI-NEXT: s_lshr_b32 s4, s9, s4
+; SI-NEXT: s_lshr_b32 s2, s8, s2
+; SI-NEXT: s_lshl_b32 s11, s11, s14
+; SI-NEXT: s_lshl_b32 s10, s10, s13
+; SI-NEXT: s_lshl_b32 s9, s9, s12
+; SI-NEXT: s_lshl_b32 s7, s8, s7
+; SI-NEXT: s_or_b32 s6, s6, s11
+; SI-NEXT: s_or_b32 s5, s5, s10
+; SI-NEXT: s_or_b32 s4, s4, s9
+; SI-NEXT: s_or_b32 s7, s2, s7
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s7
-; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEX...
[truncated]
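A note on the ImmSub complex pattern in the diff above: the idea is that when the rotate amount is a compile-time constant, the complementary shift amount 32 - amount can be folded to an immediate, and only non-constant amounts need a runtime s_sub_u32. Here is a hedged, standalone C++ sketch of that folding decision (the names ShiftAmounts and foldRotateAmount are illustrative and do not appear in the patch):

#include <cstdint>
#include <optional>

// The two scalar shift amounts that feed s_lshr_b32 and s_lshl_b32.
struct ShiftAmounts {
  uint32_t RightAmt; // original rotate amount
  uint32_t LeftAmt;  // complementary amount, 32 - RightAmt (mod 32)
};

// Returns the folded amounts for a constant rotate, or std::nullopt when
// the amount is not a constant and the selector would instead emit an
// s_sub_u32 of the form (32 - amount) at instruction-selection time.
static std::optional<ShiftAmounts>
foldRotateAmount(std::optional<uint32_t> ConstAmt) {
  if (!ConstAmt)
    return std::nullopt;
  uint32_t N = *ConstAmt & 31; // hardware shifts only use the low 5 bits
  return ShiftAmounts{N, (32u - N) & 31};
}

The uniform TableGen pattern then plugs the two amounts into S_LSHR_B32, S_LSHL_B32 and S_OR_B32.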
@llvm/pr-subscribers-llvm-globalisel

Author: None (aleksandar-amd)
✅ With the latest revision this PR passed the C/C++ code formatter.
I'm planning to land #143551 soon; I've made some small additions to rotr while adding v2i32 or support.