[RISCV] Add sink-and-fold support for RISC-V. #67602
Conversation
@llvm/pr-subscribers-backend-risc-v

Changes

This uses the recently introduced sink-and-fold support in MachineSink (https://reviews.llvm.org/D152828). This enables folding ADDI into load/store addresses. I quickly threw this together after seeing the AArch64 patches. This could be an alternative to #67024. We'd probably want to stage this as a commit to add the code and a commit to enable it. I've done no testing other than lit so far.

Patch is 56.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67602.diff

8 Files Affected:
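As a quick orientation before the diff: the fold rewrites a load or store whose address comes from a separate ADDI so that the memory instruction uses the ADDI's base register and a combined immediate. A minimal sketch of the idea, with made-up register names and offset (not taken from the patch):

# Before sink-and-fold: the address is materialized by a separate ADDI.
addi a1, a0, 16
lw   a2, 0(a1)

# After folding the ADDI's immediate into the load's displacement
# (the ADDI can then be removed if it has no other uses):
lw   a2, 16(a0)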
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 6ee5e2d4c584049..7b9f8b08002c077 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1907,6 +1907,77 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
+bool RISCVInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
+ Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const {
+ switch (MemI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::LB:
+ case RISCV::LBU:
+ case RISCV::LH:
+ case RISCV::LHU:
+ case RISCV::LW:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLH:
+ case RISCV::FLW:
+ case RISCV::FLD:
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::SD:
+ case RISCV::FSH:
+ case RISCV::FSW:
+ case RISCV::FSD:
+ break;
+ }
+
+ // Check the fold operand is not the loaded/stored value.
+ const MachineOperand &BaseRegOp = MemI.getOperand(0);
+ if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
+ return false;
+
+ if (AddrI.getOpcode() != RISCV::ADDI)
+ return false;
+
+ int64_t OldOffset = MemI.getOperand(2).getImm();
+ int64_t Disp = AddrI.getOperand(2).getImm();
+ int64_t NewOffset = OldOffset + Disp;
+ if (!STI.is64Bit())
+ NewOffset = SignExtend64<32>(NewOffset);
+
+ if (!isInt<12>(NewOffset))
+ return false;
+
+ AM.BaseReg = AddrI.getOperand(1).getReg();
+ AM.ScaledReg = 0;
+ AM.Scale = 0;
+ AM.Displacement = NewOffset;
+ AM.Form = ExtAddrMode::Formula::Basic;
+ return true;
+}
+
+MachineInstr *RISCVInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const {
+
+ const DebugLoc &DL = MemI.getDebugLoc();
+ MachineBasicBlock &MBB = *MemI.getParent();
+
+ assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
+ "Addressing mode not supported for folding");
+
+ auto B = BuildMI(MBB, MemI, DL, get(MemI.getOpcode()))
+ .addReg(MemI.getOperand(0).getReg(),
+ MemI.mayLoad() ? RegState::Define : 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement)
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ return B.getInstr();
+}
+
// Return true if get the base operand, byte offset of an instruction and the
// memory width. Width is the size of memory that is being loaded/stored.
bool RISCVInstrInfo::getMemOperandWithOffsetWidth(
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 99c907a98121ae3..2f8ea749b96e3d1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -137,6 +137,13 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
+ bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const override;
+
+ MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const override;
+
bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
const MachineOperand *&BaseOp,
int64_t &Offset, unsigned &Width,
diff --git a/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp b/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
index 7c6a89b6036fa3c..9d7660ba9a4b103 100644
--- a/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
@@ -275,7 +275,7 @@ bool RISCVInitUndef::processBasicBlock(MachineFunction &MF,
Changed |= handleSubReg(MF, MI, DLD);
if (MI.isImplicitDef()) {
auto DstReg = MI.getOperand(0).getReg();
- if (isVectorRegClass(DstReg))
+ if (DstReg.isVirtual() && isVectorRegClass(DstReg))
Changed |= handleImplicitDef(MBB, I);
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 69a0569fccc4eca..cbf9d7f3d4872a7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -78,6 +78,11 @@ static cl::opt<bool> EnableRISCVDeadRegisterElimination(
" them with stores to x0"),
cl::init(true));
+static cl::opt<bool>
+ EnableSinkFold("riscv-enable-sink-fold",
+ cl::desc("Enable sinking and folding of instruction copies"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -242,7 +247,9 @@ namespace {
class RISCVPassConfig : public TargetPassConfig {
public:
RISCVPassConfig(RISCVTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ setEnableSinkAndFold(EnableSinkFold);
+ }
RISCVTargetMachine &getRISCVTargetMachine() const {
return getTM<RISCVTargetMachine>();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 480e5c2f8f2b8b6..e3b06af20e5ee14 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -12972,38 +12972,39 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) {
; RV64ZVE32F-NEXT: vmset.m v8
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
; RV64ZVE32F-NEXT: # implicit-def: $v8
-; RV64ZVE32F-NEXT: bnez zero, .LBB106_2
-; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
-; RV64ZVE32F-NEXT: .LBB106_2: # %else
-; RV64ZVE32F-NEXT: andi a3, a1, 2
-; RV64ZVE32F-NEXT: addi a2, a0, -512
-; RV64ZVE32F-NEXT: bnez a3, .LBB106_6
-; RV64ZVE32F-NEXT: # %bb.3: # %else2
-; RV64ZVE32F-NEXT: andi a3, a1, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB106_7
-; RV64ZVE32F-NEXT: .LBB106_4: # %else5
+; RV64ZVE32F-NEXT: beqz zero, .LBB106_5
+; RV64ZVE32F-NEXT: # %bb.1: # %else
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: bnez a2, .LBB106_6
+; RV64ZVE32F-NEXT: .LBB106_2: # %else2
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB106_7
+; RV64ZVE32F-NEXT: .LBB106_3: # %else5
; RV64ZVE32F-NEXT: andi a1, a1, 8
; RV64ZVE32F-NEXT: bnez a1, .LBB106_8
-; RV64ZVE32F-NEXT: .LBB106_5: # %else8
+; RV64ZVE32F-NEXT: .LBB106_4: # %else8
; RV64ZVE32F-NEXT: ret
+; RV64ZVE32F-NEXT: .LBB106_5: # %cond.load
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB106_2
; RV64ZVE32F-NEXT: .LBB106_6: # %cond.load1
-; RV64ZVE32F-NEXT: lw a3, 0(a2)
+; RV64ZVE32F-NEXT: lw a2, -512(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v9, a3
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: andi a3, a1, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB106_4
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: beqz a2, .LBB106_3
; RV64ZVE32F-NEXT: .LBB106_7: # %cond.load4
-; RV64ZVE32F-NEXT: lw a0, 0(a0)
+; RV64ZVE32F-NEXT: lw a2, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v9, a0
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
; RV64ZVE32F-NEXT: andi a1, a1, 8
-; RV64ZVE32F-NEXT: beqz a1, .LBB106_5
+; RV64ZVE32F-NEXT: beqz a1, .LBB106_4
; RV64ZVE32F-NEXT: .LBB106_8: # %cond.load7
-; RV64ZVE32F-NEXT: lw a0, 0(a2)
+; RV64ZVE32F-NEXT: lw a0, -512(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
@@ -13480,11 +13481,10 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 2
; RV64ZVE32F-NEXT: beqz a2, .LBB107_2
; RV64ZVE32F-NEXT: .LBB107_10: # %cond.load1
-; RV64ZVE32F-NEXT: addi a2, a0, 4
-; RV64ZVE32F-NEXT: lbu a3, 1(a2)
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: slli a3, a3, 8
-; RV64ZVE32F-NEXT: or a2, a3, a2
+; RV64ZVE32F-NEXT: lbu a2, 5(a0)
+; RV64ZVE32F-NEXT: lbu a3, 4(a0)
+; RV64ZVE32F-NEXT: slli a2, a2, 8
+; RV64ZVE32F-NEXT: or a2, a2, a3
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma
@@ -13492,64 +13492,58 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: beqz a2, .LBB107_3
; RV64ZVE32F-NEXT: .LBB107_11: # %cond.load4
-; RV64ZVE32F-NEXT: addi a2, a0, 8
-; RV64ZVE32F-NEXT: lbu a3, 1(a2)
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: slli a3, a3, 8
-; RV64ZVE32F-NEXT: or a2, a3, a2
+; RV64ZVE32F-NEXT: lbu a2, 9(a0)
+; RV64ZVE32F-NEXT: lbu a3, 8(a0)
+; RV64ZVE32F-NEXT: slli a2, a2, 8
+; RV64ZVE32F-NEXT: or a2, a2, a3
; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB107_4
; RV64ZVE32F-NEXT: .LBB107_12: # %cond.load7
-; RV64ZVE32F-NEXT: addi a2, a0, 12
-; RV64ZVE32F-NEXT: lbu a3, 1(a2)
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: slli a3, a3, 8
-; RV64ZVE32F-NEXT: or a2, a3, a2
+; RV64ZVE32F-NEXT: lbu a2, 13(a0)
+; RV64ZVE32F-NEXT: lbu a3, 12(a0)
+; RV64ZVE32F-NEXT: slli a2, a2, 8
+; RV64ZVE32F-NEXT: or a2, a2, a3
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB107_5
; RV64ZVE32F-NEXT: .LBB107_13: # %cond.load10
-; RV64ZVE32F-NEXT: addi a2, a0, 16
-; RV64ZVE32F-NEXT: lbu a3, 1(a2)
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: slli a3, a3, 8
-; RV64ZVE32F-NEXT: or a2, a3, a2
+; RV64ZVE32F-NEXT: lbu a2, 17(a0)
+; RV64ZVE32F-NEXT: lbu a3, 16(a0)
+; RV64ZVE32F-NEXT: slli a2, a2, 8
+; RV64ZVE32F-NEXT: or a2, a2, a3
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
; RV64ZVE32F-NEXT: beqz a2, .LBB107_6
; RV64ZVE32F-NEXT: .LBB107_14: # %cond.load13
-; RV64ZVE32F-NEXT: addi a2, a0, 20
-; RV64ZVE32F-NEXT: lbu a3, 1(a2)
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: slli a3, a3, 8
-; RV64ZVE32F-NEXT: or a2, a3, a2
+; RV64ZVE32F-NEXT: lbu a2, 21(a0)
+; RV64ZVE32F-NEXT: lbu a3, 20(a0)
+; RV64ZVE32F-NEXT: slli a2, a2, 8
+; RV64ZVE32F-NEXT: or a2, a2, a3
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: beqz a2, .LBB107_7
; RV64ZVE32F-NEXT: .LBB107_15: # %cond.load16
-; RV64ZVE32F-NEXT: addi a2, a0, 24
-; RV64ZVE32F-NEXT: lbu a3, 1(a2)
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: slli a3, a3, 8
-; RV64ZVE32F-NEXT: or a2, a3, a2
+; RV64ZVE32F-NEXT: lbu a2, 25(a0)
+; RV64ZVE32F-NEXT: lbu a3, 24(a0)
+; RV64ZVE32F-NEXT: slli a2, a2, 8
+; RV64ZVE32F-NEXT: or a2, a2, a3
; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: beqz a1, .LBB107_8
; RV64ZVE32F-NEXT: .LBB107_16: # %cond.load19
-; RV64ZVE32F-NEXT: addi a0, a0, 28
-; RV64ZVE32F-NEXT: lbu a1, 1(a0)
-; RV64ZVE32F-NEXT: lbu a0, 0(a0)
+; RV64ZVE32F-NEXT: lbu a1, 29(a0)
+; RV64ZVE32F-NEXT: lbu a0, 28(a0)
; RV64ZVE32F-NEXT: slli a1, a1, 8
; RV64ZVE32F-NEXT: or a0, a1, a0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
@@ -13614,8 +13608,7 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 2
; RV64ZVE32F-NEXT: beqz a2, .LBB108_2
; RV64ZVE32F-NEXT: .LBB108_10: # %cond.load1
-; RV64ZVE32F-NEXT: addi a2, a0, 2
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 2(a0)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma
@@ -13623,48 +13616,42 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: beqz a2, .LBB108_3
; RV64ZVE32F-NEXT: .LBB108_11: # %cond.load4
-; RV64ZVE32F-NEXT: addi a2, a0, 8
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB108_4
; RV64ZVE32F-NEXT: .LBB108_12: # %cond.load7
-; RV64ZVE32F-NEXT: addi a2, a0, 10
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 10(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB108_5
; RV64ZVE32F-NEXT: .LBB108_13: # %cond.load10
-; RV64ZVE32F-NEXT: addi a2, a0, 16
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 16(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
; RV64ZVE32F-NEXT: beqz a2, .LBB108_6
; RV64ZVE32F-NEXT: .LBB108_14: # %cond.load13
-; RV64ZVE32F-NEXT: addi a2, a0, 18
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 18(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: beqz a2, .LBB108_7
; RV64ZVE32F-NEXT: .LBB108_15: # %cond.load16
-; RV64ZVE32F-NEXT: addi a2, a0, 24
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 24(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: beqz a1, .LBB108_8
; RV64ZVE32F-NEXT: .LBB108_16: # %cond.load19
-; RV64ZVE32F-NEXT: addi a0, a0, 26
-; RV64ZVE32F-NEXT: lh a0, 0(a0)
+; RV64ZVE32F-NEXT: lh a0, 26(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7
@@ -13730,8 +13717,7 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 2
; RV64ZVE32F-NEXT: beqz a2, .LBB109_2
; RV64ZVE32F-NEXT: .LBB109_10: # %cond.load1
-; RV64ZVE32F-NEXT: addi a2, a0, 6
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 6(a0)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma
@@ -13739,48 +13725,42 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: beqz a2, .LBB109_3
; RV64ZVE32F-NEXT: .LBB109_11: # %cond.load4
-; RV64ZVE32F-NEXT: addi a2, a0, 12
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 12(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB109_4
; RV64ZVE32F-NEXT: .LBB109_12: # %cond.load7
-; RV64ZVE32F-NEXT: addi a2, a0, 14
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 14(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB109_5
; RV64ZVE32F-NEXT: .LBB109_13: # %cond.load10
-; RV64ZVE32F-NEXT: addi a2, a0, 20
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 20(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
; RV64ZVE32F-NEXT: beqz a2, .LBB109_6
; RV64ZVE32F-NEXT: .LBB109_14: # %cond.load13
-; RV64ZVE32F-NEXT: addi a2, a0, 22
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 22(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: beqz a2, .LBB109_7
; RV64ZVE32F-NEXT: .LBB109_15: # %cond.load16
-; RV64ZVE32F-NEXT: addi a2, a0, 28
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 28(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: beqz a1, .LBB109_8
; RV64ZVE32F-NEXT: .LBB109_16: # %cond.load19
-; RV64ZVE32F-NEXT: addi a0, a0, 30
-; RV64ZVE32F-NEXT: lh a0, 0(a0)
+; RV64ZVE32F-NEXT: lh a0, 30(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7
@@ -13846,8 +13826,7 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 2
; RV64ZVE32F-NEXT: beqz a2, .LBB110_2
; RV64ZVE32F-NEXT: .LBB110_10: # %cond.load1
-; RV64ZVE32F-NEXT: addi a2, a0, 30
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 30(a0)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma
@@ -13855,48 +13834,42 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) {
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: beqz a2, .LBB110_3
; RV64ZVE32F-NEXT: .LBB110_11: # %cond.load4
-; RV64ZVE32F-NEXT: addi a2, a0, 24
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 24(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB110_4
; RV64ZVE32F-NEXT: .LBB110_12: # %cond.load7
-; RV64ZVE32F-NEXT: addi a2, a0, 26
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 26(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB110_5
; RV64ZVE32F-NEXT: .LBB110_13: # %cond.load10
-; RV64ZVE32F-NEXT: addi a2, a0, 20
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 20(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
; RV64ZVE32F-NEXT: beqz a2, .LBB110_6
; RV64ZVE32F-NEXT: .LBB110_14: # %cond.load13
-; RV64ZVE32F-NEXT: addi a2, a0, 22
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: lh a2, 22(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: beqz a2, .LBB110_7
; RV64ZVE32F-NEXT: .LBB110_15: # %cond.load16
-; RV64ZVE32F-NEXT: ad...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
e721d9e to 2e7456a
I've removed enabling it by default, but added the option to all currently affected tests as a demonstration.
This makes sense to me as well. LGTM |
I can confirm that the train dataset on 429.mcf went from 17766169495 instructions to 17632812468 in my local testing. I think it was
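(That works out to 17,766,169,495 − 17,632,812,468 = 133,357,027 fewer dynamic instructions, roughly a 0.75% reduction.)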
LGTM.
(Question for future iterations, rather than this patch): Do we ever get an ORI we'd want to fold, due to DAGCombiner converting a+b to a|b when that's a legal transformation?
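To make that concrete, a hypothetical sketch (the alignment assumption and offsets are illustrative, not from this patch): if a0 is known to be 8-byte aligned, then a0 + 4 and a0 | 4 produce the same value because no set bits overlap, so DAGCombiner may emit an ORI where an ADDI would otherwise appear:

# a0 assumed 8-byte aligned, so (a0 + 4) == (a0 | 4):
ori  a1, a0, 4
lw   a2, 0(a1)

# In principle this could be folded the same way as the ADDI case:
lw   a2, 4(a0)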
Great, this is what I saw as well. Thanks for confirming!
LGTM.
2e7456a to 201bfb1
This uses the recently introduced sink-and-fold support in MachineSink. https://reviews.llvm.org/D152828
This enables folding ADDI into load/store addresses.
Enabling by default will be a separate PR.