diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 90f7a4f1edbe94..11d99fa5100e1b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1907,6 +1907,74 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
   return true;
 }
 
+bool RISCVInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+                                         const MachineInstr &AddrI,
+                                         ExtAddrMode &AM) const {
+  switch (MemI.getOpcode()) {
+  default:
+    return false;
+  case RISCV::LB:
+  case RISCV::LBU:
+  case RISCV::LH:
+  case RISCV::LHU:
+  case RISCV::LW:
+  case RISCV::LWU:
+  case RISCV::LD:
+  case RISCV::FLH:
+  case RISCV::FLW:
+  case RISCV::FLD:
+  case RISCV::SB:
+  case RISCV::SH:
+  case RISCV::SW:
+  case RISCV::SD:
+  case RISCV::FSH:
+  case RISCV::FSW:
+  case RISCV::FSD:
+    break;
+  }
+
+  if (MemI.getOperand(0).getReg() == Reg)
+    return false;
+
+  if (AddrI.getOpcode() != RISCV::ADDI || !AddrI.getOperand(1).isReg() ||
+      !AddrI.getOperand(2).isImm())
+    return false;
+
+  int64_t OldOffset = MemI.getOperand(2).getImm();
+  int64_t Disp = AddrI.getOperand(2).getImm();
+  int64_t NewOffset = OldOffset + Disp;
+  if (!STI.is64Bit())
+    NewOffset = SignExtend64<32>(NewOffset);
+
+  if (!isInt<12>(NewOffset))
+    return false;
+
+  AM.BaseReg = AddrI.getOperand(1).getReg();
+  AM.ScaledReg = 0;
+  AM.Scale = 0;
+  AM.Displacement = NewOffset;
+  AM.Form = ExtAddrMode::Formula::Basic;
+  return true;
+}
+
+MachineInstr *RISCVInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
+                                               const ExtAddrMode &AM) const {
+
+  const DebugLoc &DL = MemI.getDebugLoc();
+  MachineBasicBlock &MBB = *MemI.getParent();
+
+  assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
+         "Addressing mode not supported for folding");
+
+  return BuildMI(MBB, MemI, DL, get(MemI.getOpcode()))
+      .addReg(MemI.getOperand(0).getReg(),
+              MemI.mayLoad() ? RegState::Define : 0)
+      .addReg(AM.BaseReg)
+      .addImm(AM.Displacement)
+      .setMemRefs(MemI.memoperands())
+      .setMIFlags(MemI.getFlags());
+}
+
 // Return true if get the base operand, byte offset of an instruction and the
 // memory width. Width is the size of memory that is being loaded/stored.
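Reviewer note (not part of the patch): the legality test in canFoldIntoAddrMode above reduces to arithmetic on two immediates. The ADDI displacement is added to the offset already carried by the load/store, the sum is sign-extended to 32 bits when targeting RV32 (so it matches the wrap-around of 32-bit address arithmetic), and the fold is allowed only if the result still fits the signed 12-bit immediate field of the memory instruction. The standalone C++ sketch below mirrors that rule outside of LLVM; the helper names (combineOffsets, isSImm12, signExtend32) are illustrative only.

// Sketch of the offset-combining rule used by canFoldIntoAddrMode; the helper
// names are made up for illustration and are not LLVM APIs.
#include <cstdint>
#include <cstdio>

// Equivalent of SignExtend64<32>: sign-extend the low 32 bits to 64 bits.
static int64_t signExtend32(int64_t V) { return static_cast<int32_t>(V); }

// Equivalent of isInt<12>: does V fit a signed 12-bit immediate?
static bool isSImm12(int64_t V) { return V >= -2048 && V <= 2047; }

// Combine the offset of "addi t, base, Disp" with a load/store that already
// uses OldOffset(t). Returns true and the merged offset when the fold is legal.
static bool combineOffsets(int64_t OldOffset, int64_t Disp, bool IsRV64,
                           int64_t &NewOffset) {
  NewOffset = OldOffset + Disp;
  if (!IsRV64)
    NewOffset = signExtend32(NewOffset); // RV32 addresses wrap at 32 bits.
  return isSImm12(NewOffset);
}

int main() {
  int64_t Off;
  // "lw a1, 0(t)" with "addi t, base, 8" can become "lw a1, 8(base)".
  std::printf("0 + 8     -> %s\n",
              combineOffsets(0, 8, true, Off) ? "fold" : "keep addi");
  // 2040 + 16 = 2056 no longer fits a simm12, so the ADDI has to stay.
  std::printf("2040 + 16 -> %s\n",
              combineOffsets(2040, 16, true, Off) ? "fold" : "keep addi");
  return 0;
}

This is also the shape of the test updates below: the autogenerated checks in fold-addi-loadstore.ll and split-offsets.ll simply move small constants into the memory-operand offset (for example "lw a1, 0(s7)" becoming "lw a1, 8(s7)" once the ADDI is gone), while any combination that would leave the simm12 range is rejected by the isInt<12> check above.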
 bool RISCVInstrInfo::getMemOperandWithOffsetWidth(
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index d56d3c0b303bf9..5584e5571c9bc3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -137,6 +137,13 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
   bool verifyInstruction(const MachineInstr &MI,
                          StringRef &ErrInfo) const override;
 
+  bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+                           const MachineInstr &AddrI,
+                           ExtAddrMode &AM) const override;
+
+  MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+                                 const ExtAddrMode &AM) const override;
+
   bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
                                     const MachineOperand *&BaseOp,
                                     int64_t &Offset, unsigned &Width,
diff --git a/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp b/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
index 7c6a89b6036fa3..9d7660ba9a4b10 100644
--- a/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
@@ -275,7 +275,7 @@ bool RISCVInitUndef::processBasicBlock(MachineFunction &MF,
     Changed |= handleSubReg(MF, MI, DLD);
     if (MI.isImplicitDef()) {
       auto DstReg = MI.getOperand(0).getReg();
-      if (isVectorRegClass(DstReg))
+      if (DstReg.isVirtual() && isVectorRegClass(DstReg))
         Changed |= handleImplicitDef(MBB, I);
     }
   }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 69a0569fccc4ec..1281528ea511a4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -78,6 +78,11 @@ static cl::opt<bool> EnableRISCVDeadRegisterElimination(
              " them with stores to x0"),
     cl::init(true));
 
+static cl::opt<bool>
+    EnableSinkFold("riscv-enable-sink-fold",
+                   cl::desc("Enable sinking and folding of instruction copies"),
+                   cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -242,7 +247,9 @@ namespace {
 class RISCVPassConfig : public TargetPassConfig {
 public:
   RISCVPassConfig(RISCVTargetMachine &TM, PassManagerBase &PM)
-      : TargetPassConfig(TM, PM) {}
+      : TargetPassConfig(TM, PM) {
+    setEnableSinkAndFold(EnableSinkFold);
+  }
 
   RISCVTargetMachine &getRISCVTargetMachine() const {
     return getTM<RISCVTargetMachine>();
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 5a6e0baf752d07..321857b2104eb5 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefix=RV32I %s
+; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I %s
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -code-model=medium < %s \
-; RUN:   | FileCheck -check-prefix=RV32I-MEDIUM %s
+; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I-MEDIUM %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefix=RV64I %s
+; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -code-model=medium < %s \
-; RUN:   | FileCheck -check-prefix=RV64I-MEDIUM %s
+; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I-MEDIUM %s
 
 ; We can often fold an ADDI into the offset of load/store instructions:
 ;   (load (addi base, off1),
off2) -> (load base, off1+off2) @@ -769,14 +769,13 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV32I-NEXT: li s3, 0 ; RV32I-NEXT: li s4, 0 ; RV32I-NEXT: slli a0, a0, 4 -; RV32I-NEXT: add a0, s0, a0 -; RV32I-NEXT: addi s7, a0, 8 +; RV32I-NEXT: add s7, s0, a0 ; RV32I-NEXT: .LBB20_5: # %for.body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call f@plt -; RV32I-NEXT: lw a0, 4(s7) -; RV32I-NEXT: lw a1, 0(s7) +; RV32I-NEXT: lw a0, 12(s7) +; RV32I-NEXT: lw a1, 8(s7) ; RV32I-NEXT: add a0, a0, s4 ; RV32I-NEXT: add s3, a1, s3 ; RV32I-NEXT: sltu s4, s3, a1 @@ -835,14 +834,13 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV32I-MEDIUM-NEXT: li s3, 0 ; RV32I-MEDIUM-NEXT: li s4, 0 ; RV32I-MEDIUM-NEXT: slli a0, a0, 4 -; RV32I-MEDIUM-NEXT: add a0, s0, a0 -; RV32I-MEDIUM-NEXT: addi s7, a0, 8 +; RV32I-MEDIUM-NEXT: add s7, s0, a0 ; RV32I-MEDIUM-NEXT: .LBB20_5: # %for.body ; RV32I-MEDIUM-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-MEDIUM-NEXT: mv a0, s0 ; RV32I-MEDIUM-NEXT: call f@plt -; RV32I-MEDIUM-NEXT: lw a0, 4(s7) -; RV32I-MEDIUM-NEXT: lw a1, 0(s7) +; RV32I-MEDIUM-NEXT: lw a0, 12(s7) +; RV32I-MEDIUM-NEXT: lw a1, 8(s7) ; RV32I-MEDIUM-NEXT: add a0, a0, s4 ; RV32I-MEDIUM-NEXT: add s3, a1, s3 ; RV32I-MEDIUM-NEXT: sltu s4, s3, a1 @@ -883,13 +881,12 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV64I-NEXT: mv s1, a1 ; RV64I-NEXT: li s2, 0 ; RV64I-NEXT: slli a0, a0, 4 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: addi s3, a0, 8 +; RV64I-NEXT: add s3, a2, a0 ; RV64I-NEXT: .LBB20_2: # %for.body ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call f@plt -; RV64I-NEXT: ld a0, 0(s3) +; RV64I-NEXT: ld a0, 8(s3) ; RV64I-NEXT: addi s1, s1, -1 ; RV64I-NEXT: add s2, a0, s2 ; RV64I-NEXT: bnez s1, .LBB20_2 @@ -920,13 +917,12 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV64I-MEDIUM-NEXT: mv s1, a1 ; RV64I-MEDIUM-NEXT: li s2, 0 ; RV64I-MEDIUM-NEXT: slli a0, a0, 4 -; RV64I-MEDIUM-NEXT: add a0, a2, a0 -; RV64I-MEDIUM-NEXT: addi s3, a0, 8 +; RV64I-MEDIUM-NEXT: add s3, a2, a0 ; RV64I-MEDIUM-NEXT: .LBB20_2: # %for.body ; RV64I-MEDIUM-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64I-MEDIUM-NEXT: mv a0, s0 ; RV64I-MEDIUM-NEXT: call f@plt -; RV64I-MEDIUM-NEXT: ld a0, 0(s3) +; RV64I-MEDIUM-NEXT: ld a0, 8(s3) ; RV64I-MEDIUM-NEXT: addi s1, s1, -1 ; RV64I-MEDIUM-NEXT: add s2, a0, s2 ; RV64I-MEDIUM-NEXT: bnez s1, .LBB20_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index f8a8ffd3a07970..6ee0e4525f5ec7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V +; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V +; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b 
-target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F +; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F +; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F declare <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i8>) @@ -12972,38 +12972,39 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) { ; RV64ZVE32F-NEXT: vmset.m v8 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: # implicit-def: $v8 -; RV64ZVE32F-NEXT: bnez zero, .LBB106_2 -; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero -; RV64ZVE32F-NEXT: .LBB106_2: # %else -; RV64ZVE32F-NEXT: andi a3, a1, 2 -; RV64ZVE32F-NEXT: addi a2, a0, -512 -; RV64ZVE32F-NEXT: bnez a3, .LBB106_6 -; RV64ZVE32F-NEXT: # %bb.3: # %else2 -; RV64ZVE32F-NEXT: andi a3, a1, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB106_7 -; RV64ZVE32F-NEXT: .LBB106_4: # %else5 +; RV64ZVE32F-NEXT: beqz zero, .LBB106_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB106_6 +; RV64ZVE32F-NEXT: .LBB106_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB106_7 +; RV64ZVE32F-NEXT: .LBB106_3: # %else5 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: bnez a1, .LBB106_8 -; RV64ZVE32F-NEXT: .LBB106_5: # %else8 +; RV64ZVE32F-NEXT: .LBB106_4: # %else8 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB106_5: # %cond.load +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB106_2 ; RV64ZVE32F-NEXT: .LBB106_6: # %cond.load1 -; RV64ZVE32F-NEXT: lw a3, 0(a2) +; RV64ZVE32F-NEXT: lw a2, -512(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: andi a3, a1, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB106_4 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB106_3 ; RV64ZVE32F-NEXT: .LBB106_7: # %cond.load4 -; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a1, a1, 8 -; RV64ZVE32F-NEXT: beqz a1, .LBB106_5 +; RV64ZVE32F-NEXT: beqz a1, .LBB106_4 ; RV64ZVE32F-NEXT: .LBB106_8: # %cond.load7 -; RV64ZVE32F-NEXT: lw a0, 0(a2) +; RV64ZVE32F-NEXT: lw a0, -512(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 @@ -13480,11 +13481,10 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_2 ; RV64ZVE32F-NEXT: .LBB107_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lbu a3, 1(a2) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: slli a3, a3, 8 -; RV64ZVE32F-NEXT: or a2, a3, a2 +; RV64ZVE32F-NEXT: lbu a2, 5(a0) +; RV64ZVE32F-NEXT: lbu a3, 4(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 8 +; RV64ZVE32F-NEXT: or a2, a2, a3 ; RV64ZVE32F-NEXT: vsetvli 
zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -13492,64 +13492,58 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_3 ; RV64ZVE32F-NEXT: .LBB107_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 8 -; RV64ZVE32F-NEXT: lbu a3, 1(a2) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: slli a3, a3, 8 -; RV64ZVE32F-NEXT: or a2, a3, a2 +; RV64ZVE32F-NEXT: lbu a2, 9(a0) +; RV64ZVE32F-NEXT: lbu a3, 8(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 8 +; RV64ZVE32F-NEXT: or a2, a2, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_4 ; RV64ZVE32F-NEXT: .LBB107_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 12 -; RV64ZVE32F-NEXT: lbu a3, 1(a2) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: slli a3, a3, 8 -; RV64ZVE32F-NEXT: or a2, a3, a2 +; RV64ZVE32F-NEXT: lbu a2, 13(a0) +; RV64ZVE32F-NEXT: lbu a3, 12(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 8 +; RV64ZVE32F-NEXT: or a2, a2, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_5 ; RV64ZVE32F-NEXT: .LBB107_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 16 -; RV64ZVE32F-NEXT: lbu a3, 1(a2) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: slli a3, a3, 8 -; RV64ZVE32F-NEXT: or a2, a3, a2 +; RV64ZVE32F-NEXT: lbu a2, 17(a0) +; RV64ZVE32F-NEXT: lbu a3, 16(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 8 +; RV64ZVE32F-NEXT: or a2, a2, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_6 ; RV64ZVE32F-NEXT: .LBB107_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 20 -; RV64ZVE32F-NEXT: lbu a3, 1(a2) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: slli a3, a3, 8 -; RV64ZVE32F-NEXT: or a2, a3, a2 +; RV64ZVE32F-NEXT: lbu a2, 21(a0) +; RV64ZVE32F-NEXT: lbu a3, 20(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 8 +; RV64ZVE32F-NEXT: or a2, a2, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_7 ; RV64ZVE32F-NEXT: .LBB107_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 24 -; RV64ZVE32F-NEXT: lbu a3, 1(a2) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: slli a3, a3, 8 -; RV64ZVE32F-NEXT: or a2, a3, a2 +; RV64ZVE32F-NEXT: lbu a2, 25(a0) +; RV64ZVE32F-NEXT: lbu a3, 24(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 8 +; RV64ZVE32F-NEXT: or a2, a2, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB107_8 ; RV64ZVE32F-NEXT: .LBB107_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 28 -; RV64ZVE32F-NEXT: lbu a1, 1(a0) -; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: lbu a1, 29(a0) +; RV64ZVE32F-NEXT: lbu a0, 28(a0) ; RV64ZVE32F-NEXT: slli a1, a1, 8 ; RV64ZVE32F-NEXT: or a0, a1, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma @@ -13614,8 +13608,7 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_2 ; 
RV64ZVE32F-NEXT: .LBB108_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -13623,48 +13616,42 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_3 ; RV64ZVE32F-NEXT: .LBB108_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 8 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_4 ; RV64ZVE32F-NEXT: .LBB108_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 10 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_5 ; RV64ZVE32F-NEXT: .LBB108_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 16 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 16(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_6 ; RV64ZVE32F-NEXT: .LBB108_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 18 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_7 ; RV64ZVE32F-NEXT: .LBB108_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 24 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB108_8 ; RV64ZVE32F-NEXT: .LBB108_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 26 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 26(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -13730,8 +13717,7 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB109_2 ; RV64ZVE32F-NEXT: .LBB109_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 6 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 6(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -13739,48 +13725,42 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB109_3 ; RV64ZVE32F-NEXT: .LBB109_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 12 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 12(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB109_4 ; RV64ZVE32F-NEXT: .LBB109_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 14 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: 
lh a2, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB109_5 ; RV64ZVE32F-NEXT: .LBB109_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 20 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB109_6 ; RV64ZVE32F-NEXT: .LBB109_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 22 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB109_7 ; RV64ZVE32F-NEXT: .LBB109_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 28 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 28(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB109_8 ; RV64ZVE32F-NEXT: .LBB109_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 30 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 30(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -13846,8 +13826,7 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB110_2 ; RV64ZVE32F-NEXT: .LBB110_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 30 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 30(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -13855,48 +13834,42 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB110_3 ; RV64ZVE32F-NEXT: .LBB110_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 24 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB110_4 ; RV64ZVE32F-NEXT: .LBB110_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 26 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 26(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB110_5 ; RV64ZVE32F-NEXT: .LBB110_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 20 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB110_6 ; RV64ZVE32F-NEXT: .LBB110_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 22 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, 
.LBB110_7 ; RV64ZVE32F-NEXT: .LBB110_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 16 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 16(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB110_8 ; RV64ZVE32F-NEXT: .LBB110_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 18 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -13962,8 +13935,7 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB111_2 ; RV64ZVE32F-NEXT: .LBB111_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 30 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 30(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -13971,48 +13943,42 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB111_3 ; RV64ZVE32F-NEXT: .LBB111_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 20 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB111_4 ; RV64ZVE32F-NEXT: .LBB111_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 22 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB111_5 ; RV64ZVE32F-NEXT: .LBB111_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 12 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 12(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB111_6 ; RV64ZVE32F-NEXT: .LBB111_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 14 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB111_7 ; RV64ZVE32F-NEXT: .LBB111_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB111_8 ; RV64ZVE32F-NEXT: .LBB111_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 6 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -14076,8 +14042,7 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB112_2 ; RV64ZVE32F-NEXT: .LBB112_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 
2(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -14085,48 +14050,42 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB112_3 ; RV64ZVE32F-NEXT: .LBB112_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 16 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 16(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB112_4 ; RV64ZVE32F-NEXT: .LBB112_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 18 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB112_5 ; RV64ZVE32F-NEXT: .LBB112_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 8 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB112_6 ; RV64ZVE32F-NEXT: .LBB112_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 10 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB112_7 ; RV64ZVE32F-NEXT: .LBB112_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB112_8 ; RV64ZVE32F-NEXT: .LBB112_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 6 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -14193,8 +14152,7 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB113_2 ; RV64ZVE32F-NEXT: .LBB113_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -14202,48 +14160,42 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB113_3 ; RV64ZVE32F-NEXT: .LBB113_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 18 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB113_4 ; RV64ZVE32F-NEXT: .LBB113_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 20 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; 
RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB113_5 ; RV64ZVE32F-NEXT: .LBB113_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 8 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB113_6 ; RV64ZVE32F-NEXT: .LBB113_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 10 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB113_7 ; RV64ZVE32F-NEXT: .LBB113_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB113_8 ; RV64ZVE32F-NEXT: .LBB113_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 6 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -14282,84 +14234,80 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) { ; RV64ZVE32F-NEXT: vmset.m v8 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: # implicit-def: $v8 -; RV64ZVE32F-NEXT: bnez zero, .LBB114_2 -; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero -; RV64ZVE32F-NEXT: .LBB114_2: # %else -; RV64ZVE32F-NEXT: andi a3, a1, 2 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB114_10 -; RV64ZVE32F-NEXT: # %bb.3: # %else2 -; RV64ZVE32F-NEXT: andi a3, a1, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB114_11 -; RV64ZVE32F-NEXT: .LBB114_4: # %else5 -; RV64ZVE32F-NEXT: andi a3, a1, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB114_12 -; RV64ZVE32F-NEXT: .LBB114_5: # %else8 -; RV64ZVE32F-NEXT: andi a3, a1, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB114_13 -; RV64ZVE32F-NEXT: .LBB114_6: # %else11 -; RV64ZVE32F-NEXT: andi a3, a1, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB114_14 -; RV64ZVE32F-NEXT: .LBB114_7: # %else14 -; RV64ZVE32F-NEXT: andi a3, a1, 64 -; RV64ZVE32F-NEXT: bnez a3, .LBB114_15 -; RV64ZVE32F-NEXT: .LBB114_8: # %else17 +; RV64ZVE32F-NEXT: beqz zero, .LBB114_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB114_10 +; RV64ZVE32F-NEXT: .LBB114_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB114_11 +; RV64ZVE32F-NEXT: .LBB114_3: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB114_12 +; RV64ZVE32F-NEXT: .LBB114_4: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB114_13 +; RV64ZVE32F-NEXT: .LBB114_5: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB114_14 +; RV64ZVE32F-NEXT: .LBB114_6: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: bnez a2, .LBB114_15 +; RV64ZVE32F-NEXT: .LBB114_7: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB114_16 -; RV64ZVE32F-NEXT: .LBB114_9: # %else20 +; RV64ZVE32F-NEXT: .LBB114_8: # %else20 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB114_9: # %cond.load +; RV64ZVE32F-NEXT: addi a2, a0, 2 +; 
RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB114_2 ; RV64ZVE32F-NEXT: .LBB114_10: # %cond.load1 -; RV64ZVE32F-NEXT: lh a3, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: andi a3, a1, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB114_4 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB114_3 ; RV64ZVE32F-NEXT: .LBB114_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a3, a0, 18 -; RV64ZVE32F-NEXT: lh a3, 0(a3) +; RV64ZVE32F-NEXT: lh a2, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 -; RV64ZVE32F-NEXT: andi a3, a1, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB114_5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB114_4 ; RV64ZVE32F-NEXT: .LBB114_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a3, a0, 20 -; RV64ZVE32F-NEXT: lh a3, 0(a3) +; RV64ZVE32F-NEXT: lh a2, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: andi a3, a1, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB114_6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB114_5 ; RV64ZVE32F-NEXT: .LBB114_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a3, a0, 8 -; RV64ZVE32F-NEXT: lh a3, 0(a3) +; RV64ZVE32F-NEXT: lh a2, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 -; RV64ZVE32F-NEXT: andi a3, a1, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB114_7 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB114_6 ; RV64ZVE32F-NEXT: .LBB114_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a3, a0, 10 -; RV64ZVE32F-NEXT: lh a3, 0(a3) +; RV64ZVE32F-NEXT: lh a2, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 -; RV64ZVE32F-NEXT: andi a3, a1, 64 -; RV64ZVE32F-NEXT: beqz a3, .LBB114_8 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: beqz a2, .LBB114_7 ; RV64ZVE32F-NEXT: .LBB114_15: # %cond.load16 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB114_9 +; RV64ZVE32F-NEXT: beqz a1, .LBB114_8 ; RV64ZVE32F-NEXT: .LBB114_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 6 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -14430,8 +14378,7 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB115_2 ; RV64ZVE32F-NEXT: .LBB115_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -14439,48 +14386,42 @@ define <8 x i16> 
@mgather_gather_4xSEW(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB115_3 ; RV64ZVE32F-NEXT: .LBB115_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB115_4 ; RV64ZVE32F-NEXT: .LBB115_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 6 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB115_5 ; RV64ZVE32F-NEXT: .LBB115_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 16 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 16(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB115_6 ; RV64ZVE32F-NEXT: .LBB115_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 18 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB115_7 ; RV64ZVE32F-NEXT: .LBB115_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 20 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB115_8 ; RV64ZVE32F-NEXT: .LBB115_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 22 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -14548,8 +14489,7 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB116_2 ; RV64ZVE32F-NEXT: .LBB116_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -14557,48 +14497,42 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB116_3 ; RV64ZVE32F-NEXT: .LBB116_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB116_4 ; RV64ZVE32F-NEXT: .LBB116_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 6 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB116_5 ; RV64ZVE32F-NEXT: .LBB116_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 16 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; 
RV64ZVE32F-NEXT: lh a2, 16(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB116_6 ; RV64ZVE32F-NEXT: .LBB116_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 18 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB116_7 ; RV64ZVE32F-NEXT: .LBB116_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 20 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB116_8 ; RV64ZVE32F-NEXT: .LBB116_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 22 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 @@ -14678,8 +14612,7 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB118_2 ; RV64ZVE32F-NEXT: .LBB118_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 10 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 10(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -14687,16 +14620,14 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB118_3 ; RV64ZVE32F-NEXT: .LBB118_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 12 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 12(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB118_4 ; RV64ZVE32F-NEXT: .LBB118_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 14 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 @@ -14710,24 +14641,21 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB118_6 ; RV64ZVE32F-NEXT: .LBB118_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB118_7 ; RV64ZVE32F-NEXT: .LBB118_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB118_8 ; RV64ZVE32F-NEXT: .LBB118_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 6 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 
@@ -14795,8 +14723,7 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB119_2 ; RV64ZVE32F-NEXT: .LBB119_10: # %cond.load1 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma @@ -14804,48 +14731,42 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB119_3 ; RV64ZVE32F-NEXT: .LBB119_11: # %cond.load4 -; RV64ZVE32F-NEXT: addi a2, a0, 6 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB119_4 ; RV64ZVE32F-NEXT: .LBB119_12: # %cond.load7 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB119_5 ; RV64ZVE32F-NEXT: .LBB119_13: # %cond.load10 -; RV64ZVE32F-NEXT: addi a2, a0, 8 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB119_6 ; RV64ZVE32F-NEXT: .LBB119_14: # %cond.load13 -; RV64ZVE32F-NEXT: addi a2, a0, 10 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB119_7 ; RV64ZVE32F-NEXT: .LBB119_15: # %cond.load16 -; RV64ZVE32F-NEXT: addi a2, a0, 12 -; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a2, 12(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: beqz a1, .LBB119_8 ; RV64ZVE32F-NEXT: .LBB119_16: # %cond.load19 -; RV64ZVE32F-NEXT: addi a0, a0, 14 -; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a0, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index 2ec3a5e464b879..fc35bc4d2a16d8 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck %s -check-prefix=RV32I -; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck %s -check-prefix=RV64I ; Check that memory accesses to array elements with large offsets have those @@ -157,23 +157,21 @@ define void @test4(ptr %dest) { ; RV32I-LABEL: test4: ; RV32I: # %bb.0: ; RV32I-NEXT: addi a0, a0, 2047 -; RV32I-NEXT: addi a1, a0, 1 -; RV32I-NEXT: 
li a2, 1 -; RV32I-NEXT: sb a2, 1(a0) -; RV32I-NEXT: sb a2, 1(a1) -; RV32I-NEXT: sb a2, 2(a1) -; RV32I-NEXT: sb a2, 3(a1) +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: sb a1, 1(a0) +; RV32I-NEXT: sb a1, 2(a0) +; RV32I-NEXT: sb a1, 3(a0) +; RV32I-NEXT: sb a1, 4(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test4: ; RV64I: # %bb.0: ; RV64I-NEXT: addi a0, a0, 2047 -; RV64I-NEXT: addi a1, a0, 1 -; RV64I-NEXT: li a2, 1 -; RV64I-NEXT: sb a2, 1(a0) -; RV64I-NEXT: sb a2, 1(a1) -; RV64I-NEXT: sb a2, 2(a1) -; RV64I-NEXT: sb a2, 3(a1) +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: sb a1, 1(a0) +; RV64I-NEXT: sb a1, 2(a0) +; RV64I-NEXT: sb a1, 3(a0) +; RV64I-NEXT: sb a1, 4(a0) ; RV64I-NEXT: ret %p1 = getelementptr i8, ptr %dest, i32 2048 store i8 1, ptr %p1 diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 2e0c541311e10b..3335ca3a34b6c6 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefix=RV32I %s -; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefix=RV32IM %s -; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefix=RV64IM %s define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { @@ -1085,15 +1085,15 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s1, 24(a1) -; RV32I-NEXT: lw s2, 28(a1) -; RV32I-NEXT: lw s3, 16(a1) -; RV32I-NEXT: lw s4, 20(a1) -; RV32I-NEXT: lw s5, 8(a1) -; RV32I-NEXT: lw s6, 12(a1) +; RV32I-NEXT: lw s0, 24(a1) +; RV32I-NEXT: lw s1, 28(a1) +; RV32I-NEXT: lw s2, 16(a1) +; RV32I-NEXT: lw s3, 20(a1) +; RV32I-NEXT: lw s4, 8(a1) +; RV32I-NEXT: lw s5, 12(a1) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: li a3, 0 @@ -1101,33 +1101,33 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 ; RV32I-NEXT: li a2, 654 -; RV32I-NEXT: mv a0, s5 -; RV32I-NEXT: mv a1, s6 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s5 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: mv s6, a1 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s5, a1 ; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: mv a1, s4 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv s4, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s3, a1 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call 
__moddi3@plt -; RV32I-NEXT: sw a1, 28(s0) -; RV32I-NEXT: sw a0, 24(s0) -; RV32I-NEXT: sw s4, 20(s0) -; RV32I-NEXT: sw s3, 16(s0) -; RV32I-NEXT: sw s6, 12(s0) -; RV32I-NEXT: sw s5, 8(s0) -; RV32I-NEXT: sw s8, 4(s0) -; RV32I-NEXT: sw s7, 0(s0) +; RV32I-NEXT: sw a1, 28(s6) +; RV32I-NEXT: sw a0, 24(s6) +; RV32I-NEXT: sw s3, 20(s6) +; RV32I-NEXT: sw s2, 16(s6) +; RV32I-NEXT: sw s5, 12(s6) +; RV32I-NEXT: sw s4, 8(s6) +; RV32I-NEXT: sw s8, 4(s6) +; RV32I-NEXT: sw s7, 0(s6) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -1154,15 +1154,15 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s1, 24(a1) -; RV32IM-NEXT: lw s2, 28(a1) -; RV32IM-NEXT: lw s3, 16(a1) -; RV32IM-NEXT: lw s4, 20(a1) -; RV32IM-NEXT: lw s5, 8(a1) -; RV32IM-NEXT: lw s6, 12(a1) +; RV32IM-NEXT: lw s0, 24(a1) +; RV32IM-NEXT: lw s1, 28(a1) +; RV32IM-NEXT: lw s2, 16(a1) +; RV32IM-NEXT: lw s3, 20(a1) +; RV32IM-NEXT: lw s4, 8(a1) +; RV32IM-NEXT: lw s5, 12(a1) ; RV32IM-NEXT: lw a3, 0(a1) ; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: mv s6, a0 ; RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: li a3, 0 @@ -1170,33 +1170,33 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 ; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s5 -; RV32IM-NEXT: mv a1, s6 +; RV32IM-NEXT: mv a0, s4 +; RV32IM-NEXT: mv a1, s5 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: mv s5, a0 -; RV32IM-NEXT: mv s6, a1 +; RV32IM-NEXT: mv s4, a0 +; RV32IM-NEXT: mv s5, a1 ; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s3 -; RV32IM-NEXT: mv a1, s4 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: mv s3, a0 -; RV32IM-NEXT: mv s4, a1 +; RV32IM-NEXT: mv s2, a0 +; RV32IM-NEXT: mv s3, a1 ; RV32IM-NEXT: lui a0, 1 ; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s1 -; RV32IM-NEXT: mv a1, s2 +; RV32IM-NEXT: mv a0, s0 +; RV32IM-NEXT: mv a1, s1 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw a0, 24(s0) -; RV32IM-NEXT: sw s4, 20(s0) -; RV32IM-NEXT: sw s3, 16(s0) -; RV32IM-NEXT: sw s6, 12(s0) -; RV32IM-NEXT: sw s5, 8(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s7, 0(s0) +; RV32IM-NEXT: sw a1, 28(s6) +; RV32IM-NEXT: sw a0, 24(s6) +; RV32IM-NEXT: sw s3, 20(s6) +; RV32IM-NEXT: sw s2, 16(s6) +; RV32IM-NEXT: sw s5, 12(s6) +; RV32IM-NEXT: sw s4, 8(s6) +; RV32IM-NEXT: sw s8, 4(s6) +; RV32IM-NEXT: sw s7, 0(s6) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index ac67b9005b3d07..32aca29d16e9b9 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s -; RUN: llc -mtriple=riscv32 -mattr=+m 
-verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s -; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s @@ -791,15 +791,15 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s1, 24(a1) -; RV32I-NEXT: lw s2, 28(a1) -; RV32I-NEXT: lw s3, 16(a1) -; RV32I-NEXT: lw s4, 20(a1) -; RV32I-NEXT: lw s5, 8(a1) -; RV32I-NEXT: lw s6, 12(a1) +; RV32I-NEXT: lw s0, 24(a1) +; RV32I-NEXT: lw s1, 28(a1) +; RV32I-NEXT: lw s2, 16(a1) +; RV32I-NEXT: lw s3, 20(a1) +; RV32I-NEXT: lw s4, 8(a1) +; RV32I-NEXT: lw s5, 12(a1) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: li a3, 0 @@ -807,33 +807,33 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 ; RV32I-NEXT: li a2, 654 -; RV32I-NEXT: mv a0, s5 -; RV32I-NEXT: mv a1, s6 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s5 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: mv s6, a1 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s5, a1 ; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: mv a1, s4 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv s4, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s3, a1 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: sw a1, 28(s0) -; RV32I-NEXT: sw a0, 24(s0) -; RV32I-NEXT: sw s4, 20(s0) -; RV32I-NEXT: sw s3, 16(s0) -; RV32I-NEXT: sw s6, 12(s0) -; RV32I-NEXT: sw s5, 8(s0) -; RV32I-NEXT: sw s8, 4(s0) -; RV32I-NEXT: sw s7, 0(s0) +; RV32I-NEXT: sw a1, 28(s6) +; RV32I-NEXT: sw a0, 24(s6) +; RV32I-NEXT: sw s3, 20(s6) +; RV32I-NEXT: sw s2, 16(s6) +; RV32I-NEXT: sw s5, 12(s6) +; RV32I-NEXT: sw s4, 8(s6) +; RV32I-NEXT: sw s8, 4(s6) +; RV32I-NEXT: sw s7, 0(s6) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -860,15 +860,15 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s1, 24(a1) -; RV32IM-NEXT: lw s2, 28(a1) -; RV32IM-NEXT: lw s3, 16(a1) -; RV32IM-NEXT: lw s4, 20(a1) -; RV32IM-NEXT: lw s5, 8(a1) -; RV32IM-NEXT: lw s6, 12(a1) +; RV32IM-NEXT: lw s0, 24(a1) +; RV32IM-NEXT: lw s1, 28(a1) +; RV32IM-NEXT: lw s2, 16(a1) +; RV32IM-NEXT: lw s3, 20(a1) +; RV32IM-NEXT: lw s4, 8(a1) +; RV32IM-NEXT: lw s5, 12(a1) ; RV32IM-NEXT: lw a3, 0(a1) ; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: mv s6, a0 ; 
RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: li a3, 0 @@ -876,33 +876,33 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 ; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s5 -; RV32IM-NEXT: mv a1, s6 +; RV32IM-NEXT: mv a0, s4 +; RV32IM-NEXT: mv a1, s5 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: mv s5, a0 -; RV32IM-NEXT: mv s6, a1 +; RV32IM-NEXT: mv s4, a0 +; RV32IM-NEXT: mv s5, a1 ; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s3 -; RV32IM-NEXT: mv a1, s4 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: mv s3, a0 -; RV32IM-NEXT: mv s4, a1 +; RV32IM-NEXT: mv s2, a0 +; RV32IM-NEXT: mv s3, a1 ; RV32IM-NEXT: lui a0, 1 ; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s1 -; RV32IM-NEXT: mv a1, s2 +; RV32IM-NEXT: mv a0, s0 +; RV32IM-NEXT: mv a1, s1 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw a0, 24(s0) -; RV32IM-NEXT: sw s4, 20(s0) -; RV32IM-NEXT: sw s3, 16(s0) -; RV32IM-NEXT: sw s6, 12(s0) -; RV32IM-NEXT: sw s5, 8(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s7, 0(s0) +; RV32IM-NEXT: sw a1, 28(s6) +; RV32IM-NEXT: sw a0, 24(s6) +; RV32IM-NEXT: sw s3, 20(s6) +; RV32IM-NEXT: sw s2, 16(s6) +; RV32IM-NEXT: sw s5, 12(s6) +; RV32IM-NEXT: sw s4, 8(s6) +; RV32IM-NEXT: sw s8, 4(s6) +; RV32IM-NEXT: sw s7, 0(s6) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload