[RISCV] enable VTYPE before whole RVVReg move #117866
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Piyou Chen (BeMg)

Changes

Address #114518

This patch inserts a VSETVLIX0 instruction before any RVVReg move that is not affected by a VSETVL instruction in the same basic block. The goal is to avoid the illegal-instruction trap caused by a disabled vtype. It may cause regressions due to redundant VSETVL insertions.

Patch is 1.10 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117866.diff

173 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 052b4a61298223..5433916202ec28 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -922,6 +922,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
void forwardVSETVLIAVL(VSETVLIInfo &Info) const;
+ void enableVTYPEBeforeMove(MachineBasicBlock &MBB);
};
} // end anonymous namespace
@@ -1768,6 +1769,56 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
}
}
+static bool isRVVCopy(const MachineInstr &MI) {
+ static const TargetRegisterClass *RVVRegClasses[] = {
+ &RISCV::VRRegClass, &RISCV::VRM2RegClass, &RISCV::VRM4RegClass,
+ &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass, &RISCV::VRN2M2RegClass,
+ &RISCV::VRN2M4RegClass, &RISCV::VRN3M1RegClass, &RISCV::VRN3M2RegClass,
+ &RISCV::VRN4M1RegClass, &RISCV::VRN4M2RegClass, &RISCV::VRN5M1RegClass,
+ &RISCV::VRN6M1RegClass, &RISCV::VRN7M1RegClass, &RISCV::VRN8M1RegClass};
+
+ if (MI.getOpcode() != TargetOpcode::COPY)
+ return false;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ for (const auto &RegClass : RVVRegClasses) {
+ if (RegClass->contains(DstReg, SrcReg)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void RISCVInsertVSETVLI::enableVTYPEBeforeMove(MachineBasicBlock &MBB) {
+ bool NeedVSETVL = true;
+
+ if (!BlockInfo[MBB.getNumber()].Pred.isUnknown() &&
+ BlockInfo[MBB.getNumber()].Pred.isValid())
+ NeedVSETVL = false;
+
+ for (auto &MI : MBB) {
+ if (isVectorConfigInstr(MI) || RISCVII::hasSEWOp(MI.getDesc().TSFlags))
+ NeedVSETVL = false;
+
+ if (MI.isCall() || MI.isInlineAsm())
+ NeedVSETVL = true;
+
+ if (NeedVSETVL && isRVVCopy(MI)) {
+ auto VSETVL0MI =
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(RISCVVType::encodeVTYPE(RISCVII::VLMUL::LMUL_1, 32, false,
+ false))
+ .addReg(RISCV::VL, RegState::Implicit);
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*VSETVL0MI);
+ NeedVSETVL = false;
+ }
+ }
+}
+
bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
// Skip if the vector extension is not enabled.
ST = &MF.getSubtarget<RISCVSubtarget>();
@@ -1798,12 +1849,6 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
}
- // If we didn't find any instructions that need VSETVLI, we're done.
- if (!HaveVectorOp) {
- BlockInfo.clear();
- return false;
- }
-
// Phase 2 - determine the exit VL/VTYPE from each block. We add all
// blocks to the list here, but will also add any that need to be revisited
// during Phase 2 processing.
@@ -1842,6 +1887,9 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
insertReadVL(MBB);
+ for (MachineBasicBlock &MBB : MF)
+ enableVTYPEBeforeMove(MBB);
+
BlockInfo.clear();
return HaveVectorOp;
}
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-v-constraint.ll b/llvm/test/CodeGen/RISCV/inline-asm-v-constraint.ll
index c04e4fea7b2c29..6b566e2df0d798 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-v-constraint.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-v-constraint.ll
@@ -45,6 +45,7 @@ define <vscale x 1 x i8> @constraint_vd(<vscale x 1 x i8> %0, <vscale x 1 x i8>
define <vscale x 1 x i1> @constraint_vm(<vscale x 1 x i1> %0, <vscale x 1 x i1> %1) nounwind {
; RV32I-LABEL: constraint_vm:
; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV32I-NEXT: vmv1r.v v9, v0
; RV32I-NEXT: vmv1r.v v0, v8
; RV32I-NEXT: #APP
@@ -54,6 +55,7 @@ define <vscale x 1 x i1> @constraint_vm(<vscale x 1 x i1> %0, <vscale x 1 x i1>
;
; RV64I-LABEL: constraint_vm:
; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64I-NEXT: vmv1r.v v9, v0
; RV64I-NEXT: vmv1r.v v0, v8
; RV64I-NEXT: #APP
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 163d9145bc3623..685e29ef6d9179 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -567,6 +567,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 66a1178cddb66c..3d0a5cc77ef679 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -3075,6 +3075,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
@@ -3158,6 +3159,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
;
; CHECK-ZVBB-LABEL: vp_bitreverse_nxv64i16:
; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-ZVBB-NEXT: vmv1r.v v24, v0
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: srli a2, a1, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 1c95ec8fafd4f1..19f30a7ce438aa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1584,6 +1584,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
@@ -1631,6 +1632,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
;
; CHECK-ZVKB-LABEL: vp_bswap_nxv64i16:
; CHECK-ZVKB: # %bb.0:
+; CHECK-ZVKB-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-ZVKB-NEXT: vmv1r.v v24, v0
; CHECK-ZVKB-NEXT: csrr a1, vlenb
; CHECK-ZVKB-NEXT: srli a2, a1, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
index a4e5ab661c5285..e85a7af56cc497 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
@@ -336,6 +336,7 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: li a3, 2
; RV32-NEXT: vs8r.v v16, (a1)
+; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV32-NEXT: vmv8r.v v8, v0
; RV32-NEXT: vmv8r.v v16, v24
; RV32-NEXT: call ext2
@@ -374,6 +375,7 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: li a3, 2
; RV64-NEXT: vs8r.v v16, (a1)
+; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64-NEXT: vmv8r.v v8, v0
; RV64-NEXT: vmv8r.v v16, v24
; RV64-NEXT: call ext2
@@ -451,6 +453,7 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 128
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV32-NEXT: vmv8r.v v16, v0
; RV32-NEXT: call ext3
; RV32-NEXT: addi sp, s0, -144
@@ -523,6 +526,7 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 128
; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64-NEXT: vmv8r.v v16, v0
; RV64-NEXT: call ext3
; RV64-NEXT: addi sp, s0, -144
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
index 9b27116fef7cae..05873a4e83aa29 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
@@ -103,6 +103,7 @@ define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @caller_tuple_return(
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: call callee_tuple_return
+; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV32-NEXT: vmv2r.v v6, v8
; RV32-NEXT: vmv2r.v v8, v10
; RV32-NEXT: vmv2r.v v10, v6
@@ -119,6 +120,7 @@ define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @caller_tuple_return(
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: call callee_tuple_return
+; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64-NEXT: vmv2r.v v6, v8
; RV64-NEXT: vmv2r.v v8, v10
; RV64-NEXT: vmv2r.v v10, v6
@@ -144,6 +146,7 @@ define void @caller_tuple_argument(target("riscv.vector.tuple", <vscale x 16 x i
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV32-NEXT: vmv2r.v v6, v8
; RV32-NEXT: vmv2r.v v8, v10
; RV32-NEXT: vmv2r.v v10, v6
@@ -160,6 +163,7 @@ define void @caller_tuple_argument(target("riscv.vector.tuple", <vscale x 16 x i
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64-NEXT: vmv2r.v v6, v8
; RV64-NEXT: vmv2r.v v8, v10
; RV64-NEXT: vmv2r.v v10, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 7d0b0118a72725..b5cf302605f885 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -117,6 +117,7 @@ declare <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat>, <vsc
define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv4bf16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
@@ -169,6 +170,7 @@ declare <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat>, <vsc
define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv8bf16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
@@ -221,6 +223,7 @@ declare <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat>, <
define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv16bf16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
@@ -279,6 +282,7 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
@@ -582,6 +586,7 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
;
; ZVFHMIN-LABEL: vp_ceil_vv_nxv4f16:
; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; ZVFHMIN-NEXT: vmv1r.v v9, v0
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
@@ -649,6 +654,7 @@ declare <vscale x 8 x half> @llvm.vp.ceil.nxv8f16(<vscale x 8 x half>, <vscale x
define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv8f16:
; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
@@ -668,6 +674,7 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
;
; ZVFHMIN-LABEL: vp_ceil_vv_nxv8f16:
; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; ZVFHMIN-NEXT: vmv1r.v v10, v0
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
@@ -735,6 +742,7 @@ declare <vscale x 16 x half> @llvm.vp.ceil.nxv16f16(<vscale x 16 x half>, <vscal
define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv16f16:
; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
@@ -754,6 +762,7 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
;
; ZVFHMIN-LABEL: vp_ceil_vv_nxv16f16:
; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; ZVFHMIN-NEXT: vmv1r.v v12, v0
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
@@ -821,6 +830,7 @@ declare <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half>, <vscal
define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv32f16:
; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
@@ -846,6 +856,7 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; ZVFHMIN-NEXT: vmv1r.v v7, v0
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
@@ -1068,6 +1079,7 @@ declare <vscale x 4 x float> @llvm.vp.ceil.nxv4f32(<vscale x 4 x float>, <vscale
define <vscale x 4 x float> @vp_ceil_vv_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv4f32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -1112,6 +1124,7 @@ declare <vscale x 8 x float> @llvm.vp.ceil.nxv8f32(<vscale x 8 x float>, <vscale
define <vscale x 8 x float> @vp_ceil_vv_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv8f32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -1156,6 +1169,7 @@ declare <vscale x 16 x float> @llvm.vp.ceil.nxv16f32(<vscale x 16 x float>, <vsc
define <vscale x 16 x float> @vp_ceil_vv_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv16f32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1242,6 +1256,7 @@ declare <vscale x 2 x double> @llvm.vp.ceil.nxv2f64(<vscale x 2 x double>, <vsca
define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv2f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
@@ -1286,6 +1301,7 @@ declare <vscale x 4 x double> @llvm.vp.ceil.nxv4f64(<vscale x 4 x double>, <vsca
define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv4f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
@@ -1330,6 +1346,7 @@ declare <vscale x 7 x double> @llvm.vp.ceil.nxv7f64(<vscale x 7 x double>, <vsca
define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv7f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
@@ -1374,6 +1391,7 @@ declare <vscale x 8 x double> @llvm.vp.ceil.nxv8f64(<vscale x 8 x double>, <vsca
define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv8f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
@@ -1425,6 +1443,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: lui a2, %hi(.LCPI44_0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
index bfb2d0a3accc44..d1679b6e2d7fdf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -197,6 +197,7 @@ entry:
define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data) {
; RV64-LABEL: test_compresstore_v256i8:
; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64-NEXT: vmv1r.v v7, v8
; RV64-NEXT: li a2, 128
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -230,6 +231,7 @@ define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV32-NEXT: vmv8r.v v24, v16
; RV32-NEXT: li a2, 128
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash....
[truncated]
(muttering quietly: why can't this be the default hardware behavior...)
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
for (const auto &RegClass : RVVRegClasses) {
  if (RegClass->contains(DstReg, SrcReg)) {
You can use isRVVRegClass here?
IIRC isRVVRegClass is for virtual registers, and the insertvsetvl pass runs after RVV register allocation.
Oh, thanks for the reminder! Would getMinimalPhysRegClass plus a TSFlags check be better than a loop?
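For reference, a rough sketch of what that suggestion might look like. This is only an illustration: it assumes the static helper RISCVRegisterInfo::isRVVRegClass (which checks the vector-class bit in a register class's TSFlags) can be applied to the class returned by getMinimalPhysRegClass, and it would live in RISCVInsertVSETVLI.cpp, which already has the needed includes.

// Sketch only: replaces the hand-written register-class array in isRVVCopy.
// Assumes RISCVRegisterInfo::isRVVRegClass(const TargetRegisterClass *)
// reports whether a class's TSFlags mark it as an RVV register class.
static bool isRVVCopy(const MachineInstr &MI, const TargetRegisterInfo &TRI) {
  if (MI.getOpcode() != TargetOpcode::COPY)
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  // This pass runs after RVV register allocation, so both operands should be
  // physical registers; bail out if not.
  if (!DstReg.isPhysical() || !SrcReg.isPhysical())
    return false;

  // Map each physical register back to its smallest register class and ask
  // whether that class is one of the RVV vector classes.
  return RISCVRegisterInfo::isRVVRegClass(TRI.getMinimalPhysRegClass(DstReg)) &&
         RISCVRegisterInfo::isRVVRegClass(TRI.getMinimalPhysRegClass(SrcReg));
}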
Do you have any performance data available with this patch? My primary concern with the entire "treat this as a compiler problem" approach is that we may have some lurking performance problem exposed by the mitigation. I would strongly like to see measurements of this on representative hardware - in particular, hardware like the BP3, which doesn't have the faulting behavior.

For the code itself, this is working in terms of COPY instructions. I'm not sure this is (or isn't) safe. Are we absolutely sure we don't have any code anywhere that moves a COPY between vector register allocation and lowering? I have not done that due diligence, but it seems like a strong claim to make.

To be clear, the code structure bits are addressable. I suggest using this patch as a test vehicle to check the performance impact before bothering to get the patch "just right".
For most workloads this probably doesn't show up much. The call would have also invalidated the vector registers, so there's nothing to copy unless we also just did a whole register load and immediately copied its result to another register. I think the case that's more likely to show performance issues is when there are vector arguments/returns. That's going to require intrinsic code. So what should we measure performance on?
I mean, anything is better than nothing? The claim that this doesn't affect most workloads seems reasonable, but I'd like to see performance data that actually shows that. Once we have an established baseline, we can refine to workloads of concern.
@@ -922,6 +922,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
  VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
  VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
  void forwardVSETVLIAVL(VSETVLIInfo &Info) const;
  void enableVTYPEBeforeMove(MachineBasicBlock &MBB);
"enable" is an odd word choice here.
Updated.
if (NeedVSETVL && isRVVCopy(MI)) {
  auto VSETVL0MI =
      BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(RISCV::PseudoVSETVLIX0))
          .addReg(RISCV::X0, RegState::Define | RegState::Dead)
You can't use x0, x0 when vtype is vill. The vtype is invalid and thus vlmax is invalid. You can only use x0, x0 when vlmax isn't being reduced since reducing vlmax may need to change VL.
If vlmax is invalid due to vill, I don't think we can know that VL won't need to be reduced for the new vlmax.
Does RISCV::PseudoVSETIVLI, which emits vsetivli zero, 0, e8, m1, tu, mu, work for the whole register move's vtype?
That should work.
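For concreteness, a minimal sketch of what that insertion could look like inside enableVTYPEBeforeMove, mirroring the PseudoVSETVLIX0 sequence quoted above. The operand layout for PseudoVSETIVLI (dead X0 def, immediate AVL, encoded vtype) follows the existing vsetvli insertion code in this pass and should be treated as an assumption rather than the final patch.

// Sketch only: emit "vsetivli zero, 0, e8, m1, tu, mu" before the whole
// register move. With an immediate AVL there is no dependence on the
// (possibly invalid) vlmax, so this avoids the x0,x0 problem noted above.
auto VSETIVLIMI =
    BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(RISCV::PseudoVSETIVLI))
        .addReg(RISCV::X0, RegState::Define | RegState::Dead) // discard VL
        .addImm(0)                                            // AVL = 0
        .addImm(RISCVVType::encodeVTYPE(RISCVII::VLMUL::LMUL_1, /*SEW=*/8,
                                        /*TailAgnostic=*/false,
                                        /*MaskAgnostic=*/false));
if (LIS)
  LIS->InsertMachineInstrInMaps(*VSETIVLIMI);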
I created another patch (#118252) that performs the same task during the COPY pseudo expansion hook, so that it handles any COPY pseudo generated after the insert-vsetvli pass. The current patch could then serve as an optimization that reduces redundant VSETVL instructions by inserting them based on the block analysis.
This is an alternative to llvm#117866 that works by demanding a valid vtype instead of using a separate pass. The main advantage of this is that it allows coalesceVSETVLIs to just reuse an existing vsetvli later in the block. To do this we need to first transfer the vsetvli info to some arbitrary valid state in transferBefore when we encounter a vector copy. Then we add a new vill demanded field that will happily accept any other known vtype, which allows us to coalesce these where possible. Note we also need to check for vector copies in computeVLVTYPEChanges, otherwise the pass will completely skip over functions that only have vector copies and nothing else. This is one part of a fix for llvm#114518. We still need to check whether there are other cases where vector copies/whole register moves are inserted after vsetvli insertion.
#118283 is the better option when we try to resolve this in the insert-vsetvli pass.
Address #114518
This patch inserts a VSETVLIX0 instruction before any RVVReg move that is not affected by a VSETVL instruction in the same basic block. The goal is to avoid the illegal-instruction trap caused by a disabled vtype.
It may cause regressions due to redundant VSETVL insertions.