[RISCV] Remove vmv.s.x and vmv.x.s lmul pseudo variants #71501
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

vmv.s.x and vmv.x.s ignore LMUL, so we can replace the PseudoVMV_S_X_MX and PseudoVMV_X_S_MX pseudos with just one pseudo each. The test diff is due to the fact that a PseudoVMV_S_X/PseudoVMV_X_S no longer carries any information about LMUL, so if it's the only vector pseudo instruction in a block it now defaults to LMUL=1.

Patch is 328.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71501.diff

32 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 51a235bf2ca1861..5e0adf0ba72eeeb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3362,7 +3362,9 @@ static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
static bool isImplicitDef(SDValue V) {
return V.isMachineOpcode() &&
- V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
+ (V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF ||
+ (V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG &&
+ isImplicitDef(V.getOperand(0))));
}
// Optimize masked RVV pseudo instructions with a known all-ones mask to their
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index bec67153b6543d4..9e4692843430837 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -6753,24 +6753,19 @@ defm PseudoVID : VPseudoVID_V;
let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- foreach m = MxList in {
- defvar mx = m.MX;
- let VLMul = m.value in {
- let HasSEWOp = 1, BaseInstr = VMV_X_S in
- def PseudoVMV_X_S # "_" # mx:
- Pseudo<(outs GPR:$rd), (ins m.vrclass:$rs2, ixlenimm:$sew), []>,
- Sched<[WriteVIMovVX, ReadVIMovVX]>,
- RISCVVPseudo;
- let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X,
- Constraints = "$rd = $rs1" in
- def PseudoVMV_S_X # "_" # mx: Pseudo<(outs m.vrclass:$rd),
- (ins m.vrclass:$rs1, GPR:$rs2,
- AVL:$vl, ixlenimm:$sew),
- []>,
- Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>,
- RISCVVPseudo;
- }
- }
+ let HasSEWOp = 1, BaseInstr = VMV_X_S in
+ def PseudoVMV_X_S:
+ Pseudo<(outs GPR:$rd), (ins VR:$rs2, ixlenimm:$sew), []>,
+ Sched<[WriteVIMovVX, ReadVIMovVX]>,
+ RISCVVPseudo;
+ let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X,
+ Constraints = "$rd = $rs1" in
+ def PseudoVMV_S_X: Pseudo<(outs VR:$rd),
+ (ins VR:$rs1, GPR:$rs2,
+ AVL:$vl, ixlenimm:$sew),
+ []>,
+ Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>,
+ RISCVVPseudo;
}
} // Predicates = [HasVInstructions]
@@ -7400,7 +7395,12 @@ defm : VPatNullaryV<"int_riscv_vid", "PseudoVID">;
foreach vti = AllIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in
def : Pat<(XLenVT (riscv_vmv_x_s (vti.Vector vti.RegClass:$rs2))),
- (!cast<Instruction>("PseudoVMV_X_S_" # vti.LMul.MX) $rs2, vti.Log2SEW)>;
+ (PseudoVMV_X_S
+ !if(!isa<GroupVTypeInfo>(vti),
+ (!cast<GroupVTypeInfo>(vti).VectorM1
+ (EXTRACT_SUBREG $rs2, sub_vrm1_0)),
+ (vti.Vector $rs2)),
+ vti.Log2SEW)>;
// vmv.s.x is handled with a custom node in RISCVInstrInfoVVLPatterns.td
}
@@ -7418,10 +7418,15 @@ foreach fvti = AllFloatVectors in {
(fvti.Scalar fvti.ScalarRegClass:$rs2),
GPR:$vl, fvti.Log2SEW)>;
+ defvar is_group = !isa<GroupVTypeInfo>(fvti);
+ defvar merge = !if(is_group,
+ (!cast<GroupVTypeInfo>(fvti).VectorM1
+ (EXTRACT_SUBREG $rs1, sub_vrm1_0)),
+ (fvti.Vector $rs1));
+ defvar vmv_s_x = (PseudoVMV_S_X merge, (XLenVT X0), GPR:$vl, fvti.Log2SEW);
def : Pat<(fvti.Vector (int_riscv_vfmv_s_f (fvti.Vector fvti.RegClass:$rs1),
(fvti.Scalar (fpimm0)), VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_S_X_" # fvti.LMul.MX)
- (fvti.Vector $rs1), (XLenVT X0), GPR:$vl, fvti.Log2SEW)>;
+ !if(is_group, (INSERT_SUBREG $rs1, vmv_s_x, sub_vrm1_0), vmv_s_x)>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index d92d3975d12f533..f510d3369dd5acc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2825,12 +2825,20 @@ foreach mti = AllMasks in {
// 16.4. Vector Register Gather Instruction
foreach vti = AllIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
+ defvar is_group = !isa<GroupVTypeInfo>(vti);
+ defvar merge = !if(is_group,
+ (!cast<GroupVTypeInfo>(vti).VectorM1
+ (EXTRACT_SUBREG $merge, sub_vrm1_0)),
+ (vti.Vector $merge));
+ defvar vmv_s_x = (PseudoVMV_S_X merge,
+ (vti.Scalar vti.ScalarRegClass:$rs1),
+ GPR:$vl, vti.Log2SEW);
def : Pat<(vti.Vector (riscv_vmv_s_x_vl (vti.Vector vti.RegClass:$merge),
vti.ScalarRegClass:$rs1,
VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_S_X_"#vti.LMul.MX)
- vti.RegClass:$merge,
- (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>;
+ !if(is_group, (INSERT_SUBREG $merge, vmv_s_x, sub_vrm1_0),
+ vmv_s_x)>;
+
def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2,
vti.RegClass:$rs1,
@@ -2881,16 +2889,25 @@ foreach vti = AllIntegerVectors in {
// 16.2. Floating-Point Scalar Move Instructions
foreach vti = AllFloatVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
+ defvar is_group = !isa<GroupVTypeInfo>(vti);
+ defvar merge = !if(is_group,
+ (!cast<GroupVTypeInfo>(vti).VectorM1
+ (EXTRACT_SUBREG $merge, sub_vrm1_0)),
+ (vti.Vector $merge));
+ defvar vmv_s_x_x0 = (PseudoVMV_S_X merge, (XLenVT X0), GPR:$vl, vti.Log2SEW);
def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
(vti.Scalar (fpimm0)),
VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_S_X_"#vti.LMul.MX)
- vti.RegClass:$merge, (XLenVT X0), GPR:$vl, vti.Log2SEW)>;
+ !if(is_group, (INSERT_SUBREG $merge, vmv_s_x_x0, sub_vrm1_0),
+ vmv_s_x_x0)>;
+
+ defvar vmv_s_x = (PseudoVMV_S_X merge, GPR:$imm, GPR:$vl, vti.Log2SEW);
def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
(vti.Scalar (SelectFPImm (XLenVT GPR:$imm))),
VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_S_X_"#vti.LMul.MX)
- vti.RegClass:$merge, GPR:$imm, GPR:$vl, vti.Log2SEW)>;
+ !if(is_group, (INSERT_SUBREG $merge, vmv_s_x, sub_vrm1_0),
+ vmv_s_x)>;
+
def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
vti.ScalarRegClass:$rs1,
VLOpFrag)),
diff --git a/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir b/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir
index 8fb4be6b49ed648..600084632ce68a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir
@@ -139,7 +139,7 @@ body: |
; CHECK-NEXT: renamable $v0 = VL1RE8_V killed $x10 :: (load unknown-size from %stack.1, align 8)
; CHECK-NEXT: $x10 = LD $x2, 8 :: (load (s64) from %stack.15)
; CHECK-NEXT: renamable $v0 = PseudoVSLIDEDOWN_VX_M1 undef renamable $v0, killed renamable $v0, killed renamable $x13, $noreg, 3 /* e8 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: renamable $x13 = PseudoVMV_X_S_M1 killed renamable $v0, 3 /* e8 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: renamable $x13 = PseudoVMV_X_S killed renamable $v0, 3 /* e8 */, implicit $vl, implicit $vtype
; CHECK-NEXT: BLT killed renamable $x16, renamable $x27, %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -206,7 +206,7 @@ body: |
renamable $x13 = nsw ADDI renamable $x16, -2
renamable $v0 = VL1RE8_V %stack.1 :: (load unknown-size from %stack.1, align 8)
renamable $v0 = PseudoVSLIDEDOWN_VX_M1 undef renamable $v0, killed renamable $v0, killed renamable $x13, $noreg, 3, 1, implicit $vl, implicit $vtype
- renamable $x13 = PseudoVMV_X_S_M1 killed renamable $v0, 3, implicit $vl, implicit $vtype
+ renamable $x13 = PseudoVMV_X_S killed renamable $v0, 3, implicit $vl, implicit $vtype
BLT killed renamable $x16, renamable $x27, %bb.2
bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index fd2f89e26e59809..d9fdec3041cb065 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -8,7 +8,7 @@
define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
; CHECK-LABEL: extractelt_nxv1i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 0
@@ -40,7 +40,7 @@ define signext i8 @extractelt_nxv1i8_idx(<vscale x 1 x i8> %v, i32 %idx) {
define signext i8 @extractelt_nxv2i8_0(<vscale x 2 x i8> %v) {
; CHECK-LABEL: extractelt_nxv2i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 0
@@ -72,7 +72,7 @@ define signext i8 @extractelt_nxv2i8_idx(<vscale x 2 x i8> %v, i32 %idx) {
define signext i8 @extractelt_nxv4i8_0(<vscale x 4 x i8> %v) {
; CHECK-LABEL: extractelt_nxv4i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 0
@@ -232,7 +232,7 @@ define signext i8 @extractelt_nxv64i8_idx(<vscale x 64 x i8> %v, i32 %idx) {
define signext i16 @extractelt_nxv1i16_0(<vscale x 1 x i16> %v) {
; CHECK-LABEL: extractelt_nxv1i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 0
@@ -264,7 +264,7 @@ define signext i16 @extractelt_nxv1i16_idx(<vscale x 1 x i16> %v, i32 %idx) {
define signext i16 @extractelt_nxv2i16_0(<vscale x 2 x i16> %v) {
; CHECK-LABEL: extractelt_nxv2i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 0
@@ -424,7 +424,7 @@ define signext i16 @extractelt_nxv32i16_idx(<vscale x 32 x i16> %v, i32 %idx) {
define i32 @extractelt_nxv1i32_0(<vscale x 1 x i32> %v) {
; CHECK-LABEL: extractelt_nxv1i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i32> %v, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index 34dcce3fe058bc9..c1a95ce1ae8143c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -7,7 +7,7 @@
define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
; CHECK-LABEL: extractelt_nxv1i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i8> %v, i32 0
@@ -39,7 +39,7 @@ define signext i8 @extractelt_nxv1i8_idx(<vscale x 1 x i8> %v, i32 zeroext %idx)
define signext i8 @extractelt_nxv2i8_0(<vscale x 2 x i8> %v) {
; CHECK-LABEL: extractelt_nxv2i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i8> %v, i32 0
@@ -71,7 +71,7 @@ define signext i8 @extractelt_nxv2i8_idx(<vscale x 2 x i8> %v, i32 zeroext %idx)
define signext i8 @extractelt_nxv4i8_0(<vscale x 4 x i8> %v) {
; CHECK-LABEL: extractelt_nxv4i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i8> %v, i32 0
@@ -231,7 +231,7 @@ define signext i8 @extractelt_nxv64i8_idx(<vscale x 64 x i8> %v, i32 zeroext %id
define signext i16 @extractelt_nxv1i16_0(<vscale x 1 x i16> %v) {
; CHECK-LABEL: extractelt_nxv1i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i16> %v, i32 0
@@ -263,7 +263,7 @@ define signext i16 @extractelt_nxv1i16_idx(<vscale x 1 x i16> %v, i32 zeroext %i
define signext i16 @extractelt_nxv2i16_0(<vscale x 2 x i16> %v) {
; CHECK-LABEL: extractelt_nxv2i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x i16> %v, i32 0
@@ -423,7 +423,7 @@ define signext i16 @extractelt_nxv32i16_idx(<vscale x 32 x i16> %v, i32 zeroext
define signext i32 @extractelt_nxv1i32_0(<vscale x 1 x i32> %v) {
; CHECK-LABEL: extractelt_nxv1i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x i32> %v, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
index 47edb9eecb00bc1..e3ee5b54acafb22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
@@ -32,13 +32,13 @@ define <32 x i1> @bitcast_v4i8_v32i1(<4 x i8> %a, <32 x i1> %b) {
define i8 @bitcast_v1i8_i8(<1 x i8> %a) {
; CHECK-LABEL: bitcast_v1i8_i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
;
; ELEN32-LABEL: bitcast_v1i8_i8:
; ELEN32: # %bb.0:
-; ELEN32-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; ELEN32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; ELEN32-NEXT: vmv.x.s a0, v8
; ELEN32-NEXT: ret
%b = bitcast <1 x i8> %a to i8
@@ -48,13 +48,13 @@ define i8 @bitcast_v1i8_i8(<1 x i8> %a) {
define i16 @bitcast_v2i8_i16(<2 x i8> %a) {
; CHECK-LABEL: bitcast_v2i8_i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
;
; ELEN32-LABEL: bitcast_v2i8_i16:
; ELEN32: # %bb.0:
-; ELEN32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ELEN32-NEXT: vmv.x.s a0, v8
; ELEN32-NEXT: ret
%b = bitcast <2 x i8> %a to i16
@@ -64,13 +64,13 @@ define i16 @bitcast_v2i8_i16(<2 x i8> %a) {
define i16 @bitcast_v1i16_i16(<1 x i16> %a) {
; CHECK-LABEL: bitcast_v1i16_i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
;
; ELEN32-LABEL: bitcast_v1i16_i16:
; ELEN32: # %bb.0:
-; ELEN32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ELEN32-NEXT: vmv.x.s a0, v8
; ELEN32-NEXT: ret
%b = bitcast <1 x i16> %a to i16
@@ -80,7 +80,7 @@ define i16 @bitcast_v1i16_i16(<1 x i16> %a) {
define i32 @bitcast_v4i8_i32(<4 x i8> %a) {
; CHECK-LABEL: bitcast_v4i8_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
;
@@ -96,7 +96,7 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a) {
define i32 @bitcast_v2i16_i32(<2 x i16> %a) {
; CHECK-LABEL: bitcast_v2i16_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
;
@@ -112,7 +112,7 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a) {
define i32 @bitcast_v1i32_i32(<1 x i32> %a) {
; CHECK-LABEL: bitcast_v1i32_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
;
@@ -433,13 +433,13 @@ define double @bitcast_v1i64_f64(<1 x i64> %a) {
define <1 x i16> @bitcast_i16_v1i16(i16 %a) {
; CHECK-LABEL: bitcast_i16_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
;
; ELEN32-LABEL: bitcast_i16_v1i16:
; ELEN32: # %bb.0:
-; ELEN32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ELEN32-NEXT: vmv.s.x v8, a0
; ELEN32-NEXT: ret
%b = bitcast i16 %a to <1 x i16>
@@ -449,7 +449,7 @@ define <1 x i16> @bitcast_i16_v1i16(i16 %a) {
define <2 x i16> @bitcast_i32_v2i16(i32 %a) {
; CHECK-LABEL: bitcast_i32_v2i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
;
@@ -465,7 +465,7 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a) {
define <1 x i32> @bitcast_i32_v1i32(i32 %a) {
; CHECK-LABEL: bitcast_i32_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index 9d689c732d7999f..c71b2e687e5dff0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -106,7 +106,7 @@ define i1 @extractelt_v16i1(ptr %x, i64 %idx) nounwind {
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vle8.v v8, (a0)
; RV32-NEXT: vmseq.vi v8, v8, 0
-; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: srl a0, a0, a1
; RV32-NEXT: andi a0, a0, 1
@@ -117,7 +117,7 @@ define i1 @extractelt_v16i1(ptr %x, i64 %idx) nounwind {
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64-NEXT: vle8.v v8, (a0)
; RV64-NEXT: vmseq.vi v8, v8, 0
-; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: srl a0, a0, a1
; RV64-NEXT: andi a0, a0, 1
@@ -128,7 +128,7 @@ define i1 @extractelt_v16i1(ptr %x, i64 %idx) nounwind {
; RV32ZBS-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32ZBS-NEXT: vle8.v v8, (a0)
; RV32ZBS-NEXT: vmseq.vi v8, v8, 0
-; RV32ZBS-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32ZBS-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV32ZBS-NEXT: vmv.x.s a0, v8
; RV32ZBS-NEXT: bext a0, a0, a1
; RV32ZB...
[truncated]
V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
(V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF ||
(V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG &&
isImplicitDef(V.getOperand(0)))); |
This is needed for the implicit_def -> noregister peephole: Without it we lose some CSE when we have (vmv_s_x IMPLICIT_DEF, ...) and (vmv_s_x (extract_subreg IMPLICIT_DEF, ...), ...)
Force-pushed from 422fbee to 5e6c2d6
Thanks! I never thought it would be so easy.
A bit of context I should probably mention: it's easy to do now because #69788 has landed. Prior to that, removing the LMUL information from the pseudos resulted in a lot of extra vsetvli toggles.
Force-pushed from 5e6c2d6 to 18374ef
LGTM with nits. And I need more eyes on the test diffs since they are large.
; V512-NEXT: vmv.s.x v10, a0
; V512-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma |
Here's a case where a backwards pass of #69788 would help.
We can also remove the combine that reduces LMUL, since we've removed the LMUL variants in llvm#71501.
Ping for another set of eyes over the test diffs
@@ -8,7 +8,7 @@
define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
; CHECK-LABEL: extractelt_nxv1i8_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma |
Is there agreement that m1 compared to mf8 is not a regression? Some microarchitectures have the same behavior for M1 and fractional LMUL. I don't know whether there are any microarchitectures that can optimize the fractional LMUL cases.
We already increase LMUL for vmv.s.x and vmv.x.s in RISCVInsertVSETVLI, albeit to avoid a vsetvli. Perhaps that already sets a precedent.
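To make that precedent concrete, here is a minimal C++ sketch of the demanded-fields idea behind it. This is not the actual RISCVInsertVSETVLI code; the struct layout and function name are assumptions for illustration only.

// Sketch only: scalar moves (vmv.s.x / vmv.x.s) demand SEW but not LMUL, so
// the pass may reuse whatever vtype (e.g. m1 instead of mf8) is already live.
struct DemandedFields {
  bool VLAny = true;        // instruction cares about the exact VL
  bool LMUL = true;         // instruction cares about the register group size
  bool SEWLMULRatio = true; // instruction cares about the SEW/LMUL ratio
  bool SEW = true;          // instruction cares about the element width
};

DemandedFields demandedForScalarMove(bool IsExtract /* vmv.x.s */) {
  DemandedFields D;
  D.LMUL = false;         // any LMUL works, so no extra vsetvli toggle is needed
  D.SEWLMULRatio = false; // the ratio only matters when VL is meaningful
  if (IsExtract)
    D.VLAny = false;      // vmv.x.s ignores VL entirely
  return D;
}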
The vmv.x.s instruction doesn't really use the VL or the LMUL. It doesn't require the vector register to be naturally aligned to the LMUL. It only uses SEW to know how many bits to extract and where to sign extend in the scalar register. This makes it different from other instructions, so it's hard to know for sure.
I don't know of any reason to think increasing to m1 here will have any effect on known processors, but there's a lot of uncertainty here. I will note that defaulting to smallest-legal-fractional-for-sew for this instruction in vsetvli insertion probably wouldn't be too painful, so we can likely reverse this easily if needed. (Luke, might be worth playing with that just to reduce the diff.)
If there was another vector instruction in the sequence that needed an LMUL other than m1, then that LMUL would have been used instead of m1, since vmv.s.x and vmv.x.s have no impact on LMUL anymore. So I am not worried about the possibility of an extra vsetvli instruction being inserted.
I also don't know of any hardware that has differing performance on fractional LMUL compared to m1. For that reason, I think we could probably hold off on defaulting to the smallest-legal-fractional-for-sew in vsetvli insertion until there was a use case that made it worthwhile.
IMO what we have here is okay.
I tried out defaulting to mf8 for vmv.s.x and vmv.x.s here lukel97@59bd222, and then rebasing this branch on top of it here 59bd222...lukel97:llvm-project:vmv.s.x-mf8
Doing it this way moves most of the test diffs into that prior commit, but we still see the same missing vsetvli transforms etc. in it
Do we still compile this with a single vsetvli after this change https://godbolt.org/z/bvGd46zWa
Just checked, there's no change
@@ -7405,7 +7405,7 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, tu, ma
; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma |
Where did m2 come from?
My guess is that it tried to touch up the SEW/LMUL ratio so it's the same as bb.0. But I'm not sure why it wasn't able to emit the x0, x0 form here; it looks like it should be OK to use it.
@@ -447,9 +447,9 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m)
; RV64-SLOW-NEXT: .LBB7_2: # %else2
; RV64-SLOW-NEXT: ret
; RV64-SLOW-NEXT: .LBB7_3: # %cond.store
; RV64-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; RV64-SLOW-NEXT: vsetivli zero, 1, e32, m1, ta, ma |
Any fix for this?
#72352 should fix this
What would it take to remove the INSERT_SUBREG/EXTRACT_SUBREG from tblgen? Can we avoid creating the nodes with non-LMUL types in the first place?
I think we might be able to wrap all the places where we emit VMV_S_X_VL/VMV_X_S_VL in RISCVISelLowering with INSERT_SUBVECTOR/EXTRACT_SUBVECTORs instead. I'll give that a try.
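For context, here is a rough C++ sketch of what that wrapping could look like on the VMV_X_S side. It is a hypothetical fragment in the style of RISCVISelLowering.cpp, not the actual patch, and getLMUL1VT is assumed to be a helper that returns the LMUL=1 type with the same element type.

// Sketch: narrow an LMUL>1 source to an LMUL=1 container with
// EXTRACT_SUBVECTOR before emitting VMV_X_S, so the single LMUL-agnostic
// pseudo (register class VR) can be selected without subregister patterns.
static SDValue lowerScalarExtract(SDValue Vec, MVT XLenVT, const SDLoc &DL,
                                  SelectionDAG &DAG) {
  MVT ContainerVT = Vec.getSimpleValueType();
  MVT M1VT = getLMUL1VT(ContainerVT); // assumed helper
  if (ContainerVT.bitsGT(M1VT))
    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, Vec,
                      DAG.getVectorIdxConstant(0, DL));
  return DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Vec);
}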
Force-pushed from a1dad85 to d390274
One concerning possible regression, otherwise looks straightforward.
@@ -534,7 +534,7 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
; RV64-SLOW: # %bb.0:
; RV64-SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-SLOW-NEXT: vmseq.vi v8, v8, 0
; RV64-SLOW-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
; RV64-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma |
This looks like a missing vsetvli insertion optimization.
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma |
Another missing vsetvli transform.
@@ -2760,15 +2760,18 @@ foreach mti = AllMasks in {

// 16.1. Integer Scalar Move Instructions
// 16.4. Vector Register Gather Instruction
Move this comment down to above the new foreach.
let Predicates = GetVTypePredicates<fvti>.Predicates in {
def : Pat<(fvti.Vector (int_riscv_vfmv_s_f (fvti.Vector fvti.RegClass:$rs1),
(fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)),
(!cast<Instruction>("PseudoVFMV_S_"#fvti.ScalarSuffix#"_" # |
I think you can leave these as patterns and just drop the MX suffix? Would be more obviously NFC.
What's the advantage of having a dead pattern? Or are you suggesting to not remap int_riscv_vfmv_s_f to RISCVISD::VFMV_S_F_VL during lowering?
@@ -8,7 +8,7 @@
define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
; CHECK-LABEL: extractelt_nxv1i8_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma |
This was what I should have sent, the integer version https://godbolt.org/z/f9bzqz6aq, but it already has a vsetvli toggle. :(
@@ -1921,24 +1921,28 @@ define void @mscatter_v8i32(<8 x i32> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
; RV64ZVE32F-NEXT: .LBB28_13: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma |
Is this a regression here?
Yeah, I meant to flag this earlier, sorry. These were introduced with the last commit to avoid extract_subreg/insert_subreg: d390274cf8295c5e40ae06d7308ceace0c1ee82b
It's because this combine no longer kicks in now that the vmv_x_s has its operand wrapped in an extract_subvector:
llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Lines 15172 to 15190 in 51c351f
  if (Val.getOpcode() == RISCVISD::VMV_X_S ||
      (DCI.isAfterLegalizeDAG() &&
       Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       isNullConstant(Val.getOperand(1)))) {
    SDValue Src = Val.getOperand(0);
    MVT VecVT = Src.getSimpleValueType();
    // VecVT should be scalable and memory VT should match the element type.
    if (VecVT.isScalableVector() &&
        MemVT == VecVT.getVectorElementType()) {
      SDLoc DL(N);
      MVT MaskVT = getMaskTypeFor(VecVT);
      return DAG.getStoreVP(
          Store->getChain(), DL, Src, Store->getBasePtr(), Store->getOffset(),
          DAG.getConstant(1, DL, MaskVT),
          DAG.getConstant(1, DL, Subtarget.getXLenVT()), MemVT,
          Store->getMemOperand(), Store->getAddressingMode(),
          Store->isTruncatingStore(), /*IsCompress*/ false);
    }
  }
I could add a check in here like
SDValue Src = Val.getOperand(0);
if (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR && isNullConstant(Src.getOperand(1)))
Src = Src.getOperand(0);
but then we actually end up increasing LMUL in some cases, e.g.
define void @store_vfmv_f_s_nxv8f64(<vscale x 8 x double>* %x, double* %p) {
; CHECK-LABEL: store_vfmv_f_s_nxv8f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
%a = load <vscale x 8 x double>, <vscale x 8 x double>* %x
%b = call double @llvm.riscv.vfmv.f.s.nxv8f64(<vscale x 8 x double> %a)
store double %b, double* %p
ret void
}
This extra vsetvli actually decreases LMUL in this example here: Does this have any effect on performance on vse32e if the VL is the same?
cc @preames @topperc
I don't think this regression is particularly concerning. Given there can be costs associated with a higher LMUL on an operation, it's not even clear to me that having the extra vsetvli here is actually a regression in practice. I don't think this is worth trying to change at this time.
Rebased now that #74040 has landed, test diffs should be smaller now. The one remaining regression is where we lose the x0, x0 form when a vmv.s.x/vmv.x.s is the first instruction in the basic block: https://github.com/llvm/llvm-project/pull/71501/files#diff-10cadf020a900ddef6073743714540c0dd895bb1eabc9555c9fe6a0004f9d15bL358-L361 I think the correct fix for this is to perform SEW/LMUL ratio adjustment in the backwards pass (or generally improve how we emit the x0, x0 form). But this doesn't seem to be a common occurrence, and I would prefer to leave it to a follow-up patch.
These test cases were intended to get a single vsetvli by using the vmv.s.x intrinsic with the same LMUL as the reduction. This works for FP, but does not work for integer. I believe llvm#71501 will break this for FP too. Hopefully the vsetvli pass can be taught to fix this.
I happened to talk with Craig offline yesterday and we discussed this patch a bit. Craig had previously raised a question above which appears to have gotten lost in the review discussion. He posted #75544 to land the test cases in tree, and this patch needs to be rebased over that one and over the fix for the issue flagged in my comment, as I believe that should resolve some diffs in this one. I'd also like to request that you split this patch into the integer subset and the floating point subset. The original patch was integer only, and some of the confusion on the review is caused by the addition of the floating point without it being clearly called out.
Currently vfmv.s.f intrinsics are directly selected to their pseudos via a tablegen pattern in RISCVInstrInfoVPseudos.td, whereas the other move instructions (vmv.s.x/vmv.v.x/vmv.v.f etc.) first get lowered to their corresponding VL SDNode, then get selected from a pattern in RISCVInstrInfoVVLPatterns.td This patch brings vfmv.s.f inline with the other move instructions, and shows how the LMUL reducing combine for VFMV_S_F_VL and VMV_S_X_VL results in the discrepancy in the test added in llvm#75544. Split out from llvm#71501, where we did this to preserve the behaviour of selecting vmv_s_x for VFMV_S_F_VL for small enough immediates.
Currently vfmv.s.f intrinsics are directly selected to their pseudos via a tablegen pattern in RISCVInstrInfoVPseudos.td, whereas the other move instructions (vmv.s.x/vmv.v.x/vmv.v.f etc.) first get lowered to their corresponding VL SDNode, then get selected from a pattern in RISCVInstrInfoVVLPatterns.td This patch brings vfmv.s.f inline with the other move instructions. Split out from #71501, where we did this to preserve the behaviour of selecting vmv_s_x for VFMV_S_F_VL for small enough immediates.
vmv.s.x and vmv.x.s ignore LMUL, so we can replace the PseudoVMV_S_X_MX and PseudoVMV_X_S_MX with just one pseudo each. These pseudos use the VR register class (just like the actual instruction), so for the tablegen patterns we need to wrap LMUL>1 in subregister inserts/extracts. The test diff is due to the fact that a PseudoVMV_S_X/PseudoVMV_X_S no longer carries any information about LMUL, so if it's the only vector pseudo instruction in a block then it now defaults to LMUL=1.
Note that we have to remove the pseudo patterns for the int_riscv_vfmv_s_f intrinsics and lower them instead to VFMV_S_F_VL nodes in ISelLowering, since they may be selected as PseudoVMV_S_X when the scalar is zero.
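As a sketch of what that lowering might look like, here is a hypothetical C++ fragment in the style of RISCVISelLowering.cpp; it is not the actual patch, and the intrinsic operand order (passthru, scalar, VL after the intrinsic ID) is an assumption.

// Sketch: lower the int_riscv_vfmv_s_f intrinsic to a VFMV_S_F_VL node
// instead of selecting a pseudo directly from TableGen, so later combines
// (e.g. using vmv.s.x when the scalar is zero) still apply.
static SDValue lowerVFMVSFIntrinsic(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue Passthru = Op.getOperand(1); // operand 0 is the intrinsic ID
  SDValue Scalar = Op.getOperand(2);
  SDValue VL = Op.getOperand(3);
  return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, Op.getSimpleValueType(),
                     Passthru, Scalar, VL);
}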
Force-pushed from b693b4f to 1a7a4f9
Rebased after #76699
LGTM
Currently vfmv.s.f intrinsics are directly selected to their pseudos via a tablegen pattern in RISCVInstrInfoVPseudos.td, whereas the other move instructions (vmv.s.x/vmv.v.x/vmv.v.f etc.) first get lowered to their corresponding VL SDNode, then get selected from a pattern in RISCVInstrInfoVVLPatterns.td This patch brings vfmv.s.f inline with the other move instructions. Split out from llvm#71501, where we did this to preserve the behaviour of selecting vmv_s_x for VFMV_S_F_VL for small enough immediates.
vmv.s.x and vmv.x.s ignore LMUL, so we can replace the PseudoVMV_S_X_MX and PseudoVMV_X_S_MX with just one pseudo each. These pseudos use the VR register class (just like the actual instruction), so we now only have TableGen patterns for vectors of LMUL <= 1. We now rely on the existing combines that shrink LMUL down to 1 for vmv_s_x_vl (and vfmv_s_f_vl). We could look into removing these combines later and just inserting the nodes with the correct type in a later patch. The test diff is due to the fact that a PseudoVMV_S_X/PseudoVMV_X_S no longer carries any information about LMUL, so if it's the only vector pseudo instruction in a block then it now defaults to LMUL=1.
In llvm#71501 we removed the LMUL variants for vmv.s.x and vmv.x.s because they ignore register groups, so this patch does the same for their floating point equivalents. We don't need to add any extra patterns for extractelt in RISCVInstrInfoVSDPatterns.td because in lowerEXTRACT_VECTOR_ELT we make sure that the node is narrowed down to LMUL 1.
In #71501 we removed the LMUL variants for vmv.s.x and vmv.x.s because they ignore register groups, so this patch does the same for their floating point equivalents. We don't need to add any extra patterns for extractelt in RISCVInstrInfoVSDPatterns.td because in lowerEXTRACT_VECTOR_ELT we make sure that the node is narrowed down to LMUL 1.
In llvm#71501 we removed the LMUL variants for vmv.s.x and vmv.x.s because they ignore register groups, so this patch does the same for their floating point equivalents. We don't need to add any extra patterns for extractelt in RISCVInstrInfoVSDPatterns.td because in lowerEXTRACT_VECTOR_ELT we make sure that the node is narrowed down to LMUL 1.
vmv.s.x and vmv.x.s ignore LMUL, so we can replace the PseudoVMV_S_X_MX and
PseudoVMV_X_S_MX with just one pseudo each. These pseudos use the VR register
class (just like the actual instruction), so we now only have TableGen patterns for vectors of LMUL <= 1.
We now rely on the existing combines that shrink LMUL down to 1 for vmv_s_x_vl (and vfmv_s_f_vl). We could look into removing these combines later and just inserting the nodes with the correct type in a later patch.
The test diff is due to the fact that a PseudoVMV_S_X/PseudoVMV_X_S no longer
carries any information about LMUL, so if it's the only vector pseudo
instruction in a block then it now defaults to LMUL=1.