From 62019b2e9bd931a73f8007c111d10ac3ac7d6c3b Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Fri, 16 Jun 2023 16:55:37 +0100 Subject: [PATCH] riscv64: Implement `iadd_pairwise` (#6568) * riscv64: Add Mov and VSlideUp * riscv64: Implement `iadd_pairwise` * riscv64: Use `late_use` in `VecAluRRRImm5` * machinst: Add `OperandCollector::reg_fixed_late_use` --- build.rs | 3 - cranelift/codegen/src/isa/riscv64/inst.isle | 9 + .../codegen/src/isa/riscv64/inst/emit.rs | 89 ++++--- .../codegen/src/isa/riscv64/inst/encode.rs | 25 +- cranelift/codegen/src/isa/riscv64/inst/mod.rs | 75 +++++- .../codegen/src/isa/riscv64/inst/vector.rs | 65 ++++- .../codegen/src/isa/riscv64/inst_vector.isle | 45 ++++ cranelift/codegen/src/isa/riscv64/lower.isle | 28 +++ cranelift/codegen/src/machinst/reg.rs | 18 +- .../isa/riscv64/simd-iadd_pairwise.clif | 222 ++++++++++++++++++ .../runtests/simd-iaddpairwise-64bit.clif | 1 + .../filetests/runtests/simd-iaddpairwise.clif | 1 + .../simd-wideningpairwisedotproducts.clif | 1 + 13 files changed, 534 insertions(+), 48 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif diff --git a/build.rs b/build.rs index 9cb05298cd64..28dceb3ee1f6 100644 --- a/build.rs +++ b/build.rs @@ -248,12 +248,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_f64x2_rounding", "simd_i16x8_arith2", "simd_i16x8_cmp", - "simd_i16x8_extadd_pairwise_i8x16", "simd_i16x8_q15mulr_sat_s", "simd_i32x4_arith2", "simd_i32x4_cmp", - "simd_i32x4_dot_i16x8", - "simd_i32x4_extadd_pairwise_i16x8", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", "simd_i64x2_arith2", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index f0a79b9d5dbb..a02145caa417 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -324,6 +324,15 @@ (probe_count u32) (tmp WritableReg)) + (VecAluRRRImm5 + (op VecAluOpRRRImm5) + (vd WritableReg) + (vd_src Reg) + (vs2 Reg) + (imm Imm5) + (mask VecOpMasking) + (vstate VState)) + (VecAluRRR (op VecAluOpRRR) (vd WritableReg) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index 53d86c5bdf77..bcbf85f7a80e 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -464,13 +464,18 @@ impl Inst { | Inst::Cltz { .. } | Inst::Brev8 { .. } | Inst::StackProbeLoop { .. } => None, + // VecSetState does not expect any vstate, rather it updates it. Inst::VecSetState { .. } => None, + // `vmv` instructions copy a set of registers and ignore vstate. + Inst::VecAluRRImm5 { op: VecAluOpRRImm5::VmvrV, .. } => None, + Inst::VecAluRR { vstate, .. } | Inst::VecAluRRR { vstate, .. } | Inst::VecAluRImm5 { vstate, .. } | Inst::VecAluRRImm5 { vstate, .. } | + Inst::VecAluRRRImm5 { vstate, .. } | // TODO: Unit-stride loads and stores only need the AVL to be correct, not // the full vtype. A future optimization could be to decouple these two when // updating vstate. 
This would allow us to avoid emitting a VecSetState in some cases.
@@ -951,34 +956,44 @@ impl MachInstEmit for Inst {
             }
 
             &Inst::Mov { rd, rm, ty } => {
-                debug_assert_ne!(rd.to_reg().class(), RegClass::Vector);
-                debug_assert_ne!(rm.class(), RegClass::Vector);
-                if rd.to_reg() != rm {
-                    let rm = allocs.next(rm);
-                    let rd = allocs.next_writable(rd);
-                    if ty.is_float() {
-                        Inst::FpuRRR {
-                            alu_op: if ty == F32 {
-                                FpuOPRRR::FsgnjS
-                            } else {
-                                FpuOPRRR::FsgnjD
-                            },
-                            frm: None,
-                            rd: rd,
-                            rs1: rm,
-                            rs2: rm,
-                        }
-                        .emit(&[], sink, emit_info, state);
-                    } else {
-                        let x = Inst::AluRRImm12 {
-                            alu_op: AluOPRRI::Ori,
-                            rd: rd,
-                            rs: rm,
-                            imm12: Imm12::zero(),
-                        };
-                        x.emit(&[], sink, emit_info, state);
-                    }
+                debug_assert_eq!(rd.to_reg().class(), rm.class());
+                if rd.to_reg() == rm {
+                    return;
+                }
+
+                let rm = allocs.next(rm);
+                let rd = allocs.next_writable(rd);
+
+                match rm.class() {
+                    RegClass::Int => Inst::AluRRImm12 {
+                        alu_op: AluOPRRI::Ori,
+                        rd: rd,
+                        rs: rm,
+                        imm12: Imm12::zero(),
+                    },
+                    RegClass::Float => Inst::FpuRRR {
+                        alu_op: if ty == F32 {
+                            FpuOPRRR::FsgnjS
+                        } else {
+                            FpuOPRRR::FsgnjD
+                        },
+                        frm: None,
+                        rd: rd,
+                        rs1: rm,
+                        rs2: rm,
+                    },
+                    RegClass::Vector => Inst::VecAluRRImm5 {
+                        op: VecAluOpRRImm5::VmvrV,
+                        vd: rd,
+                        vs2: rm,
+                        // Imm 0 means copy 1 register.
+                        imm: Imm5::maybe_from_i8(0).unwrap(),
+                        mask: VecOpMasking::Disabled,
+                        // Vstate for this instruction is ignored.
+                        vstate: VState::from_type(ty),
+                    },
                 }
+                .emit(&[], sink, emit_info, state);
             }
 
             &Inst::MovFromPReg { rd, rm } => {
@@ -2827,6 +2842,24 @@ impl MachInstEmit for Inst {
                 .emit(&[], sink, emit_info, state);
                 sink.bind_label(label_done, &mut state.ctrl_plane);
             }
+            &Inst::VecAluRRRImm5 {
+                op,
+                vd,
+                vd_src,
+                imm,
+                vs2,
+                ref mask,
+                ..
+            } => {
+                let vs2 = allocs.next(vs2);
+                let vd_src = allocs.next(vd_src);
+                let vd = allocs.next_writable(vd);
+                let mask = mask.with_allocs(&mut allocs);
+
+                debug_assert_eq!(vd.to_reg(), vd_src);
+
+                sink.put4(encode_valu_rrr_imm(op, vd, imm, vs2, mask));
+            }
             &Inst::VecAluRRR {
                 op,
                 vd,
@@ -2854,7 +2887,7 @@ impl MachInstEmit for Inst {
                 let vd = allocs.next_writable(vd);
                 let mask = mask.with_allocs(&mut allocs);
 
-                sink.put4(encode_valu_imm(op, vd, imm, vs2, mask));
+                sink.put4(encode_valu_rr_imm(op, vd, imm, vs2, mask));
             }
             &Inst::VecAluRR {
                 op,
diff --git a/cranelift/codegen/src/isa/riscv64/inst/encode.rs b/cranelift/codegen/src/isa/riscv64/inst/encode.rs
index e52d05aa48a1..bf7c469901ff 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/encode.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/encode.rs
@@ -9,8 +9,8 @@ use super::{Imm12, Imm5, UImm5, VType};
 use crate::isa::riscv64::inst::reg_to_gpr_num;
 use crate::isa::riscv64::lower::isle::generated_code::{
-    VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecElementWidth, VecOpCategory,
-    VecOpMasking,
+    VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAluOpRRRImm5, VecElementWidth,
+    VecOpCategory, VecOpMasking,
 };
 use crate::machinst::isle::WritableReg;
 use crate::Reg;
 
@@ -127,7 +127,7 @@ pub fn encode_valu(
 /// - funct6 (6 bits)
 ///
 /// See: https://github.com/riscv/riscv-v-spec/blob/master/valu-format.adoc
-pub fn encode_valu_imm(
+pub fn encode_valu_rr_imm(
     op: VecAluOpRRImm5,
     vd: WritableReg,
     imm: Imm5,
@@ -146,6 +146,25 @@ pub fn encode_valu_imm(
     )
 }
 
+pub fn encode_valu_rrr_imm(
+    op: VecAluOpRRRImm5,
+    vd: WritableReg,
+    imm: Imm5,
+    vs2: Reg,
+    masking: VecOpMasking,
+) -> u32 {
+    let funct7 = (op.funct6() << 1) | masking.encode();
+    let imm = imm.bits() as u32;
+    encode_r_type_bits(
+        op.opcode(),
reg_to_gpr_num(vd.to_reg()), + op.funct3(), + imm, + reg_to_gpr_num(vs2), + funct7, + ) +} + pub fn encode_valu_rr(op: VecAluOpRR, vd: WritableReg, vs: Reg, masking: VecOpMasking) -> u32 { let funct7 = (op.funct6() << 1) | masking.encode(); diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 2f37b57ae513..34933c3c5930 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -352,6 +352,17 @@ fn vec_mask_operands VReg>( VecOpMasking::Disabled => {} } } +fn vec_mask_late_operands VReg>( + mask: &VecOpMasking, + collector: &mut OperandCollector<'_, F>, +) { + match mask { + VecOpMasking::Enabled { reg } => { + collector.reg_fixed_late_use(*reg, pv_reg(0).into()); + } + VecOpMasking::Disabled => {} + } +} fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) { match inst { @@ -642,6 +653,32 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan // gen_prologue is called at emit stage. // no need let reg alloc know. } + &Inst::VecAluRRRImm5 { + op, + vd, + vd_src, + vs2, + ref mask, + .. + } => { + debug_assert_eq!(vd_src.class(), RegClass::Vector); + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + + // If the operation forbids source/destination overlap we need to + // ensure that the source and destination registers are different. + if op.forbids_src_dst_overlaps() { + collector.reg_late_use(vs2); + collector.reg_use(vd_src); + collector.reg_reuse_def(vd, 1); // `vd` == `vd_src`. + vec_mask_late_operands(mask, collector); + } else { + collector.reg_use(vs2); + collector.reg_use(vd_src); + collector.reg_reuse_def(vd, 1); // `vd` == `vd_src`. + vec_mask_operands(mask, collector); + } + } &Inst::VecAluRRR { op, vd, @@ -1614,14 +1651,15 @@ impl Inst { &MInst::Mov { rd, rm, ty } => { let rd = format_reg(rd.to_reg(), allocs); let rm = format_reg(rm, allocs); - let v = if ty == F32 { - "fmv.s" - } else if ty == F64 { - "fmv.d" - } else { - "mv" + + let op = match ty { + F32 => "fmv.s", + F64 => "fmv.d", + ty if ty.is_vector() => "vmv1r.v", + _ => "mv", }; - format!("{} {},{}", v, rd, rm) + + format!("{op} {rd},{rm}") } &MInst::MovFromPReg { rd, rm } => { let rd = format_reg(rd.to_reg(), allocs); @@ -1654,6 +1692,29 @@ impl Inst { &MInst::Udf { trap_code } => format!("udf##trap_code={}", trap_code), &MInst::EBreak {} => String::from("ebreak"), &MInst::ECall {} => String::from("ecall"), + &Inst::VecAluRRRImm5 { + op, + vd, + imm, + vs2, + ref mask, + ref vstate, + .. + } => { + let vs2_s = format_reg(vs2, allocs); + let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); + + // Some opcodes interpret the immediate as unsigned, lets show the + // correct number here. 
+ let imm_s = if op.imm_is_unsigned() { + format!("{}", imm.bits()) + } else { + format!("{}", imm) + }; + + format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}") + } &Inst::VecAluRRR { op, vd, diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 8114d78398ae..08992e83f9ea 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -1,8 +1,8 @@ use crate::isa::riscv64::inst::AllocationConsumer; use crate::isa::riscv64::inst::EmitState; use crate::isa::riscv64::lower::isle::generated_code::{ - VecAMode, VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAvl, VecElementWidth, - VecLmul, VecMaskMode, VecOpCategory, VecOpMasking, VecTailMode, + VecAMode, VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAluOpRRRImm5, VecAvl, + VecElementWidth, VecLmul, VecMaskMode, VecOpCategory, VecOpMasking, VecTailMode, }; use crate::machinst::RegClass; use crate::Reg; @@ -252,6 +252,51 @@ impl VecOpMasking { } } +impl VecAluOpRRRImm5 { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRRRImm5::VslideupVI => 0b001110, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRRImm5::VslideupVI => VecOpCategory::OPIVI, + } + } + + pub fn imm_is_unsigned(&self) -> bool { + match self { + VecAluOpRRRImm5::VslideupVI => true, + } + } + + /// Some instructions do not allow the source and destination registers to overlap. + pub fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRRImm5::VslideupVI => true, + } + } +} + +impl fmt::Display for VecAluOpRRRImm5 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut s = format!("{self:?}"); + s.make_ascii_lowercase(); + let (opcode, category) = s.split_at(s.len() - 2); + f.write_str(&format!("{opcode}.{category}")) + } +} + impl VecAluOpRRR { pub fn opcode(&self) -> u32 { // Vector Opcode @@ -287,7 +332,10 @@ impl VecAluOpRRR { VecAluOpRRR::VmaxVV | VecAluOpRRR::VmaxVX => 0b000111, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, - VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 0b010111, + VecAluOpRRR::VmergeVVM + | VecAluOpRRR::VmergeVXM + | VecAluOpRRR::VfmergeVFM + | VecAluOpRRR::VcompressVM => 0b010111, VecAluOpRRR::VfdivVV | VecAluOpRRR::VfdivVF | VecAluOpRRR::VsadduVV @@ -341,7 +389,8 @@ impl VecAluOpRRR { | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV | VecAluOpRRR::VredmaxuVS - | VecAluOpRRR::VredminuVS => VecOpCategory::OPMVV, + | VecAluOpRRR::VredminuVS + | VecAluOpRRR::VcompressVM => VecOpCategory::OPMVV, VecAluOpRRR::VwaddVX | VecAluOpRRR::VwadduVX | VecAluOpRRR::VwadduWX @@ -401,6 +450,7 @@ impl VecAluOpRRR { match self { VecAluOpRRR::VrgatherVV | VecAluOpRRR::VrgatherVX + | VecAluOpRRR::VcompressVM | VecAluOpRRR::VwadduVV | VecAluOpRRR::VwadduVX | VecAluOpRRR::VwaddVV @@ -461,6 +511,7 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VsadduVI => 0b100000, VecAluOpRRImm5::VsaddVI => 0b100001, VecAluOpRRImm5::VrgatherVI => 0b001100, + VecAluOpRRImm5::VmvrV => 0b100111, } } @@ -478,7 +529,8 @@ impl VecAluOpRRImm5 { | VecAluOpRRImm5::VmergeVIM | VecAluOpRRImm5::VsadduVI | VecAluOpRRImm5::VsaddVI - | VecAluOpRRImm5::VrgatherVI => VecOpCategory::OPIVI, + | VecAluOpRRImm5::VrgatherVI + | VecAluOpRRImm5::VmvrV => 
VecOpCategory::OPIVI, } } @@ -488,7 +540,8 @@ impl VecAluOpRRImm5 { | VecAluOpRRImm5::VsrlVI | VecAluOpRRImm5::VsraVI | VecAluOpRRImm5::VslidedownVI - | VecAluOpRRImm5::VrgatherVI => true, + | VecAluOpRRImm5::VrgatherVI + | VecAluOpRRImm5::VmvrV => true, VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VandVI diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index b0fc2d98c0fb..9cb076670b78 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -126,6 +126,7 @@ (VredmaxuVS) (VredminuVS) (VrgatherVV) + (VcompressVM) ;; Vector-Scalar Opcodes (VaddVX) @@ -166,6 +167,13 @@ (VmsltVX) )) + + +;; Register-Imm ALU Ops that modify the destination register +(type VecAluOpRRRImm5 (enum + (VslideupVI) +)) + ;; Register-Imm ALU Ops (type VecAluOpRRImm5 (enum ;; Regular VI Opcodes @@ -182,6 +190,9 @@ (VslidedownVI) (VmergeVIM) (VrgatherVI) + ;; This opcode represents multiple instructions `vmv1r`/`vmv2r`/`vmv4r`/etc... + ;; The immediate field specifies how many registers should be copied. + (VmvrV) )) ;; Imm only ALU Ops @@ -276,6 +287,21 @@ ;; of the usual RISC-V register order. ;; See Section 10.1 of the RISC-V Vector Extension Specification. + +;; Helper for emitting `MInst.VecAluRRRImm5` instructions. +;; These instructions modify the destination register. +(decl vec_alu_rrr_imm5 (VecAluOpRRRImm5 VReg VReg Imm5 VecOpMasking VState) VReg) +(rule (vec_alu_rrr_imm5 op vd_src vs2 imm mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecAluRRRImm5 op vd vd_src vs2 imm mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRRRImm5` instructions where the immediate +;; is zero extended instead of sign extended. +(decl vec_alu_rrr_uimm5 (VecAluOpRRRImm5 VReg VReg UImm5 VecOpMasking VState) VReg) +(rule (vec_alu_rrr_uimm5 op vd_src vs2 imm mask vstate) + (vec_alu_rrr_imm5 op vd_src vs2 (uimm5_bitcast_to_imm5 imm) mask vstate)) + ;; Helper for emitting `MInst.VecAluRRR` instructions. (decl vec_alu_rrr (VecAluOpRRR Reg Reg VecOpMasking VState) Reg) (rule (vec_alu_rrr op vs2 vs1 mask vstate) @@ -758,6 +784,14 @@ (rule (rv_vslidedown_vi vs2 imm mask vstate) (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm mask vstate)) +;; Helper for emitting the `vslideup.vi` instruction. +;; Unlike other `vi` instructions the immediate is zero extended. +;; This is implemented as a 2 source operand instruction, since it only +;; partially modifies the destination register. +(decl rv_vslideup_vvi (VReg VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vslideup_vvi vd vs2 imm mask vstate) + (vec_alu_rrr_uimm5 (VecAluOpRRRImm5.VslideupVI) vd vs2 imm mask vstate)) + ;; Helper for emitting the `vmv.x.s` instruction. ;; This instruction copies the first element of the source vector to the destination X register. ;; Masked versions of this instuction are not supported. @@ -878,6 +912,17 @@ (rule (rv_vrgather_vi vs2 imm mask vstate) (vec_alu_rr_uimm5 (VecAluOpRRImm5.VrgatherVI) vs2 imm mask vstate)) +;; Helper for emitting the `vcompress.vm` instruction. +;; +;; The vector compress instruction allows elements selected by a vector mask +;; register from a source vector register group to be packed into contiguous +;; elements at the start of the destination vector register group. 
+;;
+;; The mask register is specified through vs1.
+(decl rv_vcompress_vm (VReg VReg VState) VReg)
+(rule (rv_vcompress_vm vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VcompressVM) vs2 vs1 (unmasked) vstate))
+
 ;; Helper for emitting the `vmslt.vx` (Vector Mask Set Less Than) instruction.
 (decl rv_vmslt_vx (VReg XReg VecOpMasking VState) VReg)
 (rule (rv_vmslt_vx vs2 vs1 mask vstate)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 80410961b0d2..dd26e352ba3a 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1744,3 +1744,31 @@
 (rule 2 (lower (has_type (ty_vec_fits_in_register out_ty)
                          (uwiden_low (uwiden_low (uwiden_low x)))))
       (rv_vzext_vf8 x (unmasked) out_ty))
+
+;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; We don't have a dedicated instruction for this, so we rearrange the register
+;; elements and use a vadd.
+;;
+;; We do this by building two masks, one for the even elements and one for the odd
+;; elements. Using vcompress we can extract the elements and group them together.
+;;
+;; This is likely not the optimal way of doing this. LLVM does this using a bunch
+;; of vrgathers (see: https://godbolt.org/z/jq8Wj8WG4), which doesn't seem to be
+;; much better than this.
+;;
+;; However, V8 does something better. It uses 2 vcompresses with LMUL2, which means
+;; it can do the whole thing in 3 instructions (2 vcompress + vadd). We don't
+;; support LMUL > 1, so we can't do that.
+(rule (lower (has_type (ty_vec_fits_in_register ty) (iadd_pairwise x y)))
+  (if-let half_size (u64_to_uimm5 (u64_udiv (ty_lane_count ty) 2)))
+  (let ((odd_mask VReg (gen_vec_mask 0x5555555555555555))
+        (lhs_lo VReg (rv_vcompress_vm x odd_mask ty))
+        (lhs_hi VReg (rv_vcompress_vm y odd_mask ty))
+        (lhs VReg (rv_vslideup_vvi lhs_lo lhs_hi half_size (unmasked) ty))
+
+        (even_mask VReg (gen_vec_mask 0xAAAAAAAAAAAAAAAA))
+        (rhs_lo VReg (rv_vcompress_vm x even_mask ty))
+        (rhs_hi VReg (rv_vcompress_vm y even_mask ty))
+        (rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty)))
+    (rv_vadd_vv lhs rhs (unmasked) ty)))
diff --git a/cranelift/codegen/src/machinst/reg.rs b/cranelift/codegen/src/machinst/reg.rs
index 61b804f1f45f..0e56e728bb3c 100644
--- a/cranelift/codegen/src/machinst/reg.rs
+++ b/cranelift/codegen/src/machinst/reg.rs
@@ -4,7 +4,9 @@
 use alloc::{string::String, vec::Vec};
 use core::{fmt::Debug, hash::Hash};
 
-use regalloc2::{Allocation, Operand, OperandConstraint, PReg, PRegSet, VReg};
+use regalloc2::{
+    Allocation, Operand, OperandConstraint, OperandKind, OperandPos, PReg, PRegSet, VReg,
+};
 #[cfg(feature = "enable-serde")]
 use serde::{Deserialize, Serialize};
 
@@ -405,6 +407,20 @@ impl<'a, F: Fn(VReg) -> VReg> OperandCollector<'a, F> {
         }
     }
 
+    /// Add a register "fixed use", which ties a vreg to a particular
+    /// RealReg at the end of the instruction.
+    pub fn reg_fixed_late_use(&mut self, reg: Reg, rreg: Reg) {
+        debug_assert!(reg.is_virtual());
+        let rreg = rreg.to_real_reg().expect("fixed reg is not a RealReg");
+        debug_assert!(self.is_allocatable_preg(rreg.into()));
+        self.add_operand(Operand::new(
+            reg.into(),
+            OperandConstraint::FixedReg(rreg.into()),
+            OperandKind::Use,
+            OperandPos::Late,
+        ));
+    }
+
     /// Add a register "fixed use", which ties a vreg to a particular
     /// RealReg at this point.
pub fn reg_fixed_use(&mut self, reg: Reg, rreg: Reg) { diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif new file mode 100644 index 000000000000..edbb4edbc2f6 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif @@ -0,0 +1,222 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %iadd_pairwise_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = iadd_pairwise v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a3,0; ld a3,12(a3); j 12; .8byte 0x5555555555555555 +; vmv.s.x v8,a3 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v12,v1,v8 #avl=16, #vtype=(e8, m1, ta, ma) +; vcompress.vm v13,v3,v8 #avl=16, #vtype=(e8, m1, ta, ma) +; vslideup.vi v12,v13,8 #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a1,0; ld a1,12(a1); j 12; .8byte 0xaaaaaaaaaaaaaaaa +; vmv.s.x v18,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v22,v1,v18 #avl=16, #vtype=(e8, m1, ta, ma) +; vcompress.vm v23,v3,v18 #avl=16, #vtype=(e8, m1, ta, ma) +; vslideup.vi v22,v23,8 #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v26,v12,v22 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v26,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc a3, 0 +; ld a3, 0xc(a3) +; j 0xc +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe4, 0x06, 0x42 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x26, 0x14, 0x5e +; .byte 0xd7, 0x26, 0x34, 0x5e +; .byte 0x57, 0x36, 0xd4, 0x3a +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe9, 0x05, 0x42 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x2b, 0x19, 0x5e +; .byte 0xd7, 0x2b, 0x39, 0x5e +; .byte 0x57, 0x3b, 0x74, 0x3b +; .byte 0x57, 0x0d, 0xcb, 0x02 +; .byte 0x27, 0x0d, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iadd_pairwise_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = iadd_pairwise v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a3,0; ld a3,12(a3); j 12; .8byte 0x5555555555555555 +; vmv.s.x v8,a3 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v12,v1,v8 #avl=8, #vtype=(e16, m1, ta, ma) +; vcompress.vm v13,v3,v8 #avl=8, #vtype=(e16, m1, ta, ma) +; vslideup.vi v12,v13,4 #avl=8, #vtype=(e16, m1, ta, ma) +; auipc a1,0; ld a1,12(a1); j 12; .8byte 0xaaaaaaaaaaaaaaaa +; vmv.s.x v18,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v22,v1,v18 #avl=8, #vtype=(e16, m1, ta, ma) +; vcompress.vm v23,v3,v18 #avl=8, #vtype=(e16, m1, ta, ma) +; vslideup.vi v22,v23,4 #avl=8, #vtype=(e16, m1, ta, ma) +; vadd.vv v26,v12,v22 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v26,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; 
Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc a3, 0 +; ld a3, 0xc(a3) +; j 0xc +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe4, 0x06, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x26, 0x14, 0x5e +; .byte 0xd7, 0x26, 0x34, 0x5e +; .byte 0x57, 0x36, 0xd2, 0x3a +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe9, 0x05, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x2b, 0x19, 0x5e +; .byte 0xd7, 0x2b, 0x39, 0x5e +; .byte 0x57, 0x3b, 0x72, 0x3b +; .byte 0x57, 0x0d, 0xcb, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x0d, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd_pairwise v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a3,0; ld a3,12(a3); j 12; .8byte 0x5555555555555555 +; vmv.s.x v8,a3 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v12,v1,v8 #avl=4, #vtype=(e32, m1, ta, ma) +; vcompress.vm v13,v3,v8 #avl=4, #vtype=(e32, m1, ta, ma) +; vslideup.vi v12,v13,2 #avl=4, #vtype=(e32, m1, ta, ma) +; auipc a1,0; ld a1,12(a1); j 12; .8byte 0xaaaaaaaaaaaaaaaa +; vmv.s.x v18,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v22,v1,v18 #avl=4, #vtype=(e32, m1, ta, ma) +; vcompress.vm v23,v3,v18 #avl=4, #vtype=(e32, m1, ta, ma) +; vslideup.vi v22,v23,2 #avl=4, #vtype=(e32, m1, ta, ma) +; vadd.vv v26,v12,v22 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v26,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc a3, 0 +; ld a3, 0xc(a3) +; j 0xc +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe4, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x26, 0x14, 0x5e +; .byte 0xd7, 0x26, 0x34, 0x5e +; .byte 0x57, 0x36, 0xd1, 0x3a +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe9, 0x05, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x2b, 0x19, 0x5e +; .byte 0xd7, 0x2b, 0x39, 0x5e +; .byte 0x57, 0x3b, 0x71, 0x3b +; .byte 0x57, 0x0d, 0xcb, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x0d, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif b/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif index f735777efe6e..05cdb017753f 100644 --- a/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif +++ b/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target riscv64 has_v function %iaddp_i8x8(i8x8, 
i8x8) -> i8x8 { block0(v0: i8x8, v1: i8x8): diff --git a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif index 2f3c8e35b429..972a9f6c880b 100644 --- a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif +++ b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif @@ -8,6 +8,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %iaddp_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif index 8c1354d439d0..b98ee715e908 100644 --- a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif +++ b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif @@ -4,6 +4,7 @@ target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64 has_v function %wpdps(i16x8, i16x8) -> i32x4 { block0(v0: i16x8, v1: i16x8):
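
A scalar model of the `iadd_pairwise` lowering (not part of the patch). The lowering rule added to lower.isle builds the two vadd operands with vcompress and vslideup; the standalone Rust sketch below mimics those three steps on plain slices and checks the result against the reference semantics of iadd_pairwise. The helpers vcompress, vslideup, and iadd_pairwise_ref are illustrative stand-ins, not Cranelift or RVV APIs, and lane 0 is assumed to correspond to mask bit 0, matching the 0x5555.../0xAAAA... masks used in the rule.

// Scalar sketch of the iadd_pairwise lowering; illustrative only, not Cranelift code.

/// Keep the lanes of `src` whose mask bit is set, packed at the front
/// (a scalar stand-in for `vcompress.vm`).
fn vcompress(src: &[i64], mask: u64) -> Vec<i64> {
    let mut out = Vec::new();
    for (i, &v) in src.iter().enumerate() {
        if (mask >> i) & 1 == 1 {
            out.push(v);
        }
    }
    out
}

/// Copy `src` into `dst` starting at lane `offset`, leaving the lower lanes
/// untouched (a scalar stand-in for `vslideup.vi`).
fn vslideup(dst: &mut Vec<i64>, src: &[i64], offset: usize) {
    dst.resize(offset + src.len(), 0);
    for (i, &v) in src.iter().enumerate() {
        dst[offset + i] = v;
    }
}

/// Reference semantics of `iadd_pairwise`: pairwise sums of `x` in the low
/// lanes, pairwise sums of `y` in the high lanes.
fn iadd_pairwise_ref(x: &[i64], y: &[i64]) -> Vec<i64> {
    x.chunks(2).chain(y.chunks(2)).map(|p| p[0] + p[1]).collect()
}

fn main() {
    let x = [1i64, 2, 3, 4];
    let y = [10i64, 20, 30, 40];
    let half = x.len() / 2;

    // Lanes 0, 2, 4, ... of x, then of y (the 0x5555... mask in the rule).
    let mut lhs = vcompress(&x, 0x5555_5555_5555_5555);
    vslideup(&mut lhs, &vcompress(&y, 0x5555_5555_5555_5555), half);

    // Lanes 1, 3, 5, ... of x, then of y (the 0xAAAA... mask in the rule).
    let mut rhs = vcompress(&x, 0xAAAA_AAAA_AAAA_AAAA);
    vslideup(&mut rhs, &vcompress(&y, 0xAAAA_AAAA_AAAA_AAAA), half);

    // The final vadd.vv adds the two rearranged vectors lane by lane.
    let sum: Vec<i64> = lhs.iter().zip(&rhs).map(|(a, b)| a + b).collect();
    assert_eq!(sum, iadd_pairwise_ref(&x, &y)); // [3, 7, 30, 70]
    println!("{sum:?}");
}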