From 62019b2e9bd931a73f8007c111d10ac3ac7d6c3b Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Fri, 16 Jun 2023 16:55:37 +0100 Subject: [PATCH] riscv64: Implement `iadd_pairwise` (#6568) * riscv64: Add Mov and VSlideUp * riscv64: Implement `iadd_pairwise` * riscv64: Use `late_use` in `VecAluRRRImm5` * machinst: Add `OperandCollector::reg_fixed_late_use` --- build.rs | 3 - cranelift/codegen/src/isa/riscv64/inst.isle | 9 + .../codegen/src/isa/riscv64/inst/emit.rs | 89 ++++--- .../codegen/src/isa/riscv64/inst/encode.rs | 25 +- cranelift/codegen/src/isa/riscv64/inst/mod.rs | 75 +++++- .../codegen/src/isa/riscv64/inst/vector.rs | 65 ++++- .../codegen/src/isa/riscv64/inst_vector.isle | 45 ++++ cranelift/codegen/src/isa/riscv64/lower.isle | 28 +++ cranelift/codegen/src/machinst/reg.rs | 18 +- .../isa/riscv64/simd-iadd_pairwise.clif | 222 ++++++++++++++++++ .../runtests/simd-iaddpairwise-64bit.clif | 1 + .../filetests/runtests/simd-iaddpairwise.clif | 1 + .../simd-wideningpairwisedotproducts.clif | 1 + 13 files changed, 534 insertions(+), 48 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif diff --git a/build.rs b/build.rs index 9cb05298cd64..28dceb3ee1f6 100644 --- a/build.rs +++ b/build.rs @@ -248,12 +248,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_f64x2_rounding", "simd_i16x8_arith2", "simd_i16x8_cmp", - "simd_i16x8_extadd_pairwise_i8x16", "simd_i16x8_q15mulr_sat_s", "simd_i32x4_arith2", "simd_i32x4_cmp", - "simd_i32x4_dot_i16x8", - "simd_i32x4_extadd_pairwise_i16x8", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", "simd_i64x2_arith2", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index f0a79b9d5dbb..a02145caa417 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -324,6 +324,15 @@ (probe_count u32) (tmp WritableReg)) + (VecAluRRRImm5 + (op VecAluOpRRRImm5) + (vd WritableReg) + (vd_src Reg) + (vs2 Reg) + (imm Imm5) + (mask VecOpMasking) + (vstate VState)) + (VecAluRRR (op VecAluOpRRR) (vd WritableReg) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index 53d86c5bdf77..bcbf85f7a80e 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -464,13 +464,18 @@ impl Inst { | Inst::Cltz { .. } | Inst::Brev8 { .. } | Inst::StackProbeLoop { .. } => None, + // VecSetState does not expect any vstate, rather it updates it. Inst::VecSetState { .. } => None, + // `vmv` instructions copy a set of registers and ignore vstate. + Inst::VecAluRRImm5 { op: VecAluOpRRImm5::VmvrV, .. } => None, + Inst::VecAluRR { vstate, .. } | Inst::VecAluRRR { vstate, .. } | Inst::VecAluRImm5 { vstate, .. } | Inst::VecAluRRImm5 { vstate, .. } | + Inst::VecAluRRRImm5 { vstate, .. } | // TODO: Unit-stride loads and stores only need the AVL to be correct, not // the full vtype. A future optimization could be to decouple these two when // updating vstate. 
This would allow us to avoid emitting a VecSetState in some cases.
@@ -951,34 +956,44 @@ impl MachInstEmit for Inst {
             }
 
             &Inst::Mov { rd, rm, ty } => {
-                debug_assert_ne!(rd.to_reg().class(), RegClass::Vector);
-                debug_assert_ne!(rm.class(), RegClass::Vector);
-                if rd.to_reg() != rm {
-                    let rm = allocs.next(rm);
-                    let rd = allocs.next_writable(rd);
-                    if ty.is_float() {
-                        Inst::FpuRRR {
-                            alu_op: if ty == F32 {
-                                FpuOPRRR::FsgnjS
-                            } else {
-                                FpuOPRRR::FsgnjD
-                            },
-                            frm: None,
-                            rd: rd,
-                            rs1: rm,
-                            rs2: rm,
-                        }
-                        .emit(&[], sink, emit_info, state);
-                    } else {
-                        let x = Inst::AluRRImm12 {
-                            alu_op: AluOPRRI::Ori,
-                            rd: rd,
-                            rs: rm,
-                            imm12: Imm12::zero(),
-                        };
-                        x.emit(&[], sink, emit_info, state);
-                    }
+                debug_assert_eq!(rd.to_reg().class(), rm.class());
+                if rd.to_reg() == rm {
+                    return;
+                }
+
+                let rm = allocs.next(rm);
+                let rd = allocs.next_writable(rd);
+
+                match rm.class() {
+                    RegClass::Int => Inst::AluRRImm12 {
+                        alu_op: AluOPRRI::Ori,
+                        rd: rd,
+                        rs: rm,
+                        imm12: Imm12::zero(),
+                    },
+                    RegClass::Float => Inst::FpuRRR {
+                        alu_op: if ty == F32 {
+                            FpuOPRRR::FsgnjS
+                        } else {
+                            FpuOPRRR::FsgnjD
+                        },
+                        frm: None,
+                        rd: rd,
+                        rs1: rm,
+                        rs2: rm,
+                    },
+                    RegClass::Vector => Inst::VecAluRRImm5 {
+                        op: VecAluOpRRImm5::VmvrV,
+                        vd: rd,
+                        vs2: rm,
+                        // Imm 0 means copy 1 register.
+                        imm: Imm5::maybe_from_i8(0).unwrap(),
+                        mask: VecOpMasking::Disabled,
+                        // Vstate for this instruction is ignored.
+                        vstate: VState::from_type(ty),
+                    },
                 }
+                .emit(&[], sink, emit_info, state);
             }
 
             &Inst::MovFromPReg { rd, rm } => {
@@ -2827,6 +2842,24 @@ impl MachInstEmit for Inst {
                 .emit(&[], sink, emit_info, state);
                 sink.bind_label(label_done, &mut state.ctrl_plane);
             }
+            &Inst::VecAluRRRImm5 {
+                op,
+                vd,
+                vd_src,
+                imm,
+                vs2,
+                ref mask,
+                ..
+            } => {
+                let vs2 = allocs.next(vs2);
+                let vd_src = allocs.next(vd_src);
+                let vd = allocs.next_writable(vd);
+                let mask = mask.with_allocs(&mut allocs);
+
+                debug_assert_eq!(vd.to_reg(), vd_src);
+
+                sink.put4(encode_valu_rrr_imm(op, vd, imm, vs2, mask));
+            }
             &Inst::VecAluRRR {
                 op,
                 vd,
@@ -2854,7 +2887,7 @@ impl MachInstEmit for Inst {
                 let vd = allocs.next_writable(vd);
                 let mask = mask.with_allocs(&mut allocs);
 
-                sink.put4(encode_valu_imm(op, vd, imm, vs2, mask));
+                sink.put4(encode_valu_rr_imm(op, vd, imm, vs2, mask));
             }
             &Inst::VecAluRR {
                 op,
diff --git a/cranelift/codegen/src/isa/riscv64/inst/encode.rs b/cranelift/codegen/src/isa/riscv64/inst/encode.rs
index e52d05aa48a1..bf7c469901ff 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/encode.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/encode.rs
@@ -9,8 +9,8 @@ use super::{Imm12, Imm5, UImm5, VType};
 use crate::isa::riscv64::inst::reg_to_gpr_num;
 use crate::isa::riscv64::lower::isle::generated_code::{
-    VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecElementWidth, VecOpCategory,
-    VecOpMasking,
+    VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAluOpRRRImm5, VecElementWidth,
+    VecOpCategory, VecOpMasking,
 };
 use crate::machinst::isle::WritableReg;
 use crate::Reg;
 
@@ -127,7 +127,7 @@ pub fn encode_valu(
 /// - funct6 (6 bits)
 ///
 /// See: https://github.com/riscv/riscv-v-spec/blob/master/valu-format.adoc
-pub fn encode_valu_imm(
+pub fn encode_valu_rr_imm(
     op: VecAluOpRRImm5,
     vd: WritableReg,
     imm: Imm5,
@@ -146,6 +146,25 @@ pub fn encode_valu_imm(
     )
 }
 
+pub fn encode_valu_rrr_imm(
+    op: VecAluOpRRRImm5,
+    vd: WritableReg,
+    imm: Imm5,
+    vs2: Reg,
+    masking: VecOpMasking,
+) -> u32 {
+    let funct7 = (op.funct6() << 1) | masking.encode();
+    let imm = imm.bits() as u32;
+    encode_r_type_bits(
+        op.opcode(),
reg_to_gpr_num(vd.to_reg()), + op.funct3(), + imm, + reg_to_gpr_num(vs2), + funct7, + ) +} + pub fn encode_valu_rr(op: VecAluOpRR, vd: WritableReg, vs: Reg, masking: VecOpMasking) -> u32 { let funct7 = (op.funct6() << 1) | masking.encode(); diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 2f37b57ae513..34933c3c5930 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -352,6 +352,17 @@ fn vec_mask_operands VReg>( VecOpMasking::Disabled => {} } } +fn vec_mask_late_operands VReg>( + mask: &VecOpMasking, + collector: &mut OperandCollector<'_, F>, +) { + match mask { + VecOpMasking::Enabled { reg } => { + collector.reg_fixed_late_use(*reg, pv_reg(0).into()); + } + VecOpMasking::Disabled => {} + } +} fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) { match inst { @@ -642,6 +653,32 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan // gen_prologue is called at emit stage. // no need let reg alloc know. } + &Inst::VecAluRRRImm5 { + op, + vd, + vd_src, + vs2, + ref mask, + .. + } => { + debug_assert_eq!(vd_src.class(), RegClass::Vector); + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + + // If the operation forbids source/destination overlap we need to + // ensure that the source and destination registers are different. + if op.forbids_src_dst_overlaps() { + collector.reg_late_use(vs2); + collector.reg_use(vd_src); + collector.reg_reuse_def(vd, 1); // `vd` == `vd_src`. + vec_mask_late_operands(mask, collector); + } else { + collector.reg_use(vs2); + collector.reg_use(vd_src); + collector.reg_reuse_def(vd, 1); // `vd` == `vd_src`. + vec_mask_operands(mask, collector); + } + } &Inst::VecAluRRR { op, vd, @@ -1614,14 +1651,15 @@ impl Inst { &MInst::Mov { rd, rm, ty } => { let rd = format_reg(rd.to_reg(), allocs); let rm = format_reg(rm, allocs); - let v = if ty == F32 { - "fmv.s" - } else if ty == F64 { - "fmv.d" - } else { - "mv" + + let op = match ty { + F32 => "fmv.s", + F64 => "fmv.d", + ty if ty.is_vector() => "vmv1r.v", + _ => "mv", }; - format!("{} {},{}", v, rd, rm) + + format!("{op} {rd},{rm}") } &MInst::MovFromPReg { rd, rm } => { let rd = format_reg(rd.to_reg(), allocs); @@ -1654,6 +1692,29 @@ impl Inst { &MInst::Udf { trap_code } => format!("udf##trap_code={}", trap_code), &MInst::EBreak {} => String::from("ebreak"), &MInst::ECall {} => String::from("ecall"), + &Inst::VecAluRRRImm5 { + op, + vd, + imm, + vs2, + ref mask, + ref vstate, + .. + } => { + let vs2_s = format_reg(vs2, allocs); + let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); + + // Some opcodes interpret the immediate as unsigned, lets show the + // correct number here. 
+ let imm_s = if op.imm_is_unsigned() { + format!("{}", imm.bits()) + } else { + format!("{}", imm) + }; + + format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}") + } &Inst::VecAluRRR { op, vd, diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 8114d78398ae..08992e83f9ea 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -1,8 +1,8 @@ use crate::isa::riscv64::inst::AllocationConsumer; use crate::isa::riscv64::inst::EmitState; use crate::isa::riscv64::lower::isle::generated_code::{ - VecAMode, VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAvl, VecElementWidth, - VecLmul, VecMaskMode, VecOpCategory, VecOpMasking, VecTailMode, + VecAMode, VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAluOpRRRImm5, VecAvl, + VecElementWidth, VecLmul, VecMaskMode, VecOpCategory, VecOpMasking, VecTailMode, }; use crate::machinst::RegClass; use crate::Reg; @@ -252,6 +252,51 @@ impl VecOpMasking { } } +impl VecAluOpRRRImm5 { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRRRImm5::VslideupVI => 0b001110, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRRImm5::VslideupVI => VecOpCategory::OPIVI, + } + } + + pub fn imm_is_unsigned(&self) -> bool { + match self { + VecAluOpRRRImm5::VslideupVI => true, + } + } + + /// Some instructions do not allow the source and destination registers to overlap. + pub fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRRImm5::VslideupVI => true, + } + } +} + +impl fmt::Display for VecAluOpRRRImm5 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut s = format!("{self:?}"); + s.make_ascii_lowercase(); + let (opcode, category) = s.split_at(s.len() - 2); + f.write_str(&format!("{opcode}.{category}")) + } +} + impl VecAluOpRRR { pub fn opcode(&self) -> u32 { // Vector Opcode @@ -287,7 +332,10 @@ impl VecAluOpRRR { VecAluOpRRR::VmaxVV | VecAluOpRRR::VmaxVX => 0b000111, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, - VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 0b010111, + VecAluOpRRR::VmergeVVM + | VecAluOpRRR::VmergeVXM + | VecAluOpRRR::VfmergeVFM + | VecAluOpRRR::VcompressVM => 0b010111, VecAluOpRRR::VfdivVV | VecAluOpRRR::VfdivVF | VecAluOpRRR::VsadduVV @@ -341,7 +389,8 @@ impl VecAluOpRRR { | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV | VecAluOpRRR::VredmaxuVS - | VecAluOpRRR::VredminuVS => VecOpCategory::OPMVV, + | VecAluOpRRR::VredminuVS + | VecAluOpRRR::VcompressVM => VecOpCategory::OPMVV, VecAluOpRRR::VwaddVX | VecAluOpRRR::VwadduVX | VecAluOpRRR::VwadduWX @@ -401,6 +450,7 @@ impl VecAluOpRRR { match self { VecAluOpRRR::VrgatherVV | VecAluOpRRR::VrgatherVX + | VecAluOpRRR::VcompressVM | VecAluOpRRR::VwadduVV | VecAluOpRRR::VwadduVX | VecAluOpRRR::VwaddVV @@ -461,6 +511,7 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VsadduVI => 0b100000, VecAluOpRRImm5::VsaddVI => 0b100001, VecAluOpRRImm5::VrgatherVI => 0b001100, + VecAluOpRRImm5::VmvrV => 0b100111, } } @@ -478,7 +529,8 @@ impl VecAluOpRRImm5 { | VecAluOpRRImm5::VmergeVIM | VecAluOpRRImm5::VsadduVI | VecAluOpRRImm5::VsaddVI - | VecAluOpRRImm5::VrgatherVI => VecOpCategory::OPIVI, + | VecAluOpRRImm5::VrgatherVI + | VecAluOpRRImm5::VmvrV => 
VecOpCategory::OPIVI, } } @@ -488,7 +540,8 @@ impl VecAluOpRRImm5 { | VecAluOpRRImm5::VsrlVI | VecAluOpRRImm5::VsraVI | VecAluOpRRImm5::VslidedownVI - | VecAluOpRRImm5::VrgatherVI => true, + | VecAluOpRRImm5::VrgatherVI + | VecAluOpRRImm5::VmvrV => true, VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VandVI diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index b0fc2d98c0fb..9cb076670b78 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -126,6 +126,7 @@ (VredmaxuVS) (VredminuVS) (VrgatherVV) + (VcompressVM) ;; Vector-Scalar Opcodes (VaddVX) @@ -166,6 +167,13 @@ (VmsltVX) )) + + +;; Register-Imm ALU Ops that modify the destination register +(type VecAluOpRRRImm5 (enum + (VslideupVI) +)) + ;; Register-Imm ALU Ops (type VecAluOpRRImm5 (enum ;; Regular VI Opcodes @@ -182,6 +190,9 @@ (VslidedownVI) (VmergeVIM) (VrgatherVI) + ;; This opcode represents multiple instructions `vmv1r`/`vmv2r`/`vmv4r`/etc... + ;; The immediate field specifies how many registers should be copied. + (VmvrV) )) ;; Imm only ALU Ops @@ -276,6 +287,21 @@ ;; of the usual RISC-V register order. ;; See Section 10.1 of the RISC-V Vector Extension Specification. + +;; Helper for emitting `MInst.VecAluRRRImm5` instructions. +;; These instructions modify the destination register. +(decl vec_alu_rrr_imm5 (VecAluOpRRRImm5 VReg VReg Imm5 VecOpMasking VState) VReg) +(rule (vec_alu_rrr_imm5 op vd_src vs2 imm mask vstate) + (let ((vd WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VecAluRRRImm5 op vd vd_src vs2 imm mask vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRRRImm5` instructions where the immediate +;; is zero extended instead of sign extended. +(decl vec_alu_rrr_uimm5 (VecAluOpRRRImm5 VReg VReg UImm5 VecOpMasking VState) VReg) +(rule (vec_alu_rrr_uimm5 op vd_src vs2 imm mask vstate) + (vec_alu_rrr_imm5 op vd_src vs2 (uimm5_bitcast_to_imm5 imm) mask vstate)) + ;; Helper for emitting `MInst.VecAluRRR` instructions. (decl vec_alu_rrr (VecAluOpRRR Reg Reg VecOpMasking VState) Reg) (rule (vec_alu_rrr op vs2 vs1 mask vstate) @@ -758,6 +784,14 @@ (rule (rv_vslidedown_vi vs2 imm mask vstate) (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm mask vstate)) +;; Helper for emitting the `vslideup.vi` instruction. +;; Unlike other `vi` instructions the immediate is zero extended. +;; This is implemented as a 2 source operand instruction, since it only +;; partially modifies the destination register. +(decl rv_vslideup_vvi (VReg VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vslideup_vvi vd vs2 imm mask vstate) + (vec_alu_rrr_uimm5 (VecAluOpRRRImm5.VslideupVI) vd vs2 imm mask vstate)) + ;; Helper for emitting the `vmv.x.s` instruction. ;; This instruction copies the first element of the source vector to the destination X register. ;; Masked versions of this instuction are not supported. @@ -878,6 +912,17 @@ (rule (rv_vrgather_vi vs2 imm mask vstate) (vec_alu_rr_uimm5 (VecAluOpRRImm5.VrgatherVI) vs2 imm mask vstate)) +;; Helper for emitting the `vcompress.vm` instruction. +;; +;; The vector compress instruction allows elements selected by a vector mask +;; register from a source vector register group to be packed into contiguous +;; elements at the start of the destination vector register group. 
+;;
+;; The mask register is specified through vs1.
+(decl rv_vcompress_vm (VReg VReg VState) VReg)
+(rule (rv_vcompress_vm vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VcompressVM) vs2 vs1 (unmasked) vstate))
+
 ;; Helper for emitting the `vmslt.vx` (Vector Mask Set Less Than) instruction.
 (decl rv_vmslt_vx (VReg XReg VecOpMasking VState) VReg)
 (rule (rv_vmslt_vx vs2 vs1 mask vstate)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 80410961b0d2..dd26e352ba3a 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1744,3 +1744,31 @@
 (rule 2 (lower (has_type (ty_vec_fits_in_register out_ty)
                          (uwiden_low (uwiden_low (uwiden_low x)))))
       (rv_vzext_vf8 x (unmasked) out_ty))
+
+;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; We don't have a dedicated instruction for this, so we rearrange the register
+;; elements and use a vadd.
+;;
+;; We do this by building two masks, one for the even elements and one for the odd
+;; elements. Using vcompress we can extract the elements and group them together.
+;;
+;; This is likely not the optimal way of doing this. LLVM does this using a bunch
+;; of vrgathers (see: https://godbolt.org/z/jq8Wj8WG4), which doesn't seem to be
+;; much better than this.
+;;
+;; However, V8 does something better. It uses 2 vcompresses with LMUL2, which means
+;; it can do the whole thing in 3 instructions (2 vcompress + vadd). We don't
+;; support LMUL > 1, so we can't do that.
+(rule (lower (has_type (ty_vec_fits_in_register ty) (iadd_pairwise x y)))
+  (if-let half_size (u64_to_uimm5 (u64_udiv (ty_lane_count ty) 2)))
+  (let ((odd_mask VReg (gen_vec_mask 0x5555555555555555))
+        (lhs_lo VReg (rv_vcompress_vm x odd_mask ty))
+        (lhs_hi VReg (rv_vcompress_vm y odd_mask ty))
+        (lhs VReg (rv_vslideup_vvi lhs_lo lhs_hi half_size (unmasked) ty))
+
+        (even_mask VReg (gen_vec_mask 0xAAAAAAAAAAAAAAAA))
+        (rhs_lo VReg (rv_vcompress_vm x even_mask ty))
+        (rhs_hi VReg (rv_vcompress_vm y even_mask ty))
+        (rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty)))
+    (rv_vadd_vv lhs rhs (unmasked) ty)))
diff --git a/cranelift/codegen/src/machinst/reg.rs b/cranelift/codegen/src/machinst/reg.rs
index 61b804f1f45f..0e56e728bb3c 100644
--- a/cranelift/codegen/src/machinst/reg.rs
+++ b/cranelift/codegen/src/machinst/reg.rs
@@ -4,7 +4,9 @@
 use alloc::{string::String, vec::Vec};
 use core::{fmt::Debug, hash::Hash};
 
-use regalloc2::{Allocation, Operand, OperandConstraint, PReg, PRegSet, VReg};
+use regalloc2::{
+    Allocation, Operand, OperandConstraint, OperandKind, OperandPos, PReg, PRegSet, VReg,
+};
 #[cfg(feature = "enable-serde")]
 use serde::{Deserialize, Serialize};
 
@@ -405,6 +407,20 @@ impl<'a, F: Fn(VReg) -> VReg> OperandCollector<'a, F> {
         }
     }
 
+    /// Add a register "fixed use", which ties a vreg to a particular
+    /// RealReg at the end of the instruction.
+    pub fn reg_fixed_late_use(&mut self, reg: Reg, rreg: Reg) {
+        debug_assert!(reg.is_virtual());
+        let rreg = rreg.to_real_reg().expect("fixed reg is not a RealReg");
+        debug_assert!(self.is_allocatable_preg(rreg.into()));
+        self.add_operand(Operand::new(
+            reg.into(),
+            OperandConstraint::FixedReg(rreg.into()),
+            OperandKind::Use,
+            OperandPos::Late,
+        ));
+    }
+
     /// Add a register "fixed use", which ties a vreg to a particular
     /// RealReg at this point.
pub fn reg_fixed_use(&mut self, reg: Reg, rreg: Reg) { diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif new file mode 100644 index 000000000000..edbb4edbc2f6 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd_pairwise.clif @@ -0,0 +1,222 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %iadd_pairwise_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = iadd_pairwise v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a3,0; ld a3,12(a3); j 12; .8byte 0x5555555555555555 +; vmv.s.x v8,a3 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v12,v1,v8 #avl=16, #vtype=(e8, m1, ta, ma) +; vcompress.vm v13,v3,v8 #avl=16, #vtype=(e8, m1, ta, ma) +; vslideup.vi v12,v13,8 #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a1,0; ld a1,12(a1); j 12; .8byte 0xaaaaaaaaaaaaaaaa +; vmv.s.x v18,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v22,v1,v18 #avl=16, #vtype=(e8, m1, ta, ma) +; vcompress.vm v23,v3,v18 #avl=16, #vtype=(e8, m1, ta, ma) +; vslideup.vi v22,v23,8 #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v26,v12,v22 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v26,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc a3, 0 +; ld a3, 0xc(a3) +; j 0xc +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe4, 0x06, 0x42 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x26, 0x14, 0x5e +; .byte 0xd7, 0x26, 0x34, 0x5e +; .byte 0x57, 0x36, 0xd4, 0x3a +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe9, 0x05, 0x42 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x2b, 0x19, 0x5e +; .byte 0xd7, 0x2b, 0x39, 0x5e +; .byte 0x57, 0x3b, 0x74, 0x3b +; .byte 0x57, 0x0d, 0xcb, 0x02 +; .byte 0x27, 0x0d, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iadd_pairwise_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = iadd_pairwise v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a3,0; ld a3,12(a3); j 12; .8byte 0x5555555555555555 +; vmv.s.x v8,a3 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v12,v1,v8 #avl=8, #vtype=(e16, m1, ta, ma) +; vcompress.vm v13,v3,v8 #avl=8, #vtype=(e16, m1, ta, ma) +; vslideup.vi v12,v13,4 #avl=8, #vtype=(e16, m1, ta, ma) +; auipc a1,0; ld a1,12(a1); j 12; .8byte 0xaaaaaaaaaaaaaaaa +; vmv.s.x v18,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v22,v1,v18 #avl=8, #vtype=(e16, m1, ta, ma) +; vcompress.vm v23,v3,v18 #avl=8, #vtype=(e16, m1, ta, ma) +; vslideup.vi v22,v23,4 #avl=8, #vtype=(e16, m1, ta, ma) +; vadd.vv v26,v12,v22 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v26,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; 
Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc a3, 0 +; ld a3, 0xc(a3) +; j 0xc +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe4, 0x06, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x26, 0x14, 0x5e +; .byte 0xd7, 0x26, 0x34, 0x5e +; .byte 0x57, 0x36, 0xd2, 0x3a +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe9, 0x05, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x2b, 0x19, 0x5e +; .byte 0xd7, 0x2b, 0x39, 0x5e +; .byte 0x57, 0x3b, 0x72, 0x3b +; .byte 0x57, 0x0d, 0xcb, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x0d, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd_pairwise v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; auipc a3,0; ld a3,12(a3); j 12; .8byte 0x5555555555555555 +; vmv.s.x v8,a3 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v12,v1,v8 #avl=4, #vtype=(e32, m1, ta, ma) +; vcompress.vm v13,v3,v8 #avl=4, #vtype=(e32, m1, ta, ma) +; vslideup.vi v12,v13,2 #avl=4, #vtype=(e32, m1, ta, ma) +; auipc a1,0; ld a1,12(a1); j 12; .8byte 0xaaaaaaaaaaaaaaaa +; vmv.s.x v18,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vcompress.vm v22,v1,v18 #avl=4, #vtype=(e32, m1, ta, ma) +; vcompress.vm v23,v3,v18 #avl=4, #vtype=(e32, m1, ta, ma) +; vslideup.vi v22,v23,2 #avl=4, #vtype=(e32, m1, ta, ma) +; vadd.vv v26,v12,v22 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v26,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc a3, 0 +; ld a3, 0xc(a3) +; j 0xc +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x55, 0x55, 0x55, 0x55 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe4, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x26, 0x14, 0x5e +; .byte 0xd7, 0x26, 0x34, 0x5e +; .byte 0x57, 0x36, 0xd1, 0x3a +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0xaa, 0xaa, 0xaa, 0xaa +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe9, 0x05, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x2b, 0x19, 0x5e +; .byte 0xd7, 0x2b, 0x39, 0x5e +; .byte 0x57, 0x3b, 0x71, 0x3b +; .byte 0x57, 0x0d, 0xcb, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x0d, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif b/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif index f735777efe6e..05cdb017753f 100644 --- a/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif +++ b/cranelift/filetests/filetests/runtests/simd-iaddpairwise-64bit.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target riscv64 has_v function %iaddp_i8x8(i8x8, 
i8x8) -> i8x8 { block0(v0: i8x8, v1: i8x8): diff --git a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif index 2f3c8e35b429..972a9f6c880b 100644 --- a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif +++ b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif @@ -8,6 +8,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %iaddp_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif index 8c1354d439d0..b98ee715e908 100644 --- a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif +++ b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif @@ -4,6 +4,7 @@ target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64 has_v function %wpdps(i16x8, i16x8) -> i32x4 { block0(v0: i16x8, v1: i16x8):
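
A scalar model of the `iadd_pairwise` lowering (not part of the patch). The lowering rule added to lower.isle builds the two vadd operands with vcompress and vslideup; the standalone Rust sketch below mimics those three steps on plain slices and checks the result against the reference semantics of iadd_pairwise. The helpers vcompress, vslideup, and iadd_pairwise_ref are illustrative stand-ins, not Cranelift or RVV APIs, and lane 0 is assumed to correspond to mask bit 0, matching the 0x5555.../0xAAAA... masks used in the rule.

// Scalar sketch of the iadd_pairwise lowering; illustrative only, not Cranelift code.

/// Keep the lanes of `src` whose mask bit is set, packed at the front
/// (a scalar stand-in for `vcompress.vm`).
fn vcompress(src: &[i64], mask: u64) -> Vec<i64> {
    let mut out = Vec::new();
    for (i, &v) in src.iter().enumerate() {
        if (mask >> i) & 1 == 1 {
            out.push(v);
        }
    }
    out
}

/// Copy `src` into `dst` starting at lane `offset`, leaving the lower lanes
/// untouched (a scalar stand-in for `vslideup.vi`).
fn vslideup(dst: &mut Vec<i64>, src: &[i64], offset: usize) {
    dst.resize(offset + src.len(), 0);
    for (i, &v) in src.iter().enumerate() {
        dst[offset + i] = v;
    }
}

/// Reference semantics of `iadd_pairwise`: pairwise sums of `x` in the low
/// lanes, pairwise sums of `y` in the high lanes.
fn iadd_pairwise_ref(x: &[i64], y: &[i64]) -> Vec<i64> {
    x.chunks(2).chain(y.chunks(2)).map(|p| p[0] + p[1]).collect()
}

fn main() {
    let x = [1i64, 2, 3, 4];
    let y = [10i64, 20, 30, 40];
    let half = x.len() / 2;

    // Lanes 0, 2, 4, ... of x, then of y (the 0x5555... mask in the rule).
    let mut lhs = vcompress(&x, 0x5555_5555_5555_5555);
    vslideup(&mut lhs, &vcompress(&y, 0x5555_5555_5555_5555), half);

    // Lanes 1, 3, 5, ... of x, then of y (the 0xAAAA... mask in the rule).
    let mut rhs = vcompress(&x, 0xAAAA_AAAA_AAAA_AAAA);
    vslideup(&mut rhs, &vcompress(&y, 0xAAAA_AAAA_AAAA_AAAA), half);

    // The final vadd.vv adds the two rearranged vectors lane by lane.
    let sum: Vec<i64> = lhs.iter().zip(&rhs).map(|(a, b)| a + b).collect();
    assert_eq!(sum, iadd_pairwise_ref(&x, &y)); // [3, 7, 30, 70]
    println!("{sum:?}");
}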