From 4ea15391706f176d7e5943f6a004306e8d9288fc Mon Sep 17 00:00:00 2001
From: dheaton-arm
Date: Wed, 13 Jul 2022 16:10:07 +0100
Subject: [PATCH 1/2] Convert `fma`, `valltrue` & `vanytrue` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `fma`
  - Introduced missing support for `fma` on vector values, as per the docs.
- `valltrue`
- `vanytrue`

Also fixed `fcmp` on vector values in the interpreter, and enabled
interpreter tests in `simd-fma.clif`.

This introduces the `FMLA` machine instruction.

Copyright (c) 2022 Arm Limited
---
 cranelift/codegen/src/isa/aarch64/inst.isle   |  55 ++++++-
 .../codegen/src/isa/aarch64/inst/args.rs      |   5 +-
 .../codegen/src/isa/aarch64/inst/emit.rs      |  19 ++-
 .../src/isa/aarch64/inst/emit_tests.rs        |  50 +++++-
 .../codegen/src/isa/aarch64/inst/imms.rs      |   8 -
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  35 ++--
 .../codegen/src/isa/aarch64/inst/regs.rs      |  13 +-
 cranelift/codegen/src/isa/aarch64/lower.isle  |  86 +++++++++-
 .../codegen/src/isa/aarch64/lower/isle.rs     |  10 ++
 .../codegen/src/isa/aarch64/lower_inst.rs     | 155 ++----------------
 cranelift/codegen/src/machinst/isle.rs        |  17 ++
 cranelift/codegen/src/prelude.isle            |  13 +-
 .../filetests/isa/aarch64/floating-point.clif |  36 ++++
 .../filetests/isa/aarch64/simd-valltrue.clif  |  94 +++++++++++
 .../filetests/runtests/simd-fma-64bit.clif    |  47 ++++++
 .../filetests/runtests/simd-fma.clif          |   2 +
 .../runtests/simd-valltrue-64bit.clif         |  58 +++++++
 .../runtests/simd-vanytrue-64bit.clif         |  58 +++++++
 cranelift/interpreter/src/step.rs             |  49 +++++-
 19 files changed, 604 insertions(+), 206 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-fma-64bit.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index e81868bccc9f..fdbc25e3a2c0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -335,8 +335,10 @@
       (rn Reg))
 
     ;; 3-op FPU instruction.
+    ;; 16-bit scalars require half-precision floating-point support (FEAT_FP16).
     (FpuRRRR
       (fpu_op FPUOp3)
+      (size ScalarSize)
       (rd WritableReg)
       (rn Reg)
       (rm Reg)
@@ -478,7 +480,7 @@
       (rd WritableReg)
       (rn Reg)
       (idx u8)
-      (size VectorSize))
+      (size ScalarSize))
 
     ;; Signed move from a vector element to a GPR.
     (MovFromVecSigned
@@ -1011,8 +1013,7 @@
 
 ;; A floating-point unit (FPU) operation with three args.
 (type FPUOp3 (enum
-  (MAdd32)
-  (MAdd64)
+  (MAdd)
 ))
 
 ;; A conversion from an FP to an integer value.
@@ -1143,6 +1144,8 @@ (Fmin) ;; Floating-point multiply (Fmul) + ;; Floating-point fused multiply-add vectors + (Fmla) ;; Add pairwise (Addp) ;; Zip vectors (primary) [meaning, high halves] @@ -1364,6 +1367,9 @@ (decl imm12_from_negated_u64 (Imm12) u64) (extern extractor imm12_from_negated_u64 imm12_from_negated_u64) +(decl pure lshr_from_u64 (Type u64) ShiftOpAndAmt) +(extern constructor lshr_from_u64 lshr_from_u64) + (decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt) (extern constructor lshl_from_imm64 lshl_from_imm64) @@ -1491,6 +1497,15 @@ (rule (fpu_rr op src size) (let ((dst WritableReg (temp_writable_reg $F64)) (_ Unit (emit (MInst.FpuRR op size dst src)))) + dst)) + +;; Helper for emitting `MInst.VecRRR` instructions which use three registers, +;; one of which is both source and output. +(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg) +(rule (vec_rrr_inplace op src_dst src2 src3 size) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_1 Unit (emit (MInst.FpuMove128 dst src_dst))) + (_2 Unit (emit (MInst.VecRRR op dst src2 src3 size)))) dst)) ;; Helper for emitting `MInst.FpuRRR` instructions. @@ -1500,6 +1515,13 @@ (_ Unit (emit (MInst.FpuRRR op size dst src1 src2)))) dst)) +;; Helper for emitting `MInst.FpuRRRR` instructions. +(decl fpu_rrrr (FPUOp3 ScalarSize Reg Reg Reg) Reg) +(rule (fpu_rrrr size op src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg $F64)) + (_ Unit (emit (MInst.FpuRRRR size op dst src1 src2 src3)))) + dst)) + ;; Helper for emitting `MInst.FpuCmp` instructions. (decl fpu_cmp (ScalarSize Reg Reg) ProducesFlags) (rule (fpu_cmp size rn rm) @@ -1541,6 +1563,15 @@ (_ Unit (emit (MInst.AluRRRShift op (operand_size ty) dst src1 src2 shift)))) dst)) +;; Helper for emitting `cmp` instructions, setting flags, with a right-shifted +;; second operand register. +(decl cmp_rr_shift (OperandSize Reg Reg u64) ProducesFlags) +(rule (cmp_rr_shift size src1 src2 shift_amount) + (if-let shift (lshr_from_u64 $I64 shift_amount)) + (ProducesFlags.ProducesFlagsSideEffect + (MInst.AluRRRShift (ALUOp.SubS) size (writable_zero_reg) + src1 src2 shift))) + ;; Helper for emitting `MInst.AluRRRExtend` instructions. (decl alu_rrr_extend (ALUOp Type Reg Reg ExtendOp) Reg) (rule (alu_rrr_extend op ty src1 src2 extend) @@ -1741,7 +1772,7 @@ dst)) ;; Helper for emitting `MInst.MovFromVec` instructions. -(decl mov_from_vec (Reg u8 VectorSize) Reg) +(decl mov_from_vec (Reg u8 ScalarSize) Reg) (rule (mov_from_vec rn idx size) (let ((dst WritableReg (temp_writable_reg $I64)) (_ Unit (emit (MInst.MovFromVec dst rn idx size)))) @@ -1817,6 +1848,22 @@ (MInst.CSNeg dst cond if_true if_false) dst))) +;; Helper for generating `MInst.CCmpImm` instructions. +(decl ccmp_imm (OperandSize u8 Reg UImm5 NZCV Cond) ConsumesFlags) +(rule (ccmp_imm size 1 rn imm nzcv cond) + (let ((dst WritableReg (temp_writable_reg $I64))) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs + (MInst.CCmpImm size rn imm nzcv cond) + (MInst.CSet dst cond) + (value_reg dst)))) + +(rule (ccmp_imm size _ty_bits rn imm nzcv cond) + (let ((dst WritableReg (temp_writable_reg $I64))) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs + (MInst.CCmpImm size rn imm nzcv cond) + (MInst.CSetm dst cond) + (value_reg dst)))) + ;; Helpers for generating `add` instructions. 
(decl add (Type Reg Reg) Reg) diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index ee5e3774ae9a..7ce8a048d183 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -620,7 +620,7 @@ impl ScalarSize { /// Convert to an integer operand size. pub fn operand_size(&self) -> OperandSize { match self { - ScalarSize::Size32 => OperandSize::Size32, + ScalarSize::Size8 | ScalarSize::Size16 | ScalarSize::Size32 => OperandSize::Size32, ScalarSize::Size64 => OperandSize::Size64, _ => panic!("Unexpected operand_size request for: {:?}", self), } @@ -687,8 +687,11 @@ impl VectorSize { debug_assert!(ty.is_vector()); match ty { + B8X8 => VectorSize::Size8x8, B8X16 => VectorSize::Size8x16, + B16X4 => VectorSize::Size16x4, B16X8 => VectorSize::Size16x8, + B32X2 => VectorSize::Size32x2, B32X4 => VectorSize::Size32x4, B64X2 => VectorSize::Size64x2, F32X2 => VectorSize::Size32x2, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 1fef41ce4cbc..ab210acda8fe 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1790,6 +1790,7 @@ impl MachInstEmit for Inst { } &Inst::FpuRRRR { fpu_op, + size, rd, rn, rm, @@ -1800,9 +1801,9 @@ impl MachInstEmit for Inst { let rm = allocs.next(rm); let ra = allocs.next(ra); let top17 = match fpu_op { - FPUOp3::MAdd32 => 0b000_11111_00_0_00000_0, - FPUOp3::MAdd64 => 0b000_11111_01_0_00000_0, + FPUOp3::MAdd => 0b000_11111_00_0_00000_0, }; + let top17 = top17 | size.ftype() << 7; sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); } &Inst::VecMisc { op, rd, rn, size } => { @@ -2209,11 +2210,11 @@ impl MachInstEmit for Inst { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); let (q, imm5, shift, mask) = match size { - VectorSize::Size8x16 => (0b0, 0b00001, 1, 0b1111), - VectorSize::Size16x8 => (0b0, 0b00010, 2, 0b0111), - VectorSize::Size32x4 => (0b0, 0b00100, 3, 0b0011), - VectorSize::Size64x2 => (0b1, 0b01000, 4, 0b0001), - _ => unreachable!(), + ScalarSize::Size8 => (0b0, 0b00001, 1, 0b1111), + ScalarSize::Size16 => (0b0, 0b00010, 2, 0b0111), + ScalarSize::Size32 => (0b0, 0b00100, 3, 0b0011), + ScalarSize::Size64 => (0b1, 0b01000, 4, 0b0001), + _ => panic!("Unexpected scalar FP operand size: {:?}", size), }; debug_assert_eq!(idx & mask, idx); let imm5 = imm5 | ((idx as u32) << shift); @@ -2542,7 +2543,8 @@ impl MachInstEmit for Inst { | VecALUOp::Fdiv | VecALUOp::Fmax | VecALUOp::Fmin - | VecALUOp::Fmul => true, + | VecALUOp::Fmul + | VecALUOp::Fmla => true, _ => false, }; let enc_float_size = match (is_float, size) { @@ -2617,6 +2619,7 @@ impl MachInstEmit for Inst { VecALUOp::Fmax => (0b000_01110_00_1, 0b111101), VecALUOp::Fmin => (0b000_01110_10_1, 0b111101), VecALUOp::Fmul => (0b001_01110_00_1, 0b110111), + VecALUOp::Fmla => (0b000_01110_00_1, 0b110011), VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111), VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110), VecALUOp::Sqrdmulh => { diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 2439b96bfaec..01d3e0fe48b5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -2266,7 +2266,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(3), rn: vreg(27), idx: 14, - size: VectorSize::Size8x16, + size: ScalarSize::Size8, }, 
"633F1D0E", "umov w3, v27.b[14]", @@ -2276,7 +2276,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(24), rn: vreg(5), idx: 3, - size: VectorSize::Size16x8, + size: ScalarSize::Size16, }, "B83C0E0E", "umov w24, v5.h[3]", @@ -2286,7 +2286,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(12), rn: vreg(17), idx: 1, - size: VectorSize::Size32x4, + size: ScalarSize::Size32, }, "2C3E0C0E", "mov w12, v17.s[1]", @@ -2296,7 +2296,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(21), rn: vreg(20), idx: 0, - size: VectorSize::Size64x2, + size: ScalarSize::Size64, }, "953E084E", "mov x21, v20.d[0]", @@ -4054,6 +4054,42 @@ fn test_aarch64_binemit() { "fmul v2.2d, v0.2d, v5.2d", )); + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmla, + rd: writable_vreg(2), + rn: vreg(0), + rm: vreg(5), + size: VectorSize::Size32x2, + }, + "02CC250E", + "fmla v2.2s, v0.2s, v5.2s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmla, + rd: writable_vreg(2), + rn: vreg(0), + rm: vreg(5), + size: VectorSize::Size32x4, + }, + "02CC254E", + "fmla v2.4s, v0.4s, v5.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmla, + rd: writable_vreg(2), + rn: vreg(0), + rm: vreg(5), + size: VectorSize::Size64x2, + }, + "02CC654E", + "fmla v2.2d, v0.2d, v5.2d", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Addp, @@ -5911,7 +5947,8 @@ fn test_aarch64_binemit() { insns.push(( Inst::FpuRRRR { - fpu_op: FPUOp3::MAdd32, + fpu_op: FPUOp3::MAdd, + size: ScalarSize::Size32, rd: writable_vreg(15), rn: vreg(30), rm: vreg(31), @@ -5923,7 +5960,8 @@ fn test_aarch64_binemit() { insns.push(( Inst::FpuRRRR { - fpu_op: FPUOp3::MAdd64, + fpu_op: FPUOp3::MAdd, + size: ScalarSize::Size64, rd: writable_vreg(15), rn: vreg(30), rm: vreg(31), diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs index 47a30b40a3de..c18737693b96 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs @@ -292,14 +292,6 @@ impl Imm12 { } } - /// Create a zero immediate of this format. - pub fn zero() -> Self { - Imm12 { - bits: 0, - shift12: false, - } - } - /// Bits for 2-bit "shift" field in e.g. AddI. pub fn shift_bits(&self) -> u32 { if self.shift12 { diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index d3b141dd70b6..a35e97e1c59a 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -960,7 +960,7 @@ fn aarch64_get_operands VReg>(inst: &Inst, collector: &mut Operan &Inst::VecRRR { alu_op, rd, rn, rm, .. 
} => { - if alu_op == VecALUOp::Bsl { + if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla { collector.reg_mod(rd); } else { collector.reg_def(rd); @@ -1705,7 +1705,7 @@ impl Inst { } &Inst::FpuMoveFromVec { rd, rn, idx, size } => { let rd = pretty_print_vreg_scalar(rd.to_reg(), size.lane_size(), allocs); - let rn = pretty_print_vreg_element(rn, idx as usize, size, allocs); + let rn = pretty_print_vreg_element(rn, idx as usize, size.lane_size(), allocs); format!("mov {}, {}", rd, rn) } &Inst::FpuExtend { rd, rn, size } => { @@ -1777,14 +1777,14 @@ impl Inst { } &Inst::FpuRRRR { fpu_op, + size, rd, rn, rm, ra, } => { - let (op, size) = match fpu_op { - FPUOp3::MAdd32 => ("fmadd", ScalarSize::Size32), - FPUOp3::MAdd64 => ("fmadd", ScalarSize::Size64), + let op = match fpu_op { + FPUOp3::MAdd => "fmadd", }; let rd = pretty_print_vreg_scalar(rd.to_reg(), size, allocs); let rn = pretty_print_vreg_scalar(rn, size, allocs); @@ -1965,16 +1965,17 @@ impl Inst { format!("fmov {}, {}", rd, imm) } &Inst::MovToVec { rd, rn, idx, size } => { - let rd = pretty_print_vreg_element(rd.to_reg(), idx as usize, size, allocs); + let rd = + pretty_print_vreg_element(rd.to_reg(), idx as usize, size.lane_size(), allocs); let rn = pretty_print_ireg(rn, size.operand_size(), allocs); format!("mov {}, {}", rd, rn) } &Inst::MovFromVec { rd, rn, idx, size } => { let op = match size { - VectorSize::Size8x16 => "umov", - VectorSize::Size16x8 => "umov", - VectorSize::Size32x4 => "mov", - VectorSize::Size64x2 => "mov", + ScalarSize::Size8 => "umov", + ScalarSize::Size16 => "umov", + ScalarSize::Size32 => "mov", + ScalarSize::Size64 => "mov", _ => unimplemented!(), }; let rd = pretty_print_ireg(rd.to_reg(), size.operand_size(), allocs); @@ -1989,7 +1990,7 @@ impl Inst { scalar_size, } => { let rd = pretty_print_ireg(rd.to_reg(), scalar_size, allocs); - let rn = pretty_print_vreg_element(rn, idx as usize, size, allocs); + let rn = pretty_print_vreg_element(rn, idx as usize, size.lane_size(), allocs); format!("smov {}, {}", rd, rn) } &Inst::VecDup { rd, rn, size } => { @@ -1999,7 +2000,7 @@ impl Inst { } &Inst::VecDupFromFpu { rd, rn, size } => { let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); - let rn = pretty_print_vreg_element(rn, 0, size, allocs); + let rn = pretty_print_vreg_element(rn, 0, size.lane_size(), allocs); format!("dup {}, {}", rd, rn) } &Inst::VecDupFPImm { rd, imm, size } => { @@ -2075,8 +2076,13 @@ impl Inst { src_idx, size, } => { - let rd = pretty_print_vreg_element(rd.to_reg(), dest_idx as usize, size, allocs); - let rn = pretty_print_vreg_element(rn, src_idx as usize, size, allocs); + let rd = pretty_print_vreg_element( + rd.to_reg(), + dest_idx as usize, + size.lane_size(), + allocs, + ); + let rn = pretty_print_vreg_element(rn, src_idx as usize, size.lane_size(), allocs); format!("mov {}, {}", rd, rn) } &Inst::VecRRLong { @@ -2220,6 +2226,7 @@ impl Inst { VecALUOp::Fmax => ("fmax", size), VecALUOp::Fmin => ("fmin", size), VecALUOp::Fmul => ("fmul", size), + VecALUOp::Fmla => ("fmla", size), VecALUOp::Addp => ("addp", size), VecALUOp::Zip1 => ("zip1", size), VecALUOp::Sqrdmulh => ("sqrdmulh", size), diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index fbae85ecb7e9..3c1114a5153b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -331,14 +331,15 @@ pub fn show_vreg_vector(reg: Reg, size: VectorSize) -> String { } /// Show an indexed vector element. 
-pub fn show_vreg_element(reg: Reg, idx: u8, size: VectorSize) -> String { +pub fn show_vreg_element(reg: Reg, idx: u8, size: ScalarSize) -> String { assert_eq!(RegClass::Float, reg.class()); let s = show_reg(reg); let suffix = match size { - VectorSize::Size8x8 | VectorSize::Size8x16 => ".b", - VectorSize::Size16x4 | VectorSize::Size16x8 => ".h", - VectorSize::Size32x2 | VectorSize::Size32x4 => ".s", - VectorSize::Size64x2 => ".d", + ScalarSize::Size8 => ".b", + ScalarSize::Size16 => ".h", + ScalarSize::Size32 => ".s", + ScalarSize::Size64 => ".d", + _ => panic!("Unexpected vector element size: {:?}", size), }; format!("{}{}[{}]", s, suffix, idx) } @@ -373,7 +374,7 @@ pub fn pretty_print_vreg_vector( pub fn pretty_print_vreg_element( reg: Reg, idx: usize, - size: VectorSize, + size: ScalarSize, allocs: &mut AllocationConsumer<'_>, ) -> String { let reg = allocs.next(reg); diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 808a7324a577..28094cd5251a 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -132,6 +132,69 @@ (rule (lower (scalar_to_vector x @ (value_type (int_bool_fits_in_32 _)))) (mov_to_fpu (put_in_reg_zext32 x) (ScalarSize.Size32))) +;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; cmeq vtmp.2d, vm.2d, #0 +;; addp dtmp, vtmp.2d +;; fcmp dtmp, dtmp +;; cset xd, eq +;; +;; Note that after the ADDP the value of the temporary register will be either +;; 0 when all input elements are true, i.e. non-zero, or a NaN otherwise +;; (either -1 or -2 when represented as an integer); NaNs are the only +;; floating-point numbers that compare unequal to themselves. +(rule (lower (has_type out_ty (vall_true x @ (value_type (multi_lane 64 2))))) + (let ((x1 Reg (cmeq0 x (VectorSize.Size64x2))) + (x2 Reg (addp x1 x1 (VectorSize.Size64x2)))) + (with_flags (fpu_cmp (ScalarSize.Size64) x2 x2) + (materialize_bool_result (ty_bits out_ty) (Cond.Eq))))) + +(rule (lower (has_type out_ty (vall_true x @ (value_type (multi_lane 32 2))))) + (let ((x1 Reg (mov_from_vec x 0 (ScalarSize.Size64)))) + (with_flags (cmp_rr_shift (OperandSize.Size64) (zero_reg) x1 32) + (ccmp_imm + (OperandSize.Size32) + (ty_bits out_ty) + x1 + (u8_into_uimm5 0) + (nzcv $false $true $false $false) + (Cond.Ne))))) + +;; This operation is implemented by using uminv to create a scalar value, which +;; is then compared against zero. +;; +;; uminv bn, vm.16b +;; mov xm, vn.d[0] +;; cmp xm, #0 +;; cset xm, ne +(rule (lower (has_type out_ty (vall_true x @ (value_type (lane_fits_in_32 ty))))) + (if (not_vec32x2 ty)) + (let ((x1 Reg (vec_lanes (VecLanesOp.Uminv) x (vector_size ty))) + (x2 Reg (mov_from_vec x1 0 (ScalarSize.Size64)))) + (with_flags (cmp_imm (OperandSize.Size64) x2 (u8_into_imm12 0)) + (materialize_bool_result (ty_bits out_ty) (Cond.Ne))))) + +;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; This operation is implemented by using umaxp to create a scalar value, which +;; is then compared against zero. 
+;; +;; umaxp vn.4s, vm.4s, vm.4s +;; mov xm, vn.d[0] +;; cmp xm, #0 +;; cset xm, ne +(rule (lower (vany_true x @ (value_type (ty_vec128 ty)))) + (let ((x1 Reg (vec_rrr (VecALUOp.Umaxp) x x (VectorSize.Size32x4))) + (x2 Reg (mov_from_vec x1 0 (ScalarSize.Size64)))) + (with_flags (cmp_imm (OperandSize.Size64) x2 (u8_into_imm12 0)) + (materialize_bool_result (ty_bits ty) (Cond.Ne))))) + +(rule (lower (vany_true x @ (value_type ty))) + (if (ty_vec64 ty)) + (let ((x1 Reg (mov_from_vec x 0 (ScalarSize.Size64)))) + (with_flags (cmp_imm (OperandSize.Size64) x1 (u8_into_imm12 0)) + (materialize_bool_result (ty_bits ty) (Cond.Ne))))) + ;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low x) (swiden_high y)))) @@ -308,6 +371,13 @@ (rule (lower (has_type $F64 (nearest x))) (fpu_round (FpuRoundMode.Nearest64) x)) +;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z))) + (vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fma x y z))) + (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z)) ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -726,7 +796,7 @@ (rule (lower (has_type (fits_in_64 out) (uextend (extractlane vec @ (value_type in) (u8_from_uimm8 lane))))) - (mov_from_vec (put_in_reg vec) lane (vector_size in))) + (mov_from_vec (put_in_reg vec) lane (lane_size in))) ;; Atomic loads will also automatically zero their upper bits so the `uextend` ;; instruction can effectively get skipped here. @@ -744,7 +814,7 @@ (rule (lower (has_type $I128 (uextend (extractlane vec @ (value_type in) (u8_from_uimm8 lane))))) - (value_regs (mov_from_vec (put_in_reg vec) lane (vector_size in)) (imm $I64 (ImmExtend.Zero) 0))) + (value_regs (mov_from_vec (put_in_reg vec) lane (lane_size in)) (imm $I64 (ImmExtend.Zero) 0))) ;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -790,7 +860,7 @@ (u8_from_uimm8 lane))))) (let ((lo Reg (mov_from_vec (put_in_reg vec) lane - (VectorSize.Size64x2))) + (ScalarSize.Size64))) (hi Reg (asr_imm $I64 lo (imm_shift_from_u8 63)))) (value_regs lo hi))) @@ -1404,26 +1474,26 @@ (rule (lower (has_type $I8 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8)))) - (mov_from_vec nbits 0 (VectorSize.Size8x16)))) + (mov_from_vec nbits 0 (ScalarSize.Size8)))) ;; Note that this uses `addp` instead of `addv` as it's usually cheaper. 
(rule (lower (has_type $I16 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8))) (added Reg (addp nbits nbits (VectorSize.Size8x8)))) - (mov_from_vec added 0 (VectorSize.Size8x16)))) + (mov_from_vec added 0 (ScalarSize.Size8)))) (rule (lower (has_type $I32 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8))) (added Reg (addv nbits (VectorSize.Size8x8)))) - (mov_from_vec added 0 (VectorSize.Size8x16)))) + (mov_from_vec added 0 (ScalarSize.Size8)))) (rule (lower (has_type $I64 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size64))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8))) (added Reg (addv nbits (VectorSize.Size8x8)))) - (mov_from_vec added 0 (VectorSize.Size8x16)))) + (mov_from_vec added 0 (ScalarSize.Size8)))) (rule (lower (has_type $I128 (popcnt x))) (let ((val ValueRegs x) @@ -1431,7 +1501,7 @@ (tmp Reg (mov_to_vec tmp_half (value_regs_get val 1) 1 (VectorSize.Size64x2))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x16))) (added Reg (addv nbits (VectorSize.Size8x16)))) - (value_regs (mov_from_vec added 0 (VectorSize.Size8x16)) (imm $I64 (ImmExtend.Zero) 0)))) + (value_regs (mov_from_vec added 0 (ScalarSize.Size8)) (imm $I64 (ImmExtend.Zero) 0)))) (rule (lower (has_type $I8X16 (popcnt x))) (vec_cnt x (VectorSize.Size8x16))) diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index 7d23b9c31198..f49bcf175f03 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -106,6 +106,16 @@ where ImmShift::maybe_from_u64(n.into()).unwrap() } + fn lshr_from_u64(&mut self, ty: Type, n: u64) -> Option { + let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?; + if let Ok(bits) = u8::try_from(ty_bits(ty)) { + let shiftimm = shiftimm.mask(bits); + Some(ShiftOpAndAmt::new(ShiftOp::LSR, shiftimm)) + } else { + None + } + } + fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option { let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?; let shiftee_bits = ty_bits(ty); diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 842342d5b921..57f522b31b03 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -457,7 +457,7 @@ pub(crate) fn lower_insn_to_regs>( } (true, false) => { let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true); + let size = ScalarSize::from_bits(oty_bits); ctx.emit(Inst::MovFromVec { rd, @@ -685,7 +685,12 @@ pub(crate) fn lower_insn_to_regs>( let ty = ty.unwrap(); if ty_has_int_representation(ty) { - ctx.emit(Inst::MovFromVec { rd, rn, idx, size }); + ctx.emit(Inst::MovFromVec { + rd, + rn, + idx, + size: size.lane_size(), + }); // Plain moves are faster on some processors. 
} else if idx == 0 { ctx.emit(Inst::gen_move(rd, rn, ty)); @@ -729,115 +734,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::ScalarToVector => implemented_in_isle(ctx), - Opcode::VallTrue if ctx.input_ty(insn, 0).lane_bits() == 64 => { - let input_ty = ctx.input_ty(insn, 0); - - if input_ty.lane_count() != 2 { - return Err(CodegenError::Unsupported(format!( - "VallTrue: unsupported type {:?}", - input_ty - ))); - } - - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap(); - - // cmeq vtmp.2d, vm.2d, #0 - // addp dtmp, vtmp.2d - // fcmp dtmp, dtmp - // cset xd, eq - // - // Note that after the ADDP the value of the temporary register will - // be either 0 when all input elements are true, i.e. non-zero, or a - // NaN otherwise (either -1 or -2 when represented as an integer); - // NaNs are the only floating-point numbers that compare unequal to - // themselves. - - ctx.emit(Inst::VecMisc { - op: VecMisc2::Cmeq0, - rd: tmp, - rn: rm, - size: VectorSize::Size64x2, - }); - ctx.emit(Inst::VecRRPair { - op: VecPairOp::Addp, - rd: tmp, - rn: tmp.to_reg(), - }); - ctx.emit(Inst::FpuCmp { - size: ScalarSize::Size64, - rn: tmp.to_reg(), - rm: tmp.to_reg(), - }); - materialize_bool_result(ctx, insn, rd, Cond::Eq); - } - - Opcode::VanyTrue | Opcode::VallTrue => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let src_ty = ctx.input_ty(insn, 0); - let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap(); - - // This operation is implemented by using umaxp or uminv to - // create a scalar value, which is then compared against zero. - // - // umaxp vn.16b, vm.16, vm.16 / uminv bn, vm.16b - // mov xm, vn.d[0] - // cmp xm, #0 - // cset xm, ne - - let s = VectorSize::from_ty(src_ty); - let size = if s == VectorSize::Size64x2 { - // `vall_true` with 64-bit elements is handled elsewhere. 
- debug_assert_ne!(op, Opcode::VallTrue); - - VectorSize::Size32x4 - } else { - s - }; - - if op == Opcode::VanyTrue { - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Umaxp, - rd: tmp, - rn: rm, - rm, - size, - }); - } else { - if size == VectorSize::Size32x2 { - return Err(CodegenError::Unsupported(format!( - "VallTrue: Unsupported type: {:?}", - src_ty - ))); - } - - ctx.emit(Inst::VecLanes { - op: VecLanesOp::Uminv, - rd: tmp, - rn: rm, - size, - }); - }; - - ctx.emit(Inst::MovFromVec { - rd, - rn: tmp.to_reg(), - idx: 0, - size: VectorSize::Size64x2, - }); - - ctx.emit(Inst::AluRRImm12 { - alu_op: ALUOp::SubS, - size: OperandSize::Size64, - rd: writable_zero_reg(), - rn: rd.to_reg(), - imm12: Imm12::zero(), - }); - - materialize_bool_result(ctx, insn, rd, Cond::Ne); - } + Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx), Opcode::VhighBits => { let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); @@ -904,7 +801,7 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: tmp_v0.to_reg(), idx: 0, - size: VectorSize::Size16x8, + size: ScalarSize::Size16, }); } I16X8 => { @@ -962,7 +859,7 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: tmp_v0.to_reg(), idx: 0, - size: VectorSize::Size16x8, + size: ScalarSize::Size16, }); } I32X4 => { @@ -1018,7 +915,7 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: tmp_v0.to_reg(), idx: 0, - size: VectorSize::Size32x4, + size: ScalarSize::Size32, }); } I64X2 => { @@ -1031,13 +928,13 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: src_v, idx: 0, - size: VectorSize::Size64x2, + size: ScalarSize::Size64, }); ctx.emit(Inst::MovFromVec { rd: tmp_r0, rn: src_v, idx: 1, - size: VectorSize::Size64x2, + size: ScalarSize::Size64, }); ctx.emit(Inst::AluRRImmShift { alu_op: ALUOp::Lsr, @@ -1166,31 +1063,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => implemented_in_isle(ctx), - Opcode::Fma => { - let ty = ty.unwrap(); - let bits = ty_bits(ty); - let fpu_op = match bits { - 32 => FPUOp3::MAdd32, - 64 => FPUOp3::MAdd64, - _ => { - return Err(CodegenError::Unsupported(format!( - "Fma: Unsupported type: {:?}", - ty - ))) - } - }; - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::FpuRRRR { - fpu_op, - rn, - rm, - ra, - rd, - }); - } + Opcode::Fma => implemented_in_isle(ctx), Opcode::Fcopysign => { // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence: diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 695b02fc3837..1914c85aa3aa 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -357,6 +357,15 @@ macro_rules! isle_prelude_methods { } } + #[inline] + fn ty_vec64(&mut self, ty: Type) -> Option { + if ty.is_vector() && ty.bits() == 64 { + Some(ty) + } else { + None + } + } + #[inline] fn ty_vec128(&mut self, ty: Type) -> Option { if ty.is_vector() && ty.bits() == 128 { @@ -588,6 +597,14 @@ macro_rules! 
isle_prelude_methods { } } + fn not_vec32x2(&mut self, ty: Type) -> Option { + if ty.lane_bits() == 32 && ty.lane_count() == 2 { + None + } else { + Some(ty) + } + } + fn not_i64x2(&mut self, ty: Type) -> Option<()> { if ty == I64X2 { None diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 661ecb9fed93..877fc4d9b792 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -264,8 +264,11 @@ (extern const $B32X4 Type) (extern const $B64X2 Type) +(extern const $I8X8 Type) (extern const $I8X16 Type) +(extern const $I16X4 Type) (extern const $I16X8 Type) +(extern const $I32X2 Type) (extern const $I32X4 Type) (extern const $I64X2 Type) @@ -359,6 +362,10 @@ (decl ty_scalar_float (Type) Type) (extern extractor ty_scalar_float ty_scalar_float) +;; A pure constructor that only matches 64-bit vector types. +(decl pure ty_vec64 (Type) Type) +(extern constructor ty_vec64 ty_vec64) + ;; An extractor that only matches 128-bit vector types. (decl ty_vec128 (Type) Type) (extern extractor ty_vec128 ty_vec128) @@ -373,7 +380,11 @@ (decl ty_vec128_int (Type) Type) (extern extractor ty_vec128_int ty_vec128_int) -;; An extractor that matches everything except i64x2 +;; A pure constructor that matches everything except vectors with size 32X2. +(decl pure not_vec32x2 (Type) Type) +(extern constructor not_vec32x2 not_vec32x2) + +;; An extractor that matches everything except I64X2 (decl not_i64x2 () Type) (extern extractor not_i64x2 not_i64x2) diff --git a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif index 8a4412a8511b..fc7df58b2fd9 100644 --- a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif +++ b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif @@ -910,3 +910,39 @@ block0(v0: f64x2): ; block0: ; frintn v0.2d, v0.2d ; ret + +function %f78(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; mov v17.16b, v0.16b +; mov v0.16b, v2.16b +; fmla v0.4s, v17.4s, v1.4s +; ret + +function %f79(f32x2, f32x2, f32x2) -> f32x2 { +block0(v0: f32x2, v1: f32x2, v2: f32x2): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; mov v17.16b, v0.16b +; mov v0.16b, v2.16b +; fmla v0.2s, v17.2s, v1.2s +; ret + +function %f80(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; mov v17.16b, v0.16b +; mov v0.16b, v2.16b +; fmla v0.2d, v17.2d, v1.2d +; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif b/cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif new file mode 100644 index 000000000000..c969b1e9be86 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif @@ -0,0 +1,94 @@ +test compile precise-output +set unwind_info=false +target aarch64 + +function %fn0(b8x8) -> b1 { +block0(v0: b8x8): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv b3, v0.8b +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn1(b8x16) -> b1 { +block0(v0: b8x16): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv b3, v0.16b +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn2(b16x4) -> b1 { +block0(v0: b16x4): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv h3, v0.4h +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn3(b16x8) -> b1 { +block0(v0: b16x8): + v1 = vall_true v0 + return 
v1 +} + +; block0: +; uminv h3, v0.8h +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn4(b32x2) -> b1 { +block0(v0: b32x2): + v1 = vall_true v0 + return v1 +} + +; block0: +; mov x3, v0.d[0] +; subs xzr, xzr, x3, LSR 32 +; ccmp w3, #0, #nZcv, ne +; cset x0, ne +; ret + +function %fn5(b32x4) -> b1 { +block0(v0: b32x4): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv s3, v0.4s +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn6(b64x2) -> b1 { +block0(v0: b64x2): + v1 = vall_true v0 + return v1 +} + +; block0: +; cmeq v3.2d, v0.2d, #0 +; addp v5.2d, v3.2d, v3.2d +; fcmp d5, d5 +; cset x0, eq +; ret diff --git a/cranelift/filetests/filetests/runtests/simd-fma-64bit.clif b/cranelift/filetests/filetests/runtests/simd-fma-64bit.clif new file mode 100644 index 000000000000..5f98b80d8a13 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fma-64bit.clif @@ -0,0 +1,47 @@ +test interpret +test run +target aarch64 +; x86_64 panics: `not implemented: unable to move type: f32x2` + +function %fma_f32x2(f32x2, f32x2, f32x2) -> f32x2 { +block0(v0: f32x2, v1: f32x2, v2: f32x2): + v3 = fma v0, v1, v2 + return v3 +} +; run: %fma_f32x2([0x9.0 0x83.0], [0x9.0 0x2.68091p6], [0x9.0 0x9.88721p1]) == [0x1.680000p6 0x1.3b88e6p14] +; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.0 -0x0.0]) == [0x0.0 0x0.0] +; run: %fma_f32x2([0x0.0 0x0.0], [-0x0.0 -0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0] +; run: %fma_f32x2([-0x0.0 -0x0.0], [0x0.0 0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0] + +; run: %fma_f32x2([-Inf Inf], [-Inf -Inf], [0x0.0 0x0.0]) == [+Inf -Inf] +; run: %fma_f32x2([-Inf Inf], [Inf -Inf], [0x0.0 -Inf]) == [-Inf -Inf] +; run: %fma_f32x2([-Inf -Inf], [Inf Inf], [-Inf -Inf]) == [-Inf -Inf] +; run: %fma_f32x2([-Inf -Inf], [Inf Inf], [-Inf -Inf]) == [-Inf -Inf] + +; F32 Epsilon / Max / Min Positive +; run: %fma_f32x2([0x1.000000p-23 0x0.0], [0x1.000000p-23 0x0.0], [0x1.000000p-23 0x1.000000p-23]) == [0x1.000002p-23 0x1.000000p-23] +; run: %fma_f32x2([0x1.fffffep127 0x0.0], [0x1.fffffep127 0x0.0], [0x1.fffffep127 0x1.fffffep127]) == [+Inf 0x1.fffffep127] +; run: %fma_f32x2([0x1.000000p-126 0x1.000000p-126], [0x1.000000p-126 0x1.000000p-126], [0x1.000000p-126 0x1.000000p-126]) == [0x1.000000p-126 0x1.000000p-126] +; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x1.000000p-126 0x1.000000p-126]) == [0x1.000000p-126 0x1.000000p-126] + +; F32 Subnormals +; run: %fma_f32x2([0x0.800000p-126 0x0.800000p-126], [0x0.800000p-126 0x0.800000p-126], [0x0.800000p-126 0x0.0]) == [0x0.800000p-126 0x0.0] +; run: %fma_f32x2([0x0.0 0x0.000002p-126], [0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.000002p-126]) == [0x0.800000p-126 0x0.000002p-126] +; run: %fma_f32x2([0x0.000002p-126 0x0.000002p-126], [0x0.000002p-126 0x0.000002p-126], [0x0.0 0x0.0]) == [0x0.0 0x0.0] +; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.000002p-126 0x0.000002p-126]) == [0x0.000002p-126 0x0.000002p-126] + +;; The IEEE754 Standard does not make a lot of guarantees about what +;; comes out of NaN producing operations, we just check if its a NaN +function %fma_is_nan_f32x2(f32x2, f32x2, f32x2) -> b1 { +block0(v0: f32x2, v1: f32x2, v2: f32x2): + v3 = fma v0, v1, v2 + v4 = fcmp ne v3, v3 + v5 = vall_true v4 + return v5 +} +; run: %fma_is_nan_f32x2([Inf -Inf], [-Inf Inf], [Inf Inf]) == true +; run: %fma_is_nan_f32x2([-Inf +NaN], [-Inf 0x0.0], [-Inf 0x0.0]) == true +; run: %fma_is_nan_f32x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == true +; run: %fma_is_nan_f32x2([-NaN 0x0.0], [0x0.0 -NaN], 
[0x0.0 0x0.0]) == true +; run: %fma_is_nan_f32x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == true +; run: %fma_is_nan_f32x2([NaN NaN], [NaN NaN], [NaN NaN]) == true diff --git a/cranelift/filetests/filetests/runtests/simd-fma.clif b/cranelift/filetests/filetests/runtests/simd-fma.clif index b5eb7de5b577..cfb1e6b119fc 100644 --- a/cranelift/filetests/filetests/runtests/simd-fma.clif +++ b/cranelift/filetests/filetests/runtests/simd-fma.clif @@ -1,5 +1,7 @@ +test interpret test run target x86_64 has_avx has_fma +target aarch64 function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4, v2: f32x4): diff --git a/cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif b/cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif new file mode 100644 index 000000000000..6085304a4f2d --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif @@ -0,0 +1,58 @@ +test interpret +test run +target aarch64 +; s390x and x86_64 do not support 64-bit vectors. + +function %valltrue_b8x8_f() -> b1 { +block0: + v0 = bconst.b8 false + v1 = splat.b8x8 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b8x8_f() == false + +function %valltrue_b8x8_t() -> b1 { +block0: + v0 = bconst.b8 true + v1 = splat.b8x8 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b8x8_t() == true + +function %valltrue_b16x4_f() -> b1 { +block0: + v0 = bconst.b16 false + v1 = splat.b16x4 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b16x4_f() == false + +function %valltrue_b16x4_t() -> b1 { +block0: + v0 = bconst.b16 true + v1 = splat.b16x4 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b16x4_t() == true + +function %valltrue_b32x2_f() -> b1 { +block0: + v0 = bconst.b32 false + v1 = splat.b32x2 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b32x2_f() == false + +function %valltrue_b32x2_t() -> b1 { +block0: + v0 = bconst.b32 true + v1 = splat.b32x2 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b32x2_t() == true diff --git a/cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif b/cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif new file mode 100644 index 000000000000..8ead6d2d3799 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif @@ -0,0 +1,58 @@ +test interpret +test run +target aarch64 +; s390x and x86_64 do not support 64-bit vectors. 
+ +function %vanytrue_b8x8_f() -> b1 { +block0: + v0 = bconst.b8 false + v1 = splat.b8x8 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b8x8_f() == false + +function %vanytrue_b8x8_t() -> b1 { +block0: + v0 = bconst.b8 true + v1 = splat.b8x8 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b8x8_t() == true + +function %vanytrue_b16x4_f() -> b1 { +block0: + v0 = bconst.b16 false + v1 = splat.b16x4 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b16x4_f() == false + +function %vanytrue_b16x4_t() -> b1 { +block0: + v0 = bconst.b16 true + v1 = splat.b16x4 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b16x4_t() == true + +function %vanytrue_b32x2_f() -> b1 { +block0: + v0 = bconst.b32 false + v1 = splat.b32x2 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b32x2_f() == false + +function %vanytrue_b32x2_t() -> b1 { +block0: + v0 = bconst.b32 true + v1 = splat.b32x2 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b32x2_t() == true diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 50dcf0c5d836..eaff61fd40b9 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -715,10 +715,25 @@ where }; assign(count) } - Opcode::Fcmp => assign(Value::bool( - fcmp(inst.fp_cond_code().unwrap(), &arg(0)?, &arg(1)?)?, - ctrl_ty.as_bool(), - )?), + + Opcode::Fcmp => { + let arg0 = extractlanes(&arg(0)?, ctrl_ty)?; + let arg1 = extractlanes(&arg(1)?, ctrl_ty)?; + + assign(vectorizelanes( + &(arg0 + .into_iter() + .zip(arg1.into_iter()) + .map(|(x, y)| { + V::bool( + fcmp(inst.fp_cond_code().unwrap(), &x, &y).unwrap(), + ctrl_ty.lane_type().as_bool(), + ) + }) + .collect::>>()?), + ctrl_ty, + )?) + } Opcode::Ffcmp => { let arg0 = arg(0)?; let arg1 = arg(1)?; @@ -750,7 +765,21 @@ where Opcode::Fmul => binary(Value::mul, arg(0)?, arg(1)?)?, Opcode::Fdiv => binary(Value::div, arg(0)?, arg(1)?)?, Opcode::Sqrt => assign(Value::sqrt(arg(0)?)?), - Opcode::Fma => assign(Value::fma(arg(0)?, arg(1)?, arg(2)?)?), + Opcode::Fma => { + let arg0 = extractlanes(&arg(0)?, ctrl_ty)?; + let arg1 = extractlanes(&arg(1)?, ctrl_ty)?; + let arg2 = extractlanes(&arg(2)?, ctrl_ty)?; + + assign(vectorizelanes( + &(arg0 + .into_iter() + .zip(arg1.into_iter()) + .zip(arg2.into_iter()) + .map(|((x, y), z)| Value::fma(x, y, z)) + .collect::>>()?), + ctrl_ty, + )?) + } Opcode::Fneg => assign(Value::neg(arg(0)?)?), Opcode::Fabs => assign(Value::abs(arg(0)?)?), Opcode::Fcopysign => binary(Value::copysign, arg(0)?, arg(1)?)?, @@ -1205,8 +1234,8 @@ where let iterations = match lane_type { types::I8 | types::B1 | types::B8 => 1, types::I16 | types::B16 => 2, - types::I32 | types::B32 => 4, - types::I64 | types::B64 => 8, + types::I32 | types::B32 | types::F32 => 4, + types::I64 | types::B64 | types::F64 => 8, _ => unimplemented!("vectors with lanes wider than 64-bits are currently unsupported."), }; @@ -1219,6 +1248,8 @@ where let lane_val: V = if lane_type.is_bool() { Value::bool(lane != 0, lane_type)? + } else if lane_type.is_float() { + Value::float(lane as u64, lane_type)? } else { Value::int(lane, lane_type)? 
}; @@ -1242,8 +1273,8 @@ where let iterations = match lane_type { types::I8 | types::B1 | types::B8 => 1, types::I16 | types::B16 => 2, - types::I32 | types::B32 => 4, - types::I64 | types::B64 => 8, + types::I32 | types::B32 | types::F32 => 4, + types::I64 | types::B64 | types::F64 => 8, _ => unimplemented!("vectors with lanes wider than 64-bits are currently unsupported."), }; let mut result: [u8; 16] = [0; 16]; From 18cf0b4f539416fa67ceae4bcd6e2e9f3a07b715 Mon Sep 17 00:00:00 2001 From: dheaton-arm Date: Fri, 5 Aug 2022 10:03:20 +0100 Subject: [PATCH 2/2] Add comments for `Fmla` and `Bsl` Copyright (c) 2022 Arm Limited --- cranelift/codegen/src/isa/aarch64/inst.isle | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index fdbc25e3a2c0..f853ab711290 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -1109,6 +1109,8 @@ ;; Bitwise exclusive or (Eor) ;; Bitwise select + ;; This opcode should only be used with the `vec_rrr_inplace` + ;; constructor. (Bsl) ;; Unsigned maximum pairwise (Umaxp) @@ -1145,6 +1147,8 @@ ;; Floating-point multiply (Fmul) ;; Floating-point fused multiply-add vectors + ;; This opcode should only be used with the `vec_rrr_inplace` + ;; constructor. (Fmla) ;; Add pairwise (Addp) @@ -1502,9 +1506,9 @@ ;; Helper for emitting `MInst.VecRRR` instructions which use three registers, ;; one of which is both source and output. (decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg) -(rule (vec_rrr_inplace op src_dst src2 src3 size) +(rule (vec_rrr_inplace op src1 src2 src3 size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_1 Unit (emit (MInst.FpuMove128 dst src_dst))) + (_1 Unit (emit (MInst.FpuMove128 dst src1))) (_2 Unit (emit (MInst.VecRRR op dst src2 src3 size)))) dst))