From 4ea15391706f176d7e5943f6a004306e8d9288fc Mon Sep 17 00:00:00 2001
From: dheaton-arm
Date: Wed, 13 Jul 2022 16:10:07 +0100
Subject: [PATCH 1/2] Convert `fma`, `valltrue` & `vanytrue` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `fma`
  - Introduced missing support for `fma` on vector values, as per the docs.
- `valltrue`
- `vanytrue`

Also fixed `fcmp` on vector values in the interpreter, and enabled
interpreter tests in `simd-fma.clif`.

This introduces the `FMLA` machine instruction.

Copyright (c) 2022 Arm Limited
---
 cranelift/codegen/src/isa/aarch64/inst.isle   |  55 ++++++-
 .../codegen/src/isa/aarch64/inst/args.rs      |   5 +-
 .../codegen/src/isa/aarch64/inst/emit.rs      |  19 ++-
 .../src/isa/aarch64/inst/emit_tests.rs        |  50 +++++-
 .../codegen/src/isa/aarch64/inst/imms.rs      |   8 -
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  35 ++--
 .../codegen/src/isa/aarch64/inst/regs.rs      |  13 +-
 cranelift/codegen/src/isa/aarch64/lower.isle  |  86 +++++++++-
 .../codegen/src/isa/aarch64/lower/isle.rs     |  10 ++
 .../codegen/src/isa/aarch64/lower_inst.rs     | 155 ++----------------
 cranelift/codegen/src/machinst/isle.rs        |  17 ++
 cranelift/codegen/src/prelude.isle            |  13 +-
 .../filetests/isa/aarch64/floating-point.clif |  36 ++++
 .../filetests/isa/aarch64/simd-valltrue.clif  |  94 +++++++++++
 .../filetests/runtests/simd-fma-64bit.clif    |  47 ++++++
 .../filetests/runtests/simd-fma.clif          |   2 +
 .../runtests/simd-valltrue-64bit.clif         |  58 +++++++
 .../runtests/simd-vanytrue-64bit.clif         |  58 +++++++
 cranelift/interpreter/src/step.rs             |  49 +++++-
 19 files changed, 604 insertions(+), 206 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-fma-64bit.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index e81868bccc9f..fdbc25e3a2c0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -335,8 +335,10 @@
       (rn Reg))
 
     ;; 3-op FPU instruction.
+    ;; 16-bit scalars require half-precision floating-point support (FEAT_FP16).
     (FpuRRRR
       (fpu_op FPUOp3)
+      (size ScalarSize)
       (rd WritableReg)
       (rn Reg)
       (rm Reg)
@@ -478,7 +480,7 @@
       (rd WritableReg)
       (rn Reg)
       (idx u8)
-      (size VectorSize))
+      (size ScalarSize))
 
     ;; Signed move from a vector element to a GPR.
     (MovFromVecSigned
@@ -1011,8 +1013,7 @@
 
 ;; A floating-point unit (FPU) operation with three args.
 (type FPUOp3 (enum
-  (MAdd32)
-  (MAdd64)
+  (MAdd)
 ))
 
 ;; A conversion from an FP to an integer value.
@@ -1143,6 +1144,8 @@ (Fmin) ;; Floating-point multiply (Fmul) + ;; Floating-point fused multiply-add vectors + (Fmla) ;; Add pairwise (Addp) ;; Zip vectors (primary) [meaning, high halves] @@ -1364,6 +1367,9 @@ (decl imm12_from_negated_u64 (Imm12) u64) (extern extractor imm12_from_negated_u64 imm12_from_negated_u64) +(decl pure lshr_from_u64 (Type u64) ShiftOpAndAmt) +(extern constructor lshr_from_u64 lshr_from_u64) + (decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt) (extern constructor lshl_from_imm64 lshl_from_imm64) @@ -1491,6 +1497,15 @@ (rule (fpu_rr op src size) (let ((dst WritableReg (temp_writable_reg $F64)) (_ Unit (emit (MInst.FpuRR op size dst src)))) + dst)) + +;; Helper for emitting `MInst.VecRRR` instructions which use three registers, +;; one of which is both source and output. +(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg) +(rule (vec_rrr_inplace op src_dst src2 src3 size) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_1 Unit (emit (MInst.FpuMove128 dst src_dst))) + (_2 Unit (emit (MInst.VecRRR op dst src2 src3 size)))) dst)) ;; Helper for emitting `MInst.FpuRRR` instructions. @@ -1500,6 +1515,13 @@ (_ Unit (emit (MInst.FpuRRR op size dst src1 src2)))) dst)) +;; Helper for emitting `MInst.FpuRRRR` instructions. +(decl fpu_rrrr (FPUOp3 ScalarSize Reg Reg Reg) Reg) +(rule (fpu_rrrr size op src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg $F64)) + (_ Unit (emit (MInst.FpuRRRR size op dst src1 src2 src3)))) + dst)) + ;; Helper for emitting `MInst.FpuCmp` instructions. (decl fpu_cmp (ScalarSize Reg Reg) ProducesFlags) (rule (fpu_cmp size rn rm) @@ -1541,6 +1563,15 @@ (_ Unit (emit (MInst.AluRRRShift op (operand_size ty) dst src1 src2 shift)))) dst)) +;; Helper for emitting `cmp` instructions, setting flags, with a right-shifted +;; second operand register. +(decl cmp_rr_shift (OperandSize Reg Reg u64) ProducesFlags) +(rule (cmp_rr_shift size src1 src2 shift_amount) + (if-let shift (lshr_from_u64 $I64 shift_amount)) + (ProducesFlags.ProducesFlagsSideEffect + (MInst.AluRRRShift (ALUOp.SubS) size (writable_zero_reg) + src1 src2 shift))) + ;; Helper for emitting `MInst.AluRRRExtend` instructions. (decl alu_rrr_extend (ALUOp Type Reg Reg ExtendOp) Reg) (rule (alu_rrr_extend op ty src1 src2 extend) @@ -1741,7 +1772,7 @@ dst)) ;; Helper for emitting `MInst.MovFromVec` instructions. -(decl mov_from_vec (Reg u8 VectorSize) Reg) +(decl mov_from_vec (Reg u8 ScalarSize) Reg) (rule (mov_from_vec rn idx size) (let ((dst WritableReg (temp_writable_reg $I64)) (_ Unit (emit (MInst.MovFromVec dst rn idx size)))) @@ -1817,6 +1848,22 @@ (MInst.CSNeg dst cond if_true if_false) dst))) +;; Helper for generating `MInst.CCmpImm` instructions. +(decl ccmp_imm (OperandSize u8 Reg UImm5 NZCV Cond) ConsumesFlags) +(rule (ccmp_imm size 1 rn imm nzcv cond) + (let ((dst WritableReg (temp_writable_reg $I64))) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs + (MInst.CCmpImm size rn imm nzcv cond) + (MInst.CSet dst cond) + (value_reg dst)))) + +(rule (ccmp_imm size _ty_bits rn imm nzcv cond) + (let ((dst WritableReg (temp_writable_reg $I64))) + (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs + (MInst.CCmpImm size rn imm nzcv cond) + (MInst.CSetm dst cond) + (value_reg dst)))) + ;; Helpers for generating `add` instructions. 
(decl add (Type Reg Reg) Reg) diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index ee5e3774ae9a..7ce8a048d183 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -620,7 +620,7 @@ impl ScalarSize { /// Convert to an integer operand size. pub fn operand_size(&self) -> OperandSize { match self { - ScalarSize::Size32 => OperandSize::Size32, + ScalarSize::Size8 | ScalarSize::Size16 | ScalarSize::Size32 => OperandSize::Size32, ScalarSize::Size64 => OperandSize::Size64, _ => panic!("Unexpected operand_size request for: {:?}", self), } @@ -687,8 +687,11 @@ impl VectorSize { debug_assert!(ty.is_vector()); match ty { + B8X8 => VectorSize::Size8x8, B8X16 => VectorSize::Size8x16, + B16X4 => VectorSize::Size16x4, B16X8 => VectorSize::Size16x8, + B32X2 => VectorSize::Size32x2, B32X4 => VectorSize::Size32x4, B64X2 => VectorSize::Size64x2, F32X2 => VectorSize::Size32x2, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 1fef41ce4cbc..ab210acda8fe 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1790,6 +1790,7 @@ impl MachInstEmit for Inst { } &Inst::FpuRRRR { fpu_op, + size, rd, rn, rm, @@ -1800,9 +1801,9 @@ impl MachInstEmit for Inst { let rm = allocs.next(rm); let ra = allocs.next(ra); let top17 = match fpu_op { - FPUOp3::MAdd32 => 0b000_11111_00_0_00000_0, - FPUOp3::MAdd64 => 0b000_11111_01_0_00000_0, + FPUOp3::MAdd => 0b000_11111_00_0_00000_0, }; + let top17 = top17 | size.ftype() << 7; sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); } &Inst::VecMisc { op, rd, rn, size } => { @@ -2209,11 +2210,11 @@ impl MachInstEmit for Inst { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); let (q, imm5, shift, mask) = match size { - VectorSize::Size8x16 => (0b0, 0b00001, 1, 0b1111), - VectorSize::Size16x8 => (0b0, 0b00010, 2, 0b0111), - VectorSize::Size32x4 => (0b0, 0b00100, 3, 0b0011), - VectorSize::Size64x2 => (0b1, 0b01000, 4, 0b0001), - _ => unreachable!(), + ScalarSize::Size8 => (0b0, 0b00001, 1, 0b1111), + ScalarSize::Size16 => (0b0, 0b00010, 2, 0b0111), + ScalarSize::Size32 => (0b0, 0b00100, 3, 0b0011), + ScalarSize::Size64 => (0b1, 0b01000, 4, 0b0001), + _ => panic!("Unexpected scalar FP operand size: {:?}", size), }; debug_assert_eq!(idx & mask, idx); let imm5 = imm5 | ((idx as u32) << shift); @@ -2542,7 +2543,8 @@ impl MachInstEmit for Inst { | VecALUOp::Fdiv | VecALUOp::Fmax | VecALUOp::Fmin - | VecALUOp::Fmul => true, + | VecALUOp::Fmul + | VecALUOp::Fmla => true, _ => false, }; let enc_float_size = match (is_float, size) { @@ -2617,6 +2619,7 @@ impl MachInstEmit for Inst { VecALUOp::Fmax => (0b000_01110_00_1, 0b111101), VecALUOp::Fmin => (0b000_01110_10_1, 0b111101), VecALUOp::Fmul => (0b001_01110_00_1, 0b110111), + VecALUOp::Fmla => (0b000_01110_00_1, 0b110011), VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111), VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110), VecALUOp::Sqrdmulh => { diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 2439b96bfaec..01d3e0fe48b5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -2266,7 +2266,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(3), rn: vreg(27), idx: 14, - size: VectorSize::Size8x16, + size: ScalarSize::Size8, }, 
"633F1D0E", "umov w3, v27.b[14]", @@ -2276,7 +2276,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(24), rn: vreg(5), idx: 3, - size: VectorSize::Size16x8, + size: ScalarSize::Size16, }, "B83C0E0E", "umov w24, v5.h[3]", @@ -2286,7 +2286,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(12), rn: vreg(17), idx: 1, - size: VectorSize::Size32x4, + size: ScalarSize::Size32, }, "2C3E0C0E", "mov w12, v17.s[1]", @@ -2296,7 +2296,7 @@ fn test_aarch64_binemit() { rd: writable_xreg(21), rn: vreg(20), idx: 0, - size: VectorSize::Size64x2, + size: ScalarSize::Size64, }, "953E084E", "mov x21, v20.d[0]", @@ -4054,6 +4054,42 @@ fn test_aarch64_binemit() { "fmul v2.2d, v0.2d, v5.2d", )); + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmla, + rd: writable_vreg(2), + rn: vreg(0), + rm: vreg(5), + size: VectorSize::Size32x2, + }, + "02CC250E", + "fmla v2.2s, v0.2s, v5.2s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmla, + rd: writable_vreg(2), + rn: vreg(0), + rm: vreg(5), + size: VectorSize::Size32x4, + }, + "02CC254E", + "fmla v2.4s, v0.4s, v5.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmla, + rd: writable_vreg(2), + rn: vreg(0), + rm: vreg(5), + size: VectorSize::Size64x2, + }, + "02CC654E", + "fmla v2.2d, v0.2d, v5.2d", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Addp, @@ -5911,7 +5947,8 @@ fn test_aarch64_binemit() { insns.push(( Inst::FpuRRRR { - fpu_op: FPUOp3::MAdd32, + fpu_op: FPUOp3::MAdd, + size: ScalarSize::Size32, rd: writable_vreg(15), rn: vreg(30), rm: vreg(31), @@ -5923,7 +5960,8 @@ fn test_aarch64_binemit() { insns.push(( Inst::FpuRRRR { - fpu_op: FPUOp3::MAdd64, + fpu_op: FPUOp3::MAdd, + size: ScalarSize::Size64, rd: writable_vreg(15), rn: vreg(30), rm: vreg(31), diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs index 47a30b40a3de..c18737693b96 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs @@ -292,14 +292,6 @@ impl Imm12 { } } - /// Create a zero immediate of this format. - pub fn zero() -> Self { - Imm12 { - bits: 0, - shift12: false, - } - } - /// Bits for 2-bit "shift" field in e.g. AddI. pub fn shift_bits(&self) -> u32 { if self.shift12 { diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index d3b141dd70b6..a35e97e1c59a 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -960,7 +960,7 @@ fn aarch64_get_operands VReg>(inst: &Inst, collector: &mut Operan &Inst::VecRRR { alu_op, rd, rn, rm, .. 
} => { - if alu_op == VecALUOp::Bsl { + if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla { collector.reg_mod(rd); } else { collector.reg_def(rd); @@ -1705,7 +1705,7 @@ impl Inst { } &Inst::FpuMoveFromVec { rd, rn, idx, size } => { let rd = pretty_print_vreg_scalar(rd.to_reg(), size.lane_size(), allocs); - let rn = pretty_print_vreg_element(rn, idx as usize, size, allocs); + let rn = pretty_print_vreg_element(rn, idx as usize, size.lane_size(), allocs); format!("mov {}, {}", rd, rn) } &Inst::FpuExtend { rd, rn, size } => { @@ -1777,14 +1777,14 @@ impl Inst { } &Inst::FpuRRRR { fpu_op, + size, rd, rn, rm, ra, } => { - let (op, size) = match fpu_op { - FPUOp3::MAdd32 => ("fmadd", ScalarSize::Size32), - FPUOp3::MAdd64 => ("fmadd", ScalarSize::Size64), + let op = match fpu_op { + FPUOp3::MAdd => "fmadd", }; let rd = pretty_print_vreg_scalar(rd.to_reg(), size, allocs); let rn = pretty_print_vreg_scalar(rn, size, allocs); @@ -1965,16 +1965,17 @@ impl Inst { format!("fmov {}, {}", rd, imm) } &Inst::MovToVec { rd, rn, idx, size } => { - let rd = pretty_print_vreg_element(rd.to_reg(), idx as usize, size, allocs); + let rd = + pretty_print_vreg_element(rd.to_reg(), idx as usize, size.lane_size(), allocs); let rn = pretty_print_ireg(rn, size.operand_size(), allocs); format!("mov {}, {}", rd, rn) } &Inst::MovFromVec { rd, rn, idx, size } => { let op = match size { - VectorSize::Size8x16 => "umov", - VectorSize::Size16x8 => "umov", - VectorSize::Size32x4 => "mov", - VectorSize::Size64x2 => "mov", + ScalarSize::Size8 => "umov", + ScalarSize::Size16 => "umov", + ScalarSize::Size32 => "mov", + ScalarSize::Size64 => "mov", _ => unimplemented!(), }; let rd = pretty_print_ireg(rd.to_reg(), size.operand_size(), allocs); @@ -1989,7 +1990,7 @@ impl Inst { scalar_size, } => { let rd = pretty_print_ireg(rd.to_reg(), scalar_size, allocs); - let rn = pretty_print_vreg_element(rn, idx as usize, size, allocs); + let rn = pretty_print_vreg_element(rn, idx as usize, size.lane_size(), allocs); format!("smov {}, {}", rd, rn) } &Inst::VecDup { rd, rn, size } => { @@ -1999,7 +2000,7 @@ impl Inst { } &Inst::VecDupFromFpu { rd, rn, size } => { let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); - let rn = pretty_print_vreg_element(rn, 0, size, allocs); + let rn = pretty_print_vreg_element(rn, 0, size.lane_size(), allocs); format!("dup {}, {}", rd, rn) } &Inst::VecDupFPImm { rd, imm, size } => { @@ -2075,8 +2076,13 @@ impl Inst { src_idx, size, } => { - let rd = pretty_print_vreg_element(rd.to_reg(), dest_idx as usize, size, allocs); - let rn = pretty_print_vreg_element(rn, src_idx as usize, size, allocs); + let rd = pretty_print_vreg_element( + rd.to_reg(), + dest_idx as usize, + size.lane_size(), + allocs, + ); + let rn = pretty_print_vreg_element(rn, src_idx as usize, size.lane_size(), allocs); format!("mov {}, {}", rd, rn) } &Inst::VecRRLong { @@ -2220,6 +2226,7 @@ impl Inst { VecALUOp::Fmax => ("fmax", size), VecALUOp::Fmin => ("fmin", size), VecALUOp::Fmul => ("fmul", size), + VecALUOp::Fmla => ("fmla", size), VecALUOp::Addp => ("addp", size), VecALUOp::Zip1 => ("zip1", size), VecALUOp::Sqrdmulh => ("sqrdmulh", size), diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index fbae85ecb7e9..3c1114a5153b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -331,14 +331,15 @@ pub fn show_vreg_vector(reg: Reg, size: VectorSize) -> String { } /// Show an indexed vector element. 
-pub fn show_vreg_element(reg: Reg, idx: u8, size: VectorSize) -> String { +pub fn show_vreg_element(reg: Reg, idx: u8, size: ScalarSize) -> String { assert_eq!(RegClass::Float, reg.class()); let s = show_reg(reg); let suffix = match size { - VectorSize::Size8x8 | VectorSize::Size8x16 => ".b", - VectorSize::Size16x4 | VectorSize::Size16x8 => ".h", - VectorSize::Size32x2 | VectorSize::Size32x4 => ".s", - VectorSize::Size64x2 => ".d", + ScalarSize::Size8 => ".b", + ScalarSize::Size16 => ".h", + ScalarSize::Size32 => ".s", + ScalarSize::Size64 => ".d", + _ => panic!("Unexpected vector element size: {:?}", size), }; format!("{}{}[{}]", s, suffix, idx) } @@ -373,7 +374,7 @@ pub fn pretty_print_vreg_vector( pub fn pretty_print_vreg_element( reg: Reg, idx: usize, - size: VectorSize, + size: ScalarSize, allocs: &mut AllocationConsumer<'_>, ) -> String { let reg = allocs.next(reg); diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 808a7324a577..28094cd5251a 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -132,6 +132,69 @@ (rule (lower (scalar_to_vector x @ (value_type (int_bool_fits_in_32 _)))) (mov_to_fpu (put_in_reg_zext32 x) (ScalarSize.Size32))) +;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; cmeq vtmp.2d, vm.2d, #0 +;; addp dtmp, vtmp.2d +;; fcmp dtmp, dtmp +;; cset xd, eq +;; +;; Note that after the ADDP the value of the temporary register will be either +;; 0 when all input elements are true, i.e. non-zero, or a NaN otherwise +;; (either -1 or -2 when represented as an integer); NaNs are the only +;; floating-point numbers that compare unequal to themselves. +(rule (lower (has_type out_ty (vall_true x @ (value_type (multi_lane 64 2))))) + (let ((x1 Reg (cmeq0 x (VectorSize.Size64x2))) + (x2 Reg (addp x1 x1 (VectorSize.Size64x2)))) + (with_flags (fpu_cmp (ScalarSize.Size64) x2 x2) + (materialize_bool_result (ty_bits out_ty) (Cond.Eq))))) + +(rule (lower (has_type out_ty (vall_true x @ (value_type (multi_lane 32 2))))) + (let ((x1 Reg (mov_from_vec x 0 (ScalarSize.Size64)))) + (with_flags (cmp_rr_shift (OperandSize.Size64) (zero_reg) x1 32) + (ccmp_imm + (OperandSize.Size32) + (ty_bits out_ty) + x1 + (u8_into_uimm5 0) + (nzcv $false $true $false $false) + (Cond.Ne))))) + +;; This operation is implemented by using uminv to create a scalar value, which +;; is then compared against zero. +;; +;; uminv bn, vm.16b +;; mov xm, vn.d[0] +;; cmp xm, #0 +;; cset xm, ne +(rule (lower (has_type out_ty (vall_true x @ (value_type (lane_fits_in_32 ty))))) + (if (not_vec32x2 ty)) + (let ((x1 Reg (vec_lanes (VecLanesOp.Uminv) x (vector_size ty))) + (x2 Reg (mov_from_vec x1 0 (ScalarSize.Size64)))) + (with_flags (cmp_imm (OperandSize.Size64) x2 (u8_into_imm12 0)) + (materialize_bool_result (ty_bits out_ty) (Cond.Ne))))) + +;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; This operation is implemented by using umaxp to create a scalar value, which +;; is then compared against zero. 
+;; +;; umaxp vn.4s, vm.4s, vm.4s +;; mov xm, vn.d[0] +;; cmp xm, #0 +;; cset xm, ne +(rule (lower (vany_true x @ (value_type (ty_vec128 ty)))) + (let ((x1 Reg (vec_rrr (VecALUOp.Umaxp) x x (VectorSize.Size32x4))) + (x2 Reg (mov_from_vec x1 0 (ScalarSize.Size64)))) + (with_flags (cmp_imm (OperandSize.Size64) x2 (u8_into_imm12 0)) + (materialize_bool_result (ty_bits ty) (Cond.Ne))))) + +(rule (lower (vany_true x @ (value_type ty))) + (if (ty_vec64 ty)) + (let ((x1 Reg (mov_from_vec x 0 (ScalarSize.Size64)))) + (with_flags (cmp_imm (OperandSize.Size64) x1 (u8_into_imm12 0)) + (materialize_bool_result (ty_bits ty) (Cond.Ne))))) + ;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low x) (swiden_high y)))) @@ -308,6 +371,13 @@ (rule (lower (has_type $F64 (nearest x))) (fpu_round (FpuRoundMode.Nearest64) x)) +;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z))) + (vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fma x y z))) + (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z)) ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -726,7 +796,7 @@ (rule (lower (has_type (fits_in_64 out) (uextend (extractlane vec @ (value_type in) (u8_from_uimm8 lane))))) - (mov_from_vec (put_in_reg vec) lane (vector_size in))) + (mov_from_vec (put_in_reg vec) lane (lane_size in))) ;; Atomic loads will also automatically zero their upper bits so the `uextend` ;; instruction can effectively get skipped here. @@ -744,7 +814,7 @@ (rule (lower (has_type $I128 (uextend (extractlane vec @ (value_type in) (u8_from_uimm8 lane))))) - (value_regs (mov_from_vec (put_in_reg vec) lane (vector_size in)) (imm $I64 (ImmExtend.Zero) 0))) + (value_regs (mov_from_vec (put_in_reg vec) lane (lane_size in)) (imm $I64 (ImmExtend.Zero) 0))) ;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -790,7 +860,7 @@ (u8_from_uimm8 lane))))) (let ((lo Reg (mov_from_vec (put_in_reg vec) lane - (VectorSize.Size64x2))) + (ScalarSize.Size64))) (hi Reg (asr_imm $I64 lo (imm_shift_from_u8 63)))) (value_regs lo hi))) @@ -1404,26 +1474,26 @@ (rule (lower (has_type $I8 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8)))) - (mov_from_vec nbits 0 (VectorSize.Size8x16)))) + (mov_from_vec nbits 0 (ScalarSize.Size8)))) ;; Note that this uses `addp` instead of `addv` as it's usually cheaper. 
(rule (lower (has_type $I16 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8))) (added Reg (addp nbits nbits (VectorSize.Size8x8)))) - (mov_from_vec added 0 (VectorSize.Size8x16)))) + (mov_from_vec added 0 (ScalarSize.Size8)))) (rule (lower (has_type $I32 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8))) (added Reg (addv nbits (VectorSize.Size8x8)))) - (mov_from_vec added 0 (VectorSize.Size8x16)))) + (mov_from_vec added 0 (ScalarSize.Size8)))) (rule (lower (has_type $I64 (popcnt x))) (let ((tmp Reg (mov_to_fpu x (ScalarSize.Size64))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x8))) (added Reg (addv nbits (VectorSize.Size8x8)))) - (mov_from_vec added 0 (VectorSize.Size8x16)))) + (mov_from_vec added 0 (ScalarSize.Size8)))) (rule (lower (has_type $I128 (popcnt x))) (let ((val ValueRegs x) @@ -1431,7 +1501,7 @@ (tmp Reg (mov_to_vec tmp_half (value_regs_get val 1) 1 (VectorSize.Size64x2))) (nbits Reg (vec_cnt tmp (VectorSize.Size8x16))) (added Reg (addv nbits (VectorSize.Size8x16)))) - (value_regs (mov_from_vec added 0 (VectorSize.Size8x16)) (imm $I64 (ImmExtend.Zero) 0)))) + (value_regs (mov_from_vec added 0 (ScalarSize.Size8)) (imm $I64 (ImmExtend.Zero) 0)))) (rule (lower (has_type $I8X16 (popcnt x))) (vec_cnt x (VectorSize.Size8x16))) diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index 7d23b9c31198..f49bcf175f03 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -106,6 +106,16 @@ where ImmShift::maybe_from_u64(n.into()).unwrap() } + fn lshr_from_u64(&mut self, ty: Type, n: u64) -> Option { + let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?; + if let Ok(bits) = u8::try_from(ty_bits(ty)) { + let shiftimm = shiftimm.mask(bits); + Some(ShiftOpAndAmt::new(ShiftOp::LSR, shiftimm)) + } else { + None + } + } + fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option { let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?; let shiftee_bits = ty_bits(ty); diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 842342d5b921..57f522b31b03 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -457,7 +457,7 @@ pub(crate) fn lower_insn_to_regs>( } (true, false) => { let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true); + let size = ScalarSize::from_bits(oty_bits); ctx.emit(Inst::MovFromVec { rd, @@ -685,7 +685,12 @@ pub(crate) fn lower_insn_to_regs>( let ty = ty.unwrap(); if ty_has_int_representation(ty) { - ctx.emit(Inst::MovFromVec { rd, rn, idx, size }); + ctx.emit(Inst::MovFromVec { + rd, + rn, + idx, + size: size.lane_size(), + }); // Plain moves are faster on some processors. 
} else if idx == 0 { ctx.emit(Inst::gen_move(rd, rn, ty)); @@ -729,115 +734,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::ScalarToVector => implemented_in_isle(ctx), - Opcode::VallTrue if ctx.input_ty(insn, 0).lane_bits() == 64 => { - let input_ty = ctx.input_ty(insn, 0); - - if input_ty.lane_count() != 2 { - return Err(CodegenError::Unsupported(format!( - "VallTrue: unsupported type {:?}", - input_ty - ))); - } - - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap(); - - // cmeq vtmp.2d, vm.2d, #0 - // addp dtmp, vtmp.2d - // fcmp dtmp, dtmp - // cset xd, eq - // - // Note that after the ADDP the value of the temporary register will - // be either 0 when all input elements are true, i.e. non-zero, or a - // NaN otherwise (either -1 or -2 when represented as an integer); - // NaNs are the only floating-point numbers that compare unequal to - // themselves. - - ctx.emit(Inst::VecMisc { - op: VecMisc2::Cmeq0, - rd: tmp, - rn: rm, - size: VectorSize::Size64x2, - }); - ctx.emit(Inst::VecRRPair { - op: VecPairOp::Addp, - rd: tmp, - rn: tmp.to_reg(), - }); - ctx.emit(Inst::FpuCmp { - size: ScalarSize::Size64, - rn: tmp.to_reg(), - rm: tmp.to_reg(), - }); - materialize_bool_result(ctx, insn, rd, Cond::Eq); - } - - Opcode::VanyTrue | Opcode::VallTrue => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let src_ty = ctx.input_ty(insn, 0); - let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap(); - - // This operation is implemented by using umaxp or uminv to - // create a scalar value, which is then compared against zero. - // - // umaxp vn.16b, vm.16, vm.16 / uminv bn, vm.16b - // mov xm, vn.d[0] - // cmp xm, #0 - // cset xm, ne - - let s = VectorSize::from_ty(src_ty); - let size = if s == VectorSize::Size64x2 { - // `vall_true` with 64-bit elements is handled elsewhere. 
- debug_assert_ne!(op, Opcode::VallTrue); - - VectorSize::Size32x4 - } else { - s - }; - - if op == Opcode::VanyTrue { - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Umaxp, - rd: tmp, - rn: rm, - rm, - size, - }); - } else { - if size == VectorSize::Size32x2 { - return Err(CodegenError::Unsupported(format!( - "VallTrue: Unsupported type: {:?}", - src_ty - ))); - } - - ctx.emit(Inst::VecLanes { - op: VecLanesOp::Uminv, - rd: tmp, - rn: rm, - size, - }); - }; - - ctx.emit(Inst::MovFromVec { - rd, - rn: tmp.to_reg(), - idx: 0, - size: VectorSize::Size64x2, - }); - - ctx.emit(Inst::AluRRImm12 { - alu_op: ALUOp::SubS, - size: OperandSize::Size64, - rd: writable_zero_reg(), - rn: rd.to_reg(), - imm12: Imm12::zero(), - }); - - materialize_bool_result(ctx, insn, rd, Cond::Ne); - } + Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx), Opcode::VhighBits => { let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); @@ -904,7 +801,7 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: tmp_v0.to_reg(), idx: 0, - size: VectorSize::Size16x8, + size: ScalarSize::Size16, }); } I16X8 => { @@ -962,7 +859,7 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: tmp_v0.to_reg(), idx: 0, - size: VectorSize::Size16x8, + size: ScalarSize::Size16, }); } I32X4 => { @@ -1018,7 +915,7 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: tmp_v0.to_reg(), idx: 0, - size: VectorSize::Size32x4, + size: ScalarSize::Size32, }); } I64X2 => { @@ -1031,13 +928,13 @@ pub(crate) fn lower_insn_to_regs>( rd: dst_r, rn: src_v, idx: 0, - size: VectorSize::Size64x2, + size: ScalarSize::Size64, }); ctx.emit(Inst::MovFromVec { rd: tmp_r0, rn: src_v, idx: 1, - size: VectorSize::Size64x2, + size: ScalarSize::Size64, }); ctx.emit(Inst::AluRRImmShift { alu_op: ALUOp::Lsr, @@ -1166,31 +1063,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => implemented_in_isle(ctx), - Opcode::Fma => { - let ty = ty.unwrap(); - let bits = ty_bits(ty); - let fpu_op = match bits { - 32 => FPUOp3::MAdd32, - 64 => FPUOp3::MAdd64, - _ => { - return Err(CodegenError::Unsupported(format!( - "Fma: Unsupported type: {:?}", - ty - ))) - } - }; - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::FpuRRRR { - fpu_op, - rn, - rm, - ra, - rd, - }); - } + Opcode::Fma => implemented_in_isle(ctx), Opcode::Fcopysign => { // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence: diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 695b02fc3837..1914c85aa3aa 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -357,6 +357,15 @@ macro_rules! isle_prelude_methods { } } + #[inline] + fn ty_vec64(&mut self, ty: Type) -> Option { + if ty.is_vector() && ty.bits() == 64 { + Some(ty) + } else { + None + } + } + #[inline] fn ty_vec128(&mut self, ty: Type) -> Option { if ty.is_vector() && ty.bits() == 128 { @@ -588,6 +597,14 @@ macro_rules! 
isle_prelude_methods { } } + fn not_vec32x2(&mut self, ty: Type) -> Option { + if ty.lane_bits() == 32 && ty.lane_count() == 2 { + None + } else { + Some(ty) + } + } + fn not_i64x2(&mut self, ty: Type) -> Option<()> { if ty == I64X2 { None diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 661ecb9fed93..877fc4d9b792 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -264,8 +264,11 @@ (extern const $B32X4 Type) (extern const $B64X2 Type) +(extern const $I8X8 Type) (extern const $I8X16 Type) +(extern const $I16X4 Type) (extern const $I16X8 Type) +(extern const $I32X2 Type) (extern const $I32X4 Type) (extern const $I64X2 Type) @@ -359,6 +362,10 @@ (decl ty_scalar_float (Type) Type) (extern extractor ty_scalar_float ty_scalar_float) +;; A pure constructor that only matches 64-bit vector types. +(decl pure ty_vec64 (Type) Type) +(extern constructor ty_vec64 ty_vec64) + ;; An extractor that only matches 128-bit vector types. (decl ty_vec128 (Type) Type) (extern extractor ty_vec128 ty_vec128) @@ -373,7 +380,11 @@ (decl ty_vec128_int (Type) Type) (extern extractor ty_vec128_int ty_vec128_int) -;; An extractor that matches everything except i64x2 +;; A pure constructor that matches everything except vectors with size 32X2. +(decl pure not_vec32x2 (Type) Type) +(extern constructor not_vec32x2 not_vec32x2) + +;; An extractor that matches everything except I64X2 (decl not_i64x2 () Type) (extern extractor not_i64x2 not_i64x2) diff --git a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif index 8a4412a8511b..fc7df58b2fd9 100644 --- a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif +++ b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif @@ -910,3 +910,39 @@ block0(v0: f64x2): ; block0: ; frintn v0.2d, v0.2d ; ret + +function %f78(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; mov v17.16b, v0.16b +; mov v0.16b, v2.16b +; fmla v0.4s, v17.4s, v1.4s +; ret + +function %f79(f32x2, f32x2, f32x2) -> f32x2 { +block0(v0: f32x2, v1: f32x2, v2: f32x2): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; mov v17.16b, v0.16b +; mov v0.16b, v2.16b +; fmla v0.2s, v17.2s, v1.2s +; ret + +function %f80(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; mov v17.16b, v0.16b +; mov v0.16b, v2.16b +; fmla v0.2d, v17.2d, v1.2d +; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif b/cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif new file mode 100644 index 000000000000..c969b1e9be86 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-valltrue.clif @@ -0,0 +1,94 @@ +test compile precise-output +set unwind_info=false +target aarch64 + +function %fn0(b8x8) -> b1 { +block0(v0: b8x8): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv b3, v0.8b +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn1(b8x16) -> b1 { +block0(v0: b8x16): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv b3, v0.16b +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn2(b16x4) -> b1 { +block0(v0: b16x4): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv h3, v0.4h +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn3(b16x8) -> b1 { +block0(v0: b16x8): + v1 = vall_true v0 + return 
v1 +} + +; block0: +; uminv h3, v0.8h +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn4(b32x2) -> b1 { +block0(v0: b32x2): + v1 = vall_true v0 + return v1 +} + +; block0: +; mov x3, v0.d[0] +; subs xzr, xzr, x3, LSR 32 +; ccmp w3, #0, #nZcv, ne +; cset x0, ne +; ret + +function %fn5(b32x4) -> b1 { +block0(v0: b32x4): + v1 = vall_true v0 + return v1 +} + +; block0: +; uminv s3, v0.4s +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; cset x0, ne +; ret + +function %fn6(b64x2) -> b1 { +block0(v0: b64x2): + v1 = vall_true v0 + return v1 +} + +; block0: +; cmeq v3.2d, v0.2d, #0 +; addp v5.2d, v3.2d, v3.2d +; fcmp d5, d5 +; cset x0, eq +; ret diff --git a/cranelift/filetests/filetests/runtests/simd-fma-64bit.clif b/cranelift/filetests/filetests/runtests/simd-fma-64bit.clif new file mode 100644 index 000000000000..5f98b80d8a13 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fma-64bit.clif @@ -0,0 +1,47 @@ +test interpret +test run +target aarch64 +; x86_64 panics: `not implemented: unable to move type: f32x2` + +function %fma_f32x2(f32x2, f32x2, f32x2) -> f32x2 { +block0(v0: f32x2, v1: f32x2, v2: f32x2): + v3 = fma v0, v1, v2 + return v3 +} +; run: %fma_f32x2([0x9.0 0x83.0], [0x9.0 0x2.68091p6], [0x9.0 0x9.88721p1]) == [0x1.680000p6 0x1.3b88e6p14] +; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.0 -0x0.0]) == [0x0.0 0x0.0] +; run: %fma_f32x2([0x0.0 0x0.0], [-0x0.0 -0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0] +; run: %fma_f32x2([-0x0.0 -0x0.0], [0x0.0 0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0] + +; run: %fma_f32x2([-Inf Inf], [-Inf -Inf], [0x0.0 0x0.0]) == [+Inf -Inf] +; run: %fma_f32x2([-Inf Inf], [Inf -Inf], [0x0.0 -Inf]) == [-Inf -Inf] +; run: %fma_f32x2([-Inf -Inf], [Inf Inf], [-Inf -Inf]) == [-Inf -Inf] +; run: %fma_f32x2([-Inf -Inf], [Inf Inf], [-Inf -Inf]) == [-Inf -Inf] + +; F32 Epsilon / Max / Min Positive +; run: %fma_f32x2([0x1.000000p-23 0x0.0], [0x1.000000p-23 0x0.0], [0x1.000000p-23 0x1.000000p-23]) == [0x1.000002p-23 0x1.000000p-23] +; run: %fma_f32x2([0x1.fffffep127 0x0.0], [0x1.fffffep127 0x0.0], [0x1.fffffep127 0x1.fffffep127]) == [+Inf 0x1.fffffep127] +; run: %fma_f32x2([0x1.000000p-126 0x1.000000p-126], [0x1.000000p-126 0x1.000000p-126], [0x1.000000p-126 0x1.000000p-126]) == [0x1.000000p-126 0x1.000000p-126] +; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x1.000000p-126 0x1.000000p-126]) == [0x1.000000p-126 0x1.000000p-126] + +; F32 Subnormals +; run: %fma_f32x2([0x0.800000p-126 0x0.800000p-126], [0x0.800000p-126 0x0.800000p-126], [0x0.800000p-126 0x0.0]) == [0x0.800000p-126 0x0.0] +; run: %fma_f32x2([0x0.0 0x0.000002p-126], [0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.000002p-126]) == [0x0.800000p-126 0x0.000002p-126] +; run: %fma_f32x2([0x0.000002p-126 0x0.000002p-126], [0x0.000002p-126 0x0.000002p-126], [0x0.0 0x0.0]) == [0x0.0 0x0.0] +; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.000002p-126 0x0.000002p-126]) == [0x0.000002p-126 0x0.000002p-126] + +;; The IEEE754 Standard does not make a lot of guarantees about what +;; comes out of NaN producing operations, we just check if its a NaN +function %fma_is_nan_f32x2(f32x2, f32x2, f32x2) -> b1 { +block0(v0: f32x2, v1: f32x2, v2: f32x2): + v3 = fma v0, v1, v2 + v4 = fcmp ne v3, v3 + v5 = vall_true v4 + return v5 +} +; run: %fma_is_nan_f32x2([Inf -Inf], [-Inf Inf], [Inf Inf]) == true +; run: %fma_is_nan_f32x2([-Inf +NaN], [-Inf 0x0.0], [-Inf 0x0.0]) == true +; run: %fma_is_nan_f32x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == true +; run: %fma_is_nan_f32x2([-NaN 0x0.0], [0x0.0 -NaN], 
[0x0.0 0x0.0]) == true +; run: %fma_is_nan_f32x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == true +; run: %fma_is_nan_f32x2([NaN NaN], [NaN NaN], [NaN NaN]) == true diff --git a/cranelift/filetests/filetests/runtests/simd-fma.clif b/cranelift/filetests/filetests/runtests/simd-fma.clif index b5eb7de5b577..cfb1e6b119fc 100644 --- a/cranelift/filetests/filetests/runtests/simd-fma.clif +++ b/cranelift/filetests/filetests/runtests/simd-fma.clif @@ -1,5 +1,7 @@ +test interpret test run target x86_64 has_avx has_fma +target aarch64 function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4, v2: f32x4): diff --git a/cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif b/cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif new file mode 100644 index 000000000000..6085304a4f2d --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-valltrue-64bit.clif @@ -0,0 +1,58 @@ +test interpret +test run +target aarch64 +; s390x and x86_64 do not support 64-bit vectors. + +function %valltrue_b8x8_f() -> b1 { +block0: + v0 = bconst.b8 false + v1 = splat.b8x8 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b8x8_f() == false + +function %valltrue_b8x8_t() -> b1 { +block0: + v0 = bconst.b8 true + v1 = splat.b8x8 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b8x8_t() == true + +function %valltrue_b16x4_f() -> b1 { +block0: + v0 = bconst.b16 false + v1 = splat.b16x4 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b16x4_f() == false + +function %valltrue_b16x4_t() -> b1 { +block0: + v0 = bconst.b16 true + v1 = splat.b16x4 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b16x4_t() == true + +function %valltrue_b32x2_f() -> b1 { +block0: + v0 = bconst.b32 false + v1 = splat.b32x2 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b32x2_f() == false + +function %valltrue_b32x2_t() -> b1 { +block0: + v0 = bconst.b32 true + v1 = splat.b32x2 v0 + v2 = vall_true v1 + return v2 +} +; run: %valltrue_b32x2_t() == true diff --git a/cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif b/cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif new file mode 100644 index 000000000000..8ead6d2d3799 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-vanytrue-64bit.clif @@ -0,0 +1,58 @@ +test interpret +test run +target aarch64 +; s390x and x86_64 do not support 64-bit vectors. 
+ +function %vanytrue_b8x8_f() -> b1 { +block0: + v0 = bconst.b8 false + v1 = splat.b8x8 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b8x8_f() == false + +function %vanytrue_b8x8_t() -> b1 { +block0: + v0 = bconst.b8 true + v1 = splat.b8x8 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b8x8_t() == true + +function %vanytrue_b16x4_f() -> b1 { +block0: + v0 = bconst.b16 false + v1 = splat.b16x4 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b16x4_f() == false + +function %vanytrue_b16x4_t() -> b1 { +block0: + v0 = bconst.b16 true + v1 = splat.b16x4 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b16x4_t() == true + +function %vanytrue_b32x2_f() -> b1 { +block0: + v0 = bconst.b32 false + v1 = splat.b32x2 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b32x2_f() == false + +function %vanytrue_b32x2_t() -> b1 { +block0: + v0 = bconst.b32 true + v1 = splat.b32x2 v0 + v2 = vany_true v1 + return v2 +} +; run: %vanytrue_b32x2_t() == true diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 50dcf0c5d836..eaff61fd40b9 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -715,10 +715,25 @@ where }; assign(count) } - Opcode::Fcmp => assign(Value::bool( - fcmp(inst.fp_cond_code().unwrap(), &arg(0)?, &arg(1)?)?, - ctrl_ty.as_bool(), - )?), + + Opcode::Fcmp => { + let arg0 = extractlanes(&arg(0)?, ctrl_ty)?; + let arg1 = extractlanes(&arg(1)?, ctrl_ty)?; + + assign(vectorizelanes( + &(arg0 + .into_iter() + .zip(arg1.into_iter()) + .map(|(x, y)| { + V::bool( + fcmp(inst.fp_cond_code().unwrap(), &x, &y).unwrap(), + ctrl_ty.lane_type().as_bool(), + ) + }) + .collect::>>()?), + ctrl_ty, + )?) + } Opcode::Ffcmp => { let arg0 = arg(0)?; let arg1 = arg(1)?; @@ -750,7 +765,21 @@ where Opcode::Fmul => binary(Value::mul, arg(0)?, arg(1)?)?, Opcode::Fdiv => binary(Value::div, arg(0)?, arg(1)?)?, Opcode::Sqrt => assign(Value::sqrt(arg(0)?)?), - Opcode::Fma => assign(Value::fma(arg(0)?, arg(1)?, arg(2)?)?), + Opcode::Fma => { + let arg0 = extractlanes(&arg(0)?, ctrl_ty)?; + let arg1 = extractlanes(&arg(1)?, ctrl_ty)?; + let arg2 = extractlanes(&arg(2)?, ctrl_ty)?; + + assign(vectorizelanes( + &(arg0 + .into_iter() + .zip(arg1.into_iter()) + .zip(arg2.into_iter()) + .map(|((x, y), z)| Value::fma(x, y, z)) + .collect::>>()?), + ctrl_ty, + )?) + } Opcode::Fneg => assign(Value::neg(arg(0)?)?), Opcode::Fabs => assign(Value::abs(arg(0)?)?), Opcode::Fcopysign => binary(Value::copysign, arg(0)?, arg(1)?)?, @@ -1205,8 +1234,8 @@ where let iterations = match lane_type { types::I8 | types::B1 | types::B8 => 1, types::I16 | types::B16 => 2, - types::I32 | types::B32 => 4, - types::I64 | types::B64 => 8, + types::I32 | types::B32 | types::F32 => 4, + types::I64 | types::B64 | types::F64 => 8, _ => unimplemented!("vectors with lanes wider than 64-bits are currently unsupported."), }; @@ -1219,6 +1248,8 @@ where let lane_val: V = if lane_type.is_bool() { Value::bool(lane != 0, lane_type)? + } else if lane_type.is_float() { + Value::float(lane as u64, lane_type)? } else { Value::int(lane, lane_type)? 
}; @@ -1242,8 +1273,8 @@ where let iterations = match lane_type { types::I8 | types::B1 | types::B8 => 1, types::I16 | types::B16 => 2, - types::I32 | types::B32 => 4, - types::I64 | types::B64 => 8, + types::I32 | types::B32 | types::F32 => 4, + types::I64 | types::B64 | types::F64 => 8, _ => unimplemented!("vectors with lanes wider than 64-bits are currently unsupported."), }; let mut result: [u8; 16] = [0; 16]; From 18cf0b4f539416fa67ceae4bcd6e2e9f3a07b715 Mon Sep 17 00:00:00 2001 From: dheaton-arm Date: Fri, 5 Aug 2022 10:03:20 +0100 Subject: [PATCH 2/2] Add comments for `Fmla` and `Bsl` Copyright (c) 2022 Arm Limited --- cranelift/codegen/src/isa/aarch64/inst.isle | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index fdbc25e3a2c0..f853ab711290 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -1109,6 +1109,8 @@ ;; Bitwise exclusive or (Eor) ;; Bitwise select + ;; This opcode should only be used with the `vec_rrr_inplace` + ;; constructor. (Bsl) ;; Unsigned maximum pairwise (Umaxp) @@ -1145,6 +1147,8 @@ ;; Floating-point multiply (Fmul) ;; Floating-point fused multiply-add vectors + ;; This opcode should only be used with the `vec_rrr_inplace` + ;; constructor. (Fmla) ;; Add pairwise (Addp) @@ -1502,9 +1506,9 @@ ;; Helper for emitting `MInst.VecRRR` instructions which use three registers, ;; one of which is both source and output. (decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg) -(rule (vec_rrr_inplace op src_dst src2 src3 size) +(rule (vec_rrr_inplace op src1 src2 src3 size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_1 Unit (emit (MInst.FpuMove128 dst src_dst))) + (_1 Unit (emit (MInst.FpuMove128 dst src1))) (_2 Unit (emit (MInst.VecRRR op dst src2 src3 size)))) dst))