From 0df12d649a206efd8e06d44e5b1a68e424ccd789 Mon Sep 17 00:00:00 2001
From: Chris Fallin <chris@cfallin.org>
Date: Fri, 26 Aug 2022 16:51:28 -0700
Subject: [PATCH] AArch64: port load and store operations to ISLE.

This retains `lower_amode` in the handwritten code (@akirilov-arm
reports that there is an upcoming patch to port this), but tweaks it
slightly to take a `Value` rather than an `Inst`.
---
 cranelift/codegen/src/ir/dynamic_type.rs      |  17 ++
 cranelift/codegen/src/ir/mod.rs               |   2 +-
 cranelift/codegen/src/isa/aarch64/abi.rs      |   2 +-
 cranelift/codegen/src/isa/aarch64/inst.isle   |  91 ++++++++-
 .../codegen/src/isa/aarch64/inst/args.rs      |  15 --
 cranelift/codegen/src/isa/aarch64/lower.isle  | 172 +++++++++++++++++-
 cranelift/codegen/src/isa/aarch64/lower.rs    |  87 +++------
 .../codegen/src/isa/aarch64/lower/isle.rs     |  25 ++-
 .../codegen/src/isa/aarch64/lower_inst.rs     | 127 +------------
 cranelift/codegen/src/machinst/isle.rs        |  31 +++-
 cranelift/codegen/src/prelude.isle            |  15 +-
 .../filetests/isa/aarch64/amodes.clif         | 130 ++++++-------
 .../filetests/isa/aarch64/heap_addr.clif      |  10 +-
 .../isa/aarch64/simd-lane-access-compile.clif |   4 +-
 .../filetests/filetests/isa/aarch64/simd.clif |  10 +-
 15 files changed, 442 insertions(+), 296 deletions(-)

diff --git a/cranelift/codegen/src/ir/dynamic_type.rs b/cranelift/codegen/src/ir/dynamic_type.rs
index 91b13af98b34..f1ae30982114 100644
--- a/cranelift/codegen/src/ir/dynamic_type.rs
+++ b/cranelift/codegen/src/ir/dynamic_type.rs
@@ -1,6 +1,7 @@
 //! Dynamic IR types
 
 use crate::ir::entities::DynamicType;
+use crate::ir::types::*;
 use crate::ir::GlobalValue;
 use crate::ir::PrimaryMap;
 use crate::ir::Type;
@@ -36,3 +37,19 @@ impl DynamicTypeData {
 
 /// All allocated dynamic types.
 pub type DynamicTypes = PrimaryMap<DynamicType, DynamicTypeData>;
+
+/// Convert a dynamic-vector type to a fixed-vector type.
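+/// For example, `I32X4XN` maps to `I32X4`.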
+pub fn dynamic_to_fixed(ty: Type) -> Type { + match ty { + I8X8XN => I8X8, + I8X16XN => I8X16, + I16X4XN => I16X4, + I16X8XN => I16X8, + I32X2XN => I32X2, + I32X4XN => I32X4, + I64X2XN => I64X2, + F32X4XN => F32X4, + F64X2XN => F64X2, + _ => unreachable!("unhandled type: {}", ty), + } +} diff --git a/cranelift/codegen/src/ir/mod.rs b/cranelift/codegen/src/ir/mod.rs index 5dc5ad612da2..8ba18987daed 100644 --- a/cranelift/codegen/src/ir/mod.rs +++ b/cranelift/codegen/src/ir/mod.rs @@ -35,7 +35,7 @@ pub use crate::ir::builder::{ }; pub use crate::ir::constant::{ConstantData, ConstantPool}; pub use crate::ir::dfg::{DataFlowGraph, ValueDef}; -pub use crate::ir::dynamic_type::{DynamicTypeData, DynamicTypes}; +pub use crate::ir::dynamic_type::{dynamic_to_fixed, DynamicTypeData, DynamicTypes}; pub use crate::ir::entities::{ Block, Constant, DynamicStackSlot, DynamicType, FuncRef, GlobalValue, Heap, Immediate, Inst, JumpTable, SigRef, StackSlot, Table, UserExternalNameRef, Value, diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index fb11d31d60bd..5eb355338237 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -5,7 +5,7 @@ use crate::ir::types; use crate::ir::types::*; use crate::ir::MemFlags; use crate::ir::Opcode; -use crate::ir::{ExternalName, LibCall, Signature}; +use crate::ir::{dynamic_to_fixed, ExternalName, LibCall, Signature}; use crate::isa; use crate::isa::aarch64::{inst::EmitState, inst::*, settings as aarch64_settings}; use crate::isa::unwind::UnwindInst; diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index 856516ef2bd9..d580846863db 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -1502,10 +1502,13 @@ (extern constructor cond_br_cond cond_br_cond) ;; Lower the address of a load or a store. -(decl amode (Type Inst u32) AMode) +(decl amode (Type Value u32) AMode) ;; TODO: Port lower_address() to ISLE. (extern constructor amode amode) +(decl pair_amode (Value u32) PairAMode) +(extern constructor pair_amode pair_amode) + ;; Matches an `AMode` that is just a register. (decl pure amode_is_reg (AMode) Reg) ;; TODO: Implement in ISLE. @@ -2337,6 +2340,92 @@ (rule (udf trap_code) (SideEffectNoResult.Inst (MInst.Udf trap_code))) +;; Helpers for generating various load instructions, with varying +;; widths and sign/zero-extending properties. 
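+;; Each helper allocates a temporary destination register of the
+;; appropriate class, emits the corresponding `MInst`, and returns the
+;; register holding the loaded value.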
+(decl aarch64_uload8 (AMode MemFlags) Reg) +(rule (aarch64_uload8 amode flags) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.ULoad8 dst amode flags)))) + dst)) +(decl aarch64_sload8 (AMode MemFlags) Reg) +(rule (aarch64_sload8 amode flags) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.SLoad8 dst amode flags)))) + dst)) +(decl aarch64_uload16 (AMode MemFlags) Reg) +(rule (aarch64_uload16 amode flags) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.ULoad16 dst amode flags)))) + dst)) +(decl aarch64_sload16 (AMode MemFlags) Reg) +(rule (aarch64_sload16 amode flags) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.SLoad16 dst amode flags)))) + dst)) +(decl aarch64_uload32 (AMode MemFlags) Reg) +(rule (aarch64_uload32 amode flags) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.ULoad32 dst amode flags)))) + dst)) +(decl aarch64_sload32 (AMode MemFlags) Reg) +(rule (aarch64_sload32 amode flags) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.SLoad32 dst amode flags)))) + dst)) +(decl aarch64_uload64 (AMode MemFlags) Reg) +(rule (aarch64_uload64 amode flags) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.ULoad64 dst amode flags)))) + dst)) +(decl aarch64_fpuload32 (AMode MemFlags) Reg) +(rule (aarch64_fpuload32 amode flags) + (let ((dst WritableReg (temp_writable_reg $F64)) + (_ Unit (emit (MInst.FpuLoad32 dst amode flags)))) + dst)) +(decl aarch64_fpuload64 (AMode MemFlags) Reg) +(rule (aarch64_fpuload64 amode flags) + (let ((dst WritableReg (temp_writable_reg $F64)) + (_ Unit (emit (MInst.FpuLoad64 dst amode flags)))) + dst)) +(decl aarch64_fpuload128 (AMode MemFlags) Reg) +(rule (aarch64_fpuload128 amode flags) + (let ((dst WritableReg (temp_writable_reg $F64X2)) + (_ Unit (emit (MInst.FpuLoad128 dst amode flags)))) + dst)) +(decl aarch64_loadp64 (PairAMode MemFlags) ValueRegs) +(rule (aarch64_loadp64 amode flags) + (let ((dst1 WritableReg (temp_writable_reg $I64)) + (dst2 WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.LoadP64 dst1 dst2 amode flags)))) + (value_regs dst1 dst2))) + +;; Helpers for generating various store instructions with varying +;; widths. 
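+;; These construct a `SideEffectNoResult` rather than returning a
+;; register, since a store produces no value; the lowering rules wrap
+;; them with `side_effect`.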
+(decl aarch64_store8 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_store8 amode flags val)
+      (SideEffectNoResult.Inst (MInst.Store8 val amode flags)))
+(decl aarch64_store16 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_store16 amode flags val)
+      (SideEffectNoResult.Inst (MInst.Store16 val amode flags)))
+(decl aarch64_store32 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_store32 amode flags val)
+      (SideEffectNoResult.Inst (MInst.Store32 val amode flags)))
+(decl aarch64_store64 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_store64 amode flags val)
+      (SideEffectNoResult.Inst (MInst.Store64 val amode flags)))
+(decl aarch64_fpustore32 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_fpustore32 amode flags val)
+      (SideEffectNoResult.Inst (MInst.FpuStore32 val amode flags)))
+(decl aarch64_fpustore64 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_fpustore64 amode flags val)
+      (SideEffectNoResult.Inst (MInst.FpuStore64 val amode flags)))
+(decl aarch64_fpustore128 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_fpustore128 amode flags val)
+      (SideEffectNoResult.Inst (MInst.FpuStore128 val amode flags)))
+(decl aarch64_storep64 (PairAMode MemFlags Reg Reg) SideEffectNoResult)
+(rule (aarch64_storep64 amode flags val1 val2)
+      (SideEffectNoResult.Inst (MInst.StoreP64 val1 val2 amode flags)))
+
 ;; Immediate value helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Type of extension performed by an immediate helper
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index ce2d70c0925e..4428be2a8370 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -773,18 +773,3 @@ impl VectorSize {
         }
     }
 }
-
-pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {
-    match ty {
-        I8X8XN => I8X8,
-        I8X16XN => I8X16,
-        I16X4XN => I16X4,
-        I16X8XN => I16X8,
-        I32X2XN => I32X2,
-        I32X4XN => I32X4,
-        I64X2XN => I64X2,
-        F32X4XN => F32X4,
-        F64X2XN => F64X2,
-        _ => unreachable!("unhandled type: {}", ty),
-    }
-}
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index d86de45a68ad..2571b877719c 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -1777,10 +1777,10 @@
 (rule (lower (has_type ty (splat (ireduce (iconst (u64_from_imm64 n))))))
       (splat_const n (vector_size ty)))
 
-(rule (lower (has_type ty (splat x @ (load flags _addr offset))))
+(rule (lower (has_type ty (splat x @ (load flags addr offset))))
       (if-let mem_op (is_sinkable_inst x))
       (let ((_ Unit (sink_inst mem_op))
-            (addr AMode (amode (lane_type ty) mem_op offset))
+            (addr AMode (amode (lane_type ty) addr offset))
            (address Reg (load_addr addr)))
        (ld1r address (vector_size ty) flags)))
@@ -2031,6 +2031,174 @@
 (rule (lower (return args))
       (lower_return (range 0 (value_slice_len args)) args))
 
+;;;; Rules for loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower
+       (has_type $I8 (load flags address offset)))
+      (aarch64_uload8 (amode $I8 address offset) flags))
+(rule (lower
+       (has_type $I16 (load flags address offset)))
+      (aarch64_uload16 (amode $I16 address offset) flags))
+(rule (lower
+       (has_type $I32 (load flags address offset)))
+      (aarch64_uload32 (amode $I32 address offset) flags))
+(rule (lower
+       (has_type $I64 (load flags address offset)))
+      (aarch64_uload64 (amode $I64 address offset) flags))
+(rule (lower
+       (has_type $R64 (load flags address offset)))
+      (aarch64_uload64 (amode $I64 address offset) flags))
+(rule (lower
+       (has_type $F32 (load flags address offset)))
+      (aarch64_fpuload32 (amode $F32 address offset) flags))
+(rule (lower
+       (has_type $F64 (load flags address offset)))
+      (aarch64_fpuload64 (amode $F64 address offset) flags))
+(rule (lower
+       (has_type $I128 (load flags address offset)))
+      (aarch64_loadp64 (pair_amode address offset) flags))
+(rule (lower
+       (has_type (ty_vec64 _)
+                 (load flags address offset)))
+      (aarch64_fpuload64 (amode $F64 address offset) flags))
+(rule (lower
+       (has_type (ty_vec128 _)
+                 (load flags address offset)))
+      (aarch64_fpuload128 (amode $I8X16 address offset) flags))
+(rule (lower
+       (has_type (ty_dyn_vec64 _)
+                 (load flags address offset)))
+      (aarch64_fpuload64 (amode $F64 address offset) flags))
+(rule (lower
+       (has_type (ty_dyn_vec128 _)
+                 (load flags address offset)))
+      (aarch64_fpuload128 (amode $I8X16 address offset) flags))
+
+(rule (lower
+       (uload8 flags address offset))
+      (aarch64_uload8 (amode $I8 address offset) flags))
+(rule (lower
+       (sload8 flags address offset))
+      (aarch64_sload8 (amode $I8 address offset) flags))
+(rule (lower
+       (uload16 flags address offset))
+      (aarch64_uload16 (amode $I16 address offset) flags))
+(rule (lower
+       (sload16 flags address offset))
+      (aarch64_sload16 (amode $I16 address offset) flags))
+(rule (lower
+       (uload32 flags address offset))
+      (aarch64_uload32 (amode $I32 address offset) flags))
+(rule (lower
+       (sload32 flags address offset))
+      (aarch64_sload32 (amode $I32 address offset) flags))
+
+(rule (lower
+       (sload8x8 flags address offset))
+      (vec_extend (VecExtendOp.Sxtl)
+                  (aarch64_fpuload64 (amode $F64 address offset) flags)
+                  $false
+                  (ScalarSize.Size16)))
+(rule (lower
+       (uload8x8 flags address offset))
+      (vec_extend (VecExtendOp.Uxtl)
+                  (aarch64_fpuload64 (amode $F64 address offset) flags)
+                  $false
+                  (ScalarSize.Size16)))
+(rule (lower
+       (sload16x4 flags address offset))
+      (vec_extend (VecExtendOp.Sxtl)
+                  (aarch64_fpuload64 (amode $F64 address offset) flags)
+                  $false
+                  (ScalarSize.Size32)))
+(rule (lower
+       (uload16x4 flags address offset))
+      (vec_extend (VecExtendOp.Uxtl)
+                  (aarch64_fpuload64 (amode $F64 address offset) flags)
+                  $false
+                  (ScalarSize.Size32)))
+(rule (lower
+       (sload32x2 flags address offset))
+      (vec_extend (VecExtendOp.Sxtl)
+                  (aarch64_fpuload64 (amode $F64 address offset) flags)
+                  $false
+                  (ScalarSize.Size64)))
+(rule (lower
+       (uload32x2 flags address offset))
+      (vec_extend (VecExtendOp.Uxtl)
+                  (aarch64_fpuload64 (amode $F64 address offset) flags)
+                  $false
+                  (ScalarSize.Size64)))
+
+;;;; Rules for stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower
+       (store flags value @ (value_type $I8) address offset))
+      (side_effect
+       (aarch64_store8 (amode $I8 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type $I16) address offset))
+      (side_effect
+       (aarch64_store16 (amode $I16 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type $I32) address offset))
+      (side_effect
+       (aarch64_store32 (amode $I32 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type $I64) address offset))
+      (side_effect
+       (aarch64_store64 (amode $I64 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type $R64) address offset))
+      (side_effect
+       (aarch64_store64 (amode $I64 address offset) flags value)))
+
+(rule (lower
+       (istore8 flags value address offset))
+      (side_effect
+       (aarch64_store8 (amode $I8 address offset) flags value)))
+(rule (lower
+       (istore16 flags value address offset))
+      (side_effect
+       (aarch64_store16 (amode $I16 address offset) flags value)))
+(rule (lower
+       (istore32 flags value address offset))
+      (side_effect
+       (aarch64_store32 (amode $I32 address offset) flags value)))
+
+(rule (lower
+       (store flags value @ (value_type $F32) address offset))
+      (side_effect
+       (aarch64_fpustore32 (amode $F32 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type $F64) address offset))
+      (side_effect
+       (aarch64_fpustore64 (amode $F64 address offset) flags value)))
+
+(rule (lower
+       (store flags value @ (value_type $I128) address offset))
+      (side_effect
+       (aarch64_storep64 (pair_amode address offset) flags
+                         (value_regs_get value 0)
+                         (value_regs_get value 1))))
+
+(rule (lower
+       (store flags value @ (value_type (ty_vec64 _)) address offset))
+      (side_effect
+       (aarch64_fpustore64 (amode $F64 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type (ty_vec128 _)) address offset))
+      (side_effect
+       (aarch64_fpustore128 (amode $I8X16 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type (ty_dyn_vec64 _)) address offset))
+      (side_effect
+       (aarch64_fpustore64 (amode $F64 address offset) flags value)))
+(rule (lower
+       (store flags value @ (value_type (ty_dyn_vec128 _)) address offset))
+      (side_effect
+       (aarch64_fpustore128 (amode $I8X16 address offset) flags value)))
+
 ;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (get_pinned_reg))
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 3ec6bf3bbe99..7921290a1d34 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -18,7 +18,7 @@ use crate::machinst::lower::*;
 use crate::machinst::{Reg, Writable};
 use crate::{machinst::*, trace};
 use crate::{CodegenError, CodegenResult};
-use smallvec::SmallVec;
+use smallvec::{smallvec, SmallVec};
 use std::cmp;
 
 pub mod isle;
@@ -507,19 +507,19 @@ type AddressAddend64List = SmallVec<[Reg; 4]>;
 /// then possibly support extensions at these leaves.
 fn collect_address_addends(
     ctx: &mut Lower<Inst>,
-    roots: &[InsnInput],
+    root: Value,
 ) -> (AddressAddend64List, AddressAddend32List, i64) {
     let mut result32: AddressAddend32List = SmallVec::new();
     let mut result64: AddressAddend64List = SmallVec::new();
     let mut offset: i64 = 0;
 
-    let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect();
+    let mut workqueue: SmallVec<[Value; 4]> = smallvec![root];
 
-    while let Some(input) = workqueue.pop() {
-        debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64);
-        if let Some((op, insn)) = maybe_input_insn_multi(
+    while let Some(value) = workqueue.pop() {
+        debug_assert_eq!(ty_bits(ctx.value_ty(value)), 64);
+        if let Some((op, insn)) = maybe_value_multi(
             ctx,
-            input,
+            value,
             &[
                 Opcode::Uextend,
                 Opcode::Sextend,
@@ -551,12 +551,12 @@ fn collect_address_addends(
                 }
             }
             Opcode::Uextend | Opcode::Sextend => {
-                let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
+                let reg = put_value_in_reg(ctx, value, NarrowValueMode::None);
                 result64.push(reg);
             }
             Opcode::Iadd => {
                 for input in 0..ctx.num_inputs(insn) {
-                    let addend = InsnInput { insn, input };
+                    let addend = ctx.input_as_value(insn, input);
                     workqueue.push(addend);
                 }
             }
@@ -567,7 +567,7 @@ fn collect_address_addends(
                 _ => panic!("Unexpected opcode from maybe_input_insn_multi"),
             }
         } else {
-            let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64);
+            let reg = put_value_in_reg(ctx, value, NarrowValueMode::ZeroExtend64);
             result64.push(reg);
         }
     }
@@ -576,15 +576,11 @@ fn collect_address_addends(
 }
 
 /// Lower the address of a pair load or store.
-pub(crate) fn lower_pair_address(
-    ctx: &mut Lower<Inst>,
-    roots: &[InsnInput],
-    offset: i32,
-) -> PairAMode {
+pub(crate) fn lower_pair_address(ctx: &mut Lower<Inst>, addr: Value, offset: i32) -> PairAMode {
     // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
     // extends and addition ops. We update these as we consume address
     // components, so they represent the remaining addends not yet handled.
-    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
+    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, addr);
     let offset = args_offset + (offset as i64);
 
     trace!(
@@ -636,7 +632,7 @@ pub(crate) fn lower_pair_address(
 pub(crate) fn lower_address(
     ctx: &mut Lower<Inst>,
     elem_ty: Type,
-    roots: &[InsnInput],
+    addr: Value,
     offset: i32,
 ) -> AMode {
     // TODO: support base_reg + scale * index_reg. For this, we would need to
@@ -645,7 +641,7 @@ pub(crate) fn lower_address(
     // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
     // extends and addition ops. We update these as we consume address
     // components, so they represent the remaining addends not yet handled.
-    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
+    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, addr);
     let mut offset = args_offset + (offset as i64);
 
     trace!(
@@ -1088,14 +1084,26 @@ pub(crate) fn maybe_input_insn(
     None
 }
 
-/// Checks for an instance of any one of `ops` feeding the given input.
-pub(crate) fn maybe_input_insn_multi(
+/// Checks for an instance of `op` defining the given value.
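+/// Returns `None` if the value is not defined by an instruction.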
+pub(crate) fn maybe_value(c: &mut Lower<Inst>, value: Value, op: Opcode) -> Option<IRInst> {
+    let inputs = c.get_value_as_source_or_const(value);
+    if let Some((src_inst, _)) = inputs.inst.as_inst() {
+        let data = c.data(src_inst);
+        if data.opcode() == op {
+            return Some(src_inst);
+        }
+    }
+    None
+}
+
+/// Checks for an instance of any one of `ops` defining the given value.
+pub(crate) fn maybe_value_multi(
     c: &mut Lower<Inst>,
-    input: InsnInput,
+    value: Value,
     ops: &[Opcode],
 ) -> Option<(Opcode, IRInst)> {
     for &op in ops {
-        if let Some(inst) = maybe_input_insn(c, input, op) {
+        if let Some(inst) = maybe_value(c, value, op) {
             return Some((op, inst));
         }
     }
@@ -1452,41 +1460,6 @@ pub(crate) fn materialize_bool_result(
     }
 }
 
-fn load_op_to_ty(op: Opcode) -> Option<Type> {
-    match op {
-        Opcode::Sload8 | Opcode::Uload8 => Some(I8),
-        Opcode::Sload16 | Opcode::Uload16 => Some(I16),
-        Opcode::Sload32 | Opcode::Uload32 => Some(I32),
-        Opcode::Load => None,
-        Opcode::Sload8x8 | Opcode::Uload8x8 => Some(I8X8),
-        Opcode::Sload16x4 | Opcode::Uload16x4 => Some(I16X4),
-        Opcode::Sload32x2 | Opcode::Uload32x2 => Some(I32X2),
-        _ => None,
-    }
-}
-
-/// Helper to lower a load instruction; this is used in several places, because
-/// a load can sometimes be merged into another operation.
-pub(crate) fn lower_load<
-    F: FnMut(&mut Lower<Inst>, ValueRegs<Writable<Reg>>, Type, AMode) -> CodegenResult<()>,
->(
-    ctx: &mut Lower<Inst>,
-    ir_inst: IRInst,
-    inputs: &[InsnInput],
-    output: InsnOutput,
-    mut f: F,
-) -> CodegenResult<()> {
-    let op = ctx.data(ir_inst).opcode();
-
-    let elem_ty = load_op_to_ty(op).unwrap_or_else(|| ctx.output_ty(ir_inst, 0));
-
-    let off = ctx.data(ir_inst).load_store_offset().unwrap();
-    let mem = lower_address(ctx, elem_ty, &inputs[..], off);
-    let rd = get_output_reg(ctx, output);
-
-    f(ctx, rd, elem_ty, mem)
-}
-
 //=============================================================================
 // Lowering-backend trait implementation.
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
index 8c54d9a3d9da..db1a59e5530f 100644
--- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -6,14 +6,14 @@ use generated_code::Context;
 
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
-    insn_inputs, lower_constant_f128, lower_constant_f32, lower_constant_f64, writable_zero_reg,
-    zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond,
-    CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC,
-    JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize,
-    PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
+    lower_constant_f128, lower_constant_f32, lower_constant_f64, writable_zero_reg, zero_reg,
+    AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond, CondBrKind,
+    ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo,
+    MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize, PairAMode, Reg,
+    ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
 };
 use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm};
-use crate::isa::aarch64::lower::{lower_address, lower_splat_const};
+use crate::isa::aarch64::lower::{lower_address, lower_pair_address, lower_splat_const};
 use crate::isa::aarch64::settings::Flags as IsaFlags;
 use crate::machinst::valueregs;
 use crate::machinst::{isle::*, InputSourceInst};
@@ -484,13 +484,12 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         }
     }
 
-    fn amode(&mut self, ty: Type, mem_op: Inst, offset: u32) -> AMode {
-        lower_address(
-            self.lower_ctx,
-            ty,
-            &insn_inputs(self.lower_ctx, mem_op)[..],
-            offset as i32,
-        )
+    fn amode(&mut self, ty: Type, addr: Value, offset: u32) -> AMode {
+        lower_address(self.lower_ctx, ty, addr, offset as i32)
+    }
+
+    fn pair_amode(&mut self, addr: Value, offset: u32) -> PairAMode {
+        lower_pair_address(self.lower_ctx, addr, offset as i32)
     }
 
     fn amode_is_reg(&mut self, address: &AMode) -> Option<Reg> {
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index c72ddea6ef3f..1202e8351087 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -101,133 +101,10 @@ pub(crate) fn lower_insn_to_regs(
         | Opcode::Sload16x4
         | Opcode::Uload16x4
         | Opcode::Sload32x2
-        | Opcode::Uload32x2 => {
-            let sign_extend = match op {
-                Opcode::Sload8 | Opcode::Sload16 | Opcode::Sload32 => true,
-                _ => false,
-            };
-            let flags = ctx
-                .memflags(insn)
-                .expect("Load instruction should have memflags");
-
-            let out_ty = ctx.output_ty(insn, 0);
-            if out_ty == I128 {
-                let off = ctx.data(insn).load_store_offset().unwrap();
-                let mem = lower_pair_address(ctx, &inputs[..], off);
-                let dst = get_output_reg(ctx, outputs[0]);
-                ctx.emit(Inst::LoadP64 {
-                    rt: dst.regs()[0],
-                    rt2: dst.regs()[1],
-                    mem,
-                    flags,
-                });
-            } else {
-                lower_load(
-                    ctx,
-                    insn,
-                    &inputs[..],
-                    outputs[0],
-                    |ctx, dst, mut elem_ty, mem| {
-                        if elem_ty.is_dynamic_vector() {
-                            elem_ty = dynamic_to_fixed(elem_ty);
-                        }
-                        let rd = dst.only_reg().unwrap();
-                        let is_float = ty_has_float_or_vec_representation(elem_ty);
-                        ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
-                            (1, _, _) => Inst::ULoad8 { rd, mem, flags },
-                            (8, false, _) => Inst::ULoad8 { rd, mem, flags },
-                            (8, true, _) => Inst::SLoad8 { rd, mem, flags },
-                            (16, false, _) => Inst::ULoad16 { rd, mem, flags },
-                            (16, true, _) => Inst::SLoad16 { rd, mem, flags },
-                            (32, false, false) => Inst::ULoad32 { rd, mem, flags },
-                            (32, true, false) => Inst::SLoad32 { rd, mem, flags },
-                            (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
-                            (64, _, false) => Inst::ULoad64 { rd, mem, flags },
-                            // Note that we treat some of the vector loads as scalar floating-point loads,
-                            // which is correct in a little endian environment.
-                            (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
-                            (128, _, true) => Inst::FpuLoad128 { rd, mem, flags },
-                            _ => {
-                                return Err(CodegenError::Unsupported(format!(
-                                    "Unsupported type in load: {:?}",
-                                    elem_ty
-                                )))
-                            }
-                        });
-
-                        let vec_extend = match op {
-                            Opcode::Sload8x8 => Some((VecExtendOp::Sxtl, ScalarSize::Size16)),
-                            Opcode::Uload8x8 => Some((VecExtendOp::Uxtl, ScalarSize::Size16)),
-                            Opcode::Sload16x4 => Some((VecExtendOp::Sxtl, ScalarSize::Size32)),
-                            Opcode::Uload16x4 => Some((VecExtendOp::Uxtl, ScalarSize::Size32)),
-                            Opcode::Sload32x2 => Some((VecExtendOp::Sxtl, ScalarSize::Size64)),
-                            Opcode::Uload32x2 => Some((VecExtendOp::Uxtl, ScalarSize::Size64)),
-                            _ => None,
-                        };
-
-                        if let Some((t, lane_size)) = vec_extend {
-                            let rd = dst.only_reg().unwrap();
-                            ctx.emit(Inst::VecExtend {
-                                t,
-                                rd,
-                                rn: rd.to_reg(),
-                                high_half: false,
-                                lane_size,
-                            });
-                        }
-
-                        Ok(())
-                    },
-                )?;
-            }
-        }
+        | Opcode::Uload32x2 => implemented_in_isle(ctx),
 
         Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
-            let off = ctx.data(insn).load_store_offset().unwrap();
-            let mut elem_ty = match op {
-                Opcode::Istore8 => I8,
-                Opcode::Istore16 => I16,
-                Opcode::Istore32 => I32,
-                Opcode::Store => ctx.input_ty(insn, 0),
-                _ => unreachable!(),
-            };
-            let is_float = ty_has_float_or_vec_representation(elem_ty);
-            let flags = ctx
-                .memflags(insn)
-                .expect("Store instruction should have memflags");
-
-            let dst = put_input_in_regs(ctx, inputs[0]);
-
-            if elem_ty == I128 {
-                let mem = lower_pair_address(ctx, &inputs[1..], off);
-                ctx.emit(Inst::StoreP64 {
-                    rt: dst.regs()[0],
-                    rt2: dst.regs()[1],
-                    mem,
-                    flags,
-                });
-            } else {
-                if elem_ty.is_dynamic_vector() {
-                    elem_ty = dynamic_to_fixed(elem_ty);
-                }
-                let rd = dst.only_reg().unwrap();
-                let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
-                ctx.emit(match (ty_bits(elem_ty), is_float) {
-                    (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
-                    (16, _) => Inst::Store16 { rd, mem, flags },
-                    (32, false) => Inst::Store32 { rd, mem, flags },
-                    (32, true) => Inst::FpuStore32 { rd, mem, flags },
-                    (64, false) => Inst::Store64 { rd, mem, flags },
-                    (64, true) => Inst::FpuStore64 { rd, mem, flags },
-                    (128, _) => Inst::FpuStore128 { rd, mem, flags },
-                    _ => {
-                        return Err(CodegenError::Unsupported(format!(
-                            "Unsupported type in store: {:?}",
-                            elem_ty
-                        )))
-                    }
-                });
-            }
+            implemented_in_isle(ctx)
         }
 
         Opcode::StackAddr => implemented_in_isle(ctx),
diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs
index d62b2f831e71..9532526f8b75 100644
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -9,8 +9,8 @@ use target_lexicon::Triple;
 pub use super::MachLabel;
 pub use crate::data_value::DataValue;
 pub use crate::ir::{
-    ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, Immediate,
-    SigRef, StackSlot,
+    dynamic_to_fixed, ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef,
+    GlobalValue, Immediate, SigRef, StackSlot,
 };
 pub use crate::isa::unwind::UnwindInst;
 pub use crate::machinst::{
@@ -397,6 +397,15 @@ macro_rules! isle_prelude_methods {
             }
         }
 
+        #[inline]
+        fn ty_vec64_ctor(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_vector() && ty.bits() == 64 {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
         #[inline]
         fn ty_vec64(&mut self, ty: Type) -> Option<Type> {
             if ty.is_vector() && ty.bits() == 64 {
@@ -415,6 +424,24 @@ macro_rules! isle_prelude_methods {
             }
         }
 
+        #[inline]
+        fn ty_dyn_vec64(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_dynamic_vector() && dynamic_to_fixed(ty).bits() == 64 {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
+        #[inline]
+        fn ty_dyn_vec128(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_dynamic_vector() && dynamic_to_fixed(ty).bits() == 128 {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
         #[inline]
         fn ty_vec64_int(&mut self, ty: Type) -> Option<Type> {
             if ty.is_vector() && ty.bits() == 64 && ty.lane_type().is_int() {
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index 7119a0cb87be..98df224bcf23 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -381,14 +381,25 @@
 (decl ty_float_or_vec (Type) Type)
 (extern extractor ty_float_or_vec ty_float_or_vec)
 
-;; A pure constructor that only matches 64-bit vector types.
+;; A pure constructor/extractor that only matches 64-bit vector types.
 (decl pure ty_vec64 (Type) Type)
-(extern constructor ty_vec64 ty_vec64)
+(extern constructor ty_vec64 ty_vec64_ctor)
+(extern extractor ty_vec64 ty_vec64)
 
 ;; An extractor that only matches 128-bit vector types.
 (decl ty_vec128 (Type) Type)
 (extern extractor ty_vec128 ty_vec128)
 
+;; An extractor that only matches dynamic vector types with a 64-bit
+;; base type.
+(decl ty_dyn_vec64 (Type) Type)
+(extern extractor ty_dyn_vec64 ty_dyn_vec64)
+
+;; An extractor that only matches dynamic vector types with a 128-bit
+;; base type.
+(decl ty_dyn_vec128 (Type) Type)
+(extern extractor ty_dyn_vec128 ty_dyn_vec128)
+
 ;; An extractor that only matches 64-bit vector types with integer
 ;; lanes (I8X8, I16X4, I32X2)
 (decl ty_vec64_int (Type) Type)
diff --git a/cranelift/filetests/filetests/isa/aarch64/amodes.clif b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
index c3254cc9469a..83b7b96bc206 100644
--- a/cranelift/filetests/filetests/isa/aarch64/amodes.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
@@ -36,8 +36,8 @@ block0(v0: i32, v1: i32):
 }
 
 ; block0:
-; mov w6, w0
-; ldr w0, [x6, w1, UXTW]
+; mov w5, w0
+; ldr w0, [x5, w1, UXTW]
 ; ret
 
 function %f8(i64, i32) -> i32 {
@@ -52,10 +52,10 @@ block0(v0: i64, v1: i32):
 }
 
 ; block0:
-; add x6, x0, #68
-; add x6, x6, x0
-; add x6, x6, x1, SXTW
-; ldr w0, [x6, w1, SXTW]
+; add x5, x0, #68
+; add x5, x5, x0
+; add x5, x5, x1, SXTW
+; ldr w0, [x5, w1, SXTW]
 ; ret
 
 function %f9(i64, i64, i64) -> i32 {
@@ -85,10 +85,10 @@ block0(v0: i64, v1: i64, v2: i64):
 }
 
 ; block0:
-; movz x8, #4100
-; add x8, x8, x1
-; add x8, x8, x2
-; ldr w0, [x8, x0]
+; movz x7, #4100
+; add x7, x7, x1
+; add x7, x7, x2
+; ldr w0, [x7, x0]
 ; ret
 
 function %f10() -> i32 {
@@ -99,8 +99,8 @@ block0:
 }
 
 ; block0:
-; movz x2, #1234
-; ldr w0, [x2]
+; movz x1, #1234
+; ldr w0, [x1]
 ; ret
 
 function %f11(i64) -> i32 {
@@ -112,8 +112,8 @@ block0(v0: i64):
 }
 
 ; block0:
-; add x4, x0, #8388608
-; ldr w0, [x4]
+; add x3, x0, #8388608
+; ldr w0, [x3]
 ; ret
 
 function %f12(i64) -> i32 {
@@ -125,8 +125,8 @@ block0(v0: i64):
 }
 
 ; block0:
-; sub x4, x0, #4
-; ldr w0, [x4]
+; sub x3, x0, #4
+; ldr w0, [x3]
 ; ret
 
 function %f13(i64) -> i32 {
@@ -138,10 +138,10 @@ block0(v0: i64):
 }
 
 ; block0:
-; movz w4, #51712
-; movk w4, #15258, LSL #16
-; add x4, x4, x0
-; ldr w0, [x4]
+; movz w3, #51712
+; movk w3, #15258, LSL #16
+; add x3, x3, x0
+; ldr w0, [x3]
 ; ret
 
 function %f14(i32) -> i32 {
@@ -152,8 +152,8 @@ block0(v0: i32):
 }
 
 ; block0:
-; sxtw x4, w0
-; ldr w0, [x4]
+; sxtw x3, w0
+; ldr w0, [x3]
 ; ret
 
 function %f15(i32, i32) -> i32 {
@@ -166,8 +166,8 @@ block0(v0: i32, v1: i32):
 }
 
 ; block0:
-; sxtw x6, w0
-; ldr w0, [x6, w1, SXTW]
+; sxtw x5, w0
+; ldr w0, [x5, w1, SXTW]
 ; ret
 
 function %f18(i64, i64, i64) -> i32 {
@@ -179,8 +179,8 @@ block0(v0: i64, v1: i64, v2: i64):
 }
 
 ; block0:
-; movn w8, #4097
-; ldrsh x0, [x8]
+; movn w7, #4097
+; ldrsh x0, [x7]
 ; ret
 
 function %f19(i64, i64, i64) -> i32 {
@@ -192,8 +192,8 @@ block0(v0: i64, v1: i64, v2: i64):
 }
 
 ; block0:
-; movz x8, #4098
-; ldrsh x0, [x8]
+; movz x7, #4098
+; ldrsh x0, [x7]
 ; ret
 
 function %f20(i64, i64, i64) -> i32 {
@@ -205,9 +205,9 @@ block0(v0: i64, v1: i64, v2: i64):
 }
 
 ; block0:
-; movn w8, #4097
-; sxtw x10, w8
-; ldrsh x0, [x10]
+; movn w7, #4097
+; sxtw x9, w7
+; ldrsh x0, [x9]
 ; ret
 
 function %f21(i64, i64, i64) -> i32 {
@@ -219,9 +219,9 @@ block0(v0: i64, v1: i64, v2: i64):
 }
 
 ; block0:
-; movz x8, #4098
-; sxtw x10, w8
-; ldrsh x0, [x10]
+; movz x7, #4098
+; sxtw x9, w7
+; ldrsh x0, [x9]
 ; ret
 
 function %i128(i64) -> i128 {
@@ -232,11 +232,11 @@ block0(v0: i64):
 }
 
 ; block0:
-; mov x8, x0
-; ldp x3, x1, [x8]
-; mov x11, x3
+; mov x6, x0
+; ldp x7, x1, [x6]
+; mov x11, x7
 ; stp x11, x1, [x0]
-; mov x0, x3
+; mov x0, x7
 ; ret
 
 function %i128_imm_offset(i64) -> i128 {
@@ -247,11 +247,11 @@ block0(v0: i64):
 }
 
 ; block0:
-; mov x8, x0
-; ldp x3, x1, [x8, #16]
-; mov x11, x3
+; mov x6, x0
+; ldp x7, x1, [x6, #16]
+; mov x11, x7
 ; stp x11, x1, [x0, #16]
-; mov x0, x3
+; mov x0, x7
 ; ret
 
 function %i128_imm_offset_large(i64) -> i128 {
@@ -262,11 +262,11 @@ block0(v0: i64):
 }
 
 ; block0:
-; mov x8, x0
-; ldp x3, x1, [x8, #504]
-; mov x11, x3
+; mov x6, x0
+; ldp x7, x1, [x6, #504]
+; mov x11, x7
 ; stp x11, x1, [x0, #504]
-; mov x0, x3
+; mov x0, x7
 ; ret
 
 function %i128_imm_offset_negative_large(i64) -> i128 {
@@ -277,11 +277,11 @@ block0(v0: i64):
 }
 
 ; block0:
-; mov x8, x0
-; ldp x3, x1, [x8, #-512]
-; mov x11, x3
+; mov x6, x0
+; ldp x7, x1, [x6, #-512]
+; mov x11, x7
 ; stp x11, x1, [x0, #-512]
-; mov x0, x3
+; mov x0, x7
 ; ret
 
 function %i128_add_offset(i64) -> i128 {
@@ -293,11 +293,11 @@ block0(v0: i64):
 }
 
 ; block0:
-; mov x8, x0
-; ldp x3, x1, [x8, #32]
-; mov x11, x3
+; mov x6, x0
+; ldp x7, x1, [x6, #32]
+; mov x11, x7
 ; stp x11, x1, [x0, #32]
-; mov x0, x3
+; mov x0, x7
 ; ret
 
 function %i128_32bit_sextend_simple(i32) -> i128 {
@@ -309,11 +309,11 @@ block0(v0: i32):
 }
 
 ; block0:
-; sxtw x8, w0
-; ldp x4, x1, [x8]
-; sxtw x9, w0
-; mov x0, x4
-; stp x0, x1, [x9]
+; sxtw x6, w0
+; ldp x10, x1, [x6]
+; sxtw x7, w0
+; mov x0, x10
+; stp x0, x1, [x7]
 ; ret
 
 function %i128_32bit_sextend(i64, i32) -> i128 {
@@ -327,13 +327,13 @@ block0(v0: i64, v1: i32):
 }
 
 ; block0:
-; mov x10, x0
-; add x10, x10, x1, SXTW
-; ldp x6, x7, [x10, #24]
+; mov x8, x0
+; add x8, x8, x1, SXTW
+; ldp x10, x11, [x8, #24]
 ; add x0, x0, x1, SXTW
-; mov x15, x6
-; mov x1, x7
+; mov x15, x10
+; mov x1, x11
 ; stp x15, x1, [x0, #24]
-; mov x0, x6
+; mov x0, x10
 ; ret
diff --git a/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif b/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif
index c8056c3d9e0b..74080e6d4126 100644
--- a/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif
@@ -15,15 +15,15 @@ block0(v0: i64, v1: i32):
 
 ; block0:
 ; mov w10, w1
-; ldr x5, [x0]
-; mov x11, x5
+; ldr x11, [x0]
+; mov x11, x11
 ; subs xzr, x10, x11
 ; b.ls label1 ; b label2
 ; block1:
-; add x13, x0, x1, UXTW
+; add x12, x0, x1, UXTW
 ; subs xzr, x10, x11
-; movz x14, #0
-; csel x0, x14, x13, hi
+; movz x13, #0
+; csel x0, x13, x12, hi
 ; csdb
 ; ret
 ; block2:
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif
index 76eb32cb8e8f..2f4f35f574f9 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif
@@ -98,8 +98,8 @@ block0(v0: i64):
 }
 
 ; block0:
-; ldr w2, [x0]
-; fmov s0, w2
+; ldr w4, [x0]
+; fmov s0, w4
 ; ret
 
 function %load32_zero_int(i32) -> i32x4 {
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd.clif b/cranelift/filetests/filetests/isa/aarch64/simd.clif
index 166d27b80b08..b26811e6fad8 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd.clif
@@ -86,9 +86,9 @@ block0(v0: i64, v1: i64):
 }
 
 ; block0:
-; ldrb w4, [x0]
+; ldrb w8, [x0]
 ; ld1r { v0.16b }, [x1]
-; dup v1.16b, w4
+; dup v1.16b, w8
 ; ret
 
 function %f8(i64, i64) -> i8x16, i8x16 {
@@ -100,9 +100,9 @@ block0(v0: i64, v1: i64):
 }
 
 ; block0:
-; ldrb w4, [x0]
-; dup v0.16b, w4
-; dup v1.16b, w4
+; ldrb w8, [x0]
+; dup v0.16b, w8
+; dup v1.16b, w8
 ; ret
 
 function %f9() -> i32x2 {