diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 8abee1ddb0fb..54886b010ed2 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -287,6 +287,30 @@ fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -
         | machreg_to_vec(rd.to_reg())
 }
 
+fn enc_vec_rrr_long(
+    q: u32,
+    u: u32,
+    size: u32,
+    bit14: u32,
+    rm: Reg,
+    rn: Reg,
+    rd: Writable<Reg>,
+) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(size & 0b11, size);
+    debug_assert_eq!(bit14 & 0b1, bit14);
+
+    0b0_0_0_01110_00_1_00000_100000_00000_00000
+        | q << 30
+        | u << 29
+        | size << 22
+        | bit14 << 14
+        | (machreg_to_vec(rm) << 16)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+}
+
 fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
     (0b01011010110 << 21)
         | size << 31
@@ -2173,6 +2197,34 @@ impl MachInstEmit for Inst {
 
                 sink.put4(enc_vec_rr_pair(bits_12_16, rd, rn));
             }
+            &Inst::VecRRRLong {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                high_half,
+            } => {
+                let (u, size, bit14) = match alu_op {
+                    VecRRRLongOp::Smull8 => (0b0, 0b00, 0b1),
+                    VecRRRLongOp::Smull16 => (0b0, 0b01, 0b1),
+                    VecRRRLongOp::Smull32 => (0b0, 0b10, 0b1),
+                    VecRRRLongOp::Umull8 => (0b1, 0b00, 0b1),
+                    VecRRRLongOp::Umull16 => (0b1, 0b01, 0b1),
+                    VecRRRLongOp::Umull32 => (0b1, 0b10, 0b1),
+                    VecRRRLongOp::Umlal8 => (0b1, 0b00, 0b0),
+                    VecRRRLongOp::Umlal16 => (0b1, 0b01, 0b0),
+                    VecRRRLongOp::Umlal32 => (0b1, 0b10, 0b0),
+                };
+                sink.put4(enc_vec_rrr_long(
+                    high_half as u32,
+                    u,
+                    size,
+                    bit14,
+                    rm,
+                    rn,
+                    rd,
+                ));
+            }
             &Inst::VecRRR {
                 rd,
                 rn,
@@ -2242,13 +2294,7 @@
                     VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
                     VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
                     VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
-                    VecALUOp::Umlal => {
-                        debug_assert!(!size.is_128bits());
-                        (0b001_01110_00_1 | enc_size << 1, 0b100000)
-                    }
                     VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
-                    VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
-                    VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
                     VecALUOp::Sqrdmulh => {
                         debug_assert!(
                             size.lane_size() == ScalarSize::Size16
@@ -2258,12 +2304,12 @@
                         (0b001_01110_00_1 | enc_size << 1, 0b101101)
                     }
                 };
-                let top11 = match alu_op {
-                    VecALUOp::Smull | VecALUOp::Smull2 => top11,
-                    _ if is_float => top11 | (q << 9) | enc_float_size << 1,
-                    _ => top11 | (q << 9),
+                let top11 = if is_float {
+                    top11 | enc_float_size << 1
+                } else {
+                    top11
                 };
-                sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
+                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
             }
             &Inst::VecLoadReplicate { rd, rn, size } => {
                 let (q, size) = size.enc_size();
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 53f68a994a0b..d3afca2a777e 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3651,18 +3651,6 @@ fn test_aarch64_binemit() {
         "addp v8.4s, v12.4s, v14.4s",
     ));
 
-    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Umlal,
-            rd: writable_vreg(9),
-            rn: vreg(20),
-            rm: vreg(17),
-            size: VectorSize::Size32x2,
-        },
-        "8982B12E",
-        "umlal v9.2d, v20.2s, v17.2s",
-    ));
-
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::Zip1,
@@ -3712,77 +3700,221 @@ fn test_aarch64_binemit() {
     ));
 
     insns.push((
-
Inst::VecRRR { - alu_op: VecALUOp::Smull, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull8, rd: writable_vreg(16), rn: vreg(12), rm: vreg(1), - size: VectorSize::Size8x16, + high_half: false, }, "90C1210E", "smull v16.8h, v12.8b, v1.8b", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull8, + rd: writable_vreg(15), + rn: vreg(11), + rm: vreg(2), + high_half: false, + }, + "6FC1222E", + "umull v15.8h, v11.8b, v2.8b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal8, + rd: writable_vreg(4), + rn: vreg(8), + rm: vreg(16), + high_half: false, + }, + "0481302E", + "umlal v4.8h, v8.8b, v16.8b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: writable_vreg(2), rn: vreg(13), rm: vreg(6), - size: VectorSize::Size16x8, + high_half: false, }, "A2C1660E", "smull v2.4s, v13.4h, v6.4h", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull16, + rd: writable_vreg(3), + rn: vreg(14), + rm: vreg(7), + high_half: false, + }, + "C3C1672E", + "umull v3.4s, v14.4h, v7.4h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal16, + rd: writable_vreg(7), + rn: vreg(14), + rm: vreg(21), + high_half: false, + }, + "C781752E", + "umlal v7.4s, v14.4h, v21.4h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull32, rd: writable_vreg(8), rn: vreg(12), rm: vreg(14), - size: VectorSize::Size32x4, + high_half: false, }, "88C1AE0E", "smull v8.2d, v12.2s, v14.2s", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull2, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull32, + rd: writable_vreg(9), + rn: vreg(5), + rm: vreg(6), + high_half: false, + }, + "A9C0A62E", + "umull v9.2d, v5.2s, v6.2s", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal32, + rd: writable_vreg(9), + rn: vreg(20), + rm: vreg(17), + high_half: false, + }, + "8982B12E", + "umlal v9.2d, v20.2s, v17.2s", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull8, rd: writable_vreg(16), rn: vreg(12), rm: vreg(1), - size: VectorSize::Size8x16, + high_half: true, }, "90C1214E", "smull2 v16.8h, v12.16b, v1.16b", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull2, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull8, + rd: writable_vreg(29), + rn: vreg(22), + rm: vreg(10), + high_half: true, + }, + "DDC22A6E", + "umull2 v29.8h, v22.16b, v10.16b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal8, + rd: writable_vreg(1), + rn: vreg(5), + rm: vreg(15), + high_half: true, + }, + "A1802F6E", + "umlal2 v1.8h, v5.16b, v15.16b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: writable_vreg(2), rn: vreg(13), rm: vreg(6), - size: VectorSize::Size16x8, + high_half: true, }, "A2C1664E", "smull2 v2.4s, v13.8h, v6.8h", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull2, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull16, + rd: writable_vreg(19), + rn: vreg(18), + rm: vreg(17), + high_half: true, + }, + "53C2716E", + "umull2 v19.4s, v18.8h, v17.8h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal16, + rd: writable_vreg(11), + rn: vreg(10), + rm: vreg(12), + high_half: true, + }, + "4B816C6E", + "umlal2 v11.4s, v10.8h, v12.8h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull32, rd: writable_vreg(8), rn: vreg(12), rm: vreg(14), - size: VectorSize::Size32x4, + high_half: true, }, 
"88C1AE4E", "smull2 v8.2d, v12.4s, v14.4s", )); + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull32, + rd: writable_vreg(4), + rn: vreg(12), + rm: vreg(16), + high_half: true, + }, + "84C1B06E", + "umull2 v4.2d, v12.4s, v16.4s", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal32, + rd: writable_vreg(10), + rn: vreg(29), + rm: vreg(2), + high_half: true, + }, + "AA83A26E", + "umlal2 v10.2d, v29.4s, v2.4s", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Sqrdmulh, diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 0312a7626ece..8c993492bd2e 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -303,14 +303,8 @@ pub enum VecALUOp { Fmul, /// Add pairwise Addp, - /// Unsigned multiply add long - Umlal, /// Zip vectors (primary) [meaning, high halves] Zip1, - /// Signed multiply long (low halves) - Smull, - /// Signed multiply long (high halves) - Smull2, /// Signed saturating rounding doubling multiply returning high half Sqrdmulh, } @@ -402,6 +396,22 @@ pub enum VecRRNarrowOp { Fcvtn64, } +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecRRRLongOp { + /// Signed multiply long. + Smull8, + Smull16, + Smull32, + /// Unsigned multiply long. + Umull8, + Umull16, + Umull32, + /// Unsigned multiply add long + Umlal8, + Umlal16, + Umlal32, +} + /// A vector operation on a pair of elements with one register. #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub enum VecPairOp { @@ -1087,6 +1097,16 @@ pub enum Inst { rn: Reg, }, + /// 2-operand vector instruction that produces a result with twice the + /// lane width and half the number of lanes. + VecRRRLong { + alu_op: VecRRRLongOp, + rd: Writable, + rn: Reg, + rm: Reg, + high_half: bool, + }, + /// A vector ALU op. VecRRR { alu_op: VecALUOp, @@ -2134,10 +2154,22 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(rd); collector.add_use(rn); } + &Inst::VecRRRLong { + alu_op, rd, rn, rm, .. + } => { + match alu_op { + VecRRRLongOp::Umlal8 | VecRRRLongOp::Umlal16 | VecRRRLongOp::Umlal32 => { + collector.add_mod(rd) + } + _ => collector.add_def(rd), + }; + collector.add_use(rn); + collector.add_use(rm); + } &Inst::VecRRR { alu_op, rd, rn, rm, .. } => { - if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal { + if alu_op == VecALUOp::Bsl { collector.add_mod(rd); } else { collector.add_def(rd); @@ -2944,6 +2976,22 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { map_def(mapper, rd); map_use(mapper, rn); } + &mut Inst::VecRRRLong { + alu_op, + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + match alu_op { + VecRRRLongOp::Umlal8 | VecRRRLongOp::Umlal16 | VecRRRLongOp::Umlal32 => { + map_mod(mapper, rd) + } + _ => map_def(mapper, rd), + }; + map_use(mapper, rn); + map_use(mapper, rm); + } &mut Inst::VecRRR { alu_op, ref mut rd, @@ -2951,7 +2999,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { ref mut rm, .. 
} => { - if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal { + if alu_op == VecALUOp::Bsl { map_mod(mapper, rd); } else { map_def(mapper, rd); @@ -4147,24 +4195,80 @@ impl Inst { VecALUOp::Fmin => ("fmin", size), VecALUOp::Fmul => ("fmul", size), VecALUOp::Addp => ("addp", size), - VecALUOp::Umlal => ("umlal", size), VecALUOp::Zip1 => ("zip1", size), - VecALUOp::Smull => ("smull", size), - VecALUOp::Smull2 => ("smull2", size), VecALUOp::Sqrdmulh => ("sqrdmulh", size), }; - let rd_size = match alu_op { - VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(), - _ => size, - }; - let rn_size = match alu_op { - VecALUOp::Smull => size.halve(), - _ => size, + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = show_vreg_vector(rn, mb_rru, size); + let rm = show_vreg_vector(rm, mb_rru, size); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::VecRRRLong { + rd, + rn, + rm, + alu_op, + high_half, + } => { + let (op, dest_size, src_size) = match (alu_op, high_half) { + (VecRRRLongOp::Smull8, false) => { + ("smull", VectorSize::Size16x8, VectorSize::Size8x8) + } + (VecRRRLongOp::Smull8, true) => { + ("smull2", VectorSize::Size16x8, VectorSize::Size8x16) + } + (VecRRRLongOp::Smull16, false) => { + ("smull", VectorSize::Size32x4, VectorSize::Size16x4) + } + (VecRRRLongOp::Smull16, true) => { + ("smull2", VectorSize::Size32x4, VectorSize::Size16x8) + } + (VecRRRLongOp::Smull32, false) => { + ("smull", VectorSize::Size64x2, VectorSize::Size32x2) + } + (VecRRRLongOp::Smull32, true) => { + ("smull2", VectorSize::Size64x2, VectorSize::Size32x4) + } + (VecRRRLongOp::Umull8, false) => { + ("umull", VectorSize::Size16x8, VectorSize::Size8x8) + } + (VecRRRLongOp::Umull8, true) => { + ("umull2", VectorSize::Size16x8, VectorSize::Size8x16) + } + (VecRRRLongOp::Umull16, false) => { + ("umull", VectorSize::Size32x4, VectorSize::Size16x4) + } + (VecRRRLongOp::Umull16, true) => { + ("umull2", VectorSize::Size32x4, VectorSize::Size16x8) + } + (VecRRRLongOp::Umull32, false) => { + ("umull", VectorSize::Size64x2, VectorSize::Size32x2) + } + (VecRRRLongOp::Umull32, true) => { + ("umull2", VectorSize::Size64x2, VectorSize::Size32x4) + } + (VecRRRLongOp::Umlal8, false) => { + ("umlal", VectorSize::Size16x8, VectorSize::Size8x8) + } + (VecRRRLongOp::Umlal8, true) => { + ("umlal2", VectorSize::Size16x8, VectorSize::Size8x16) + } + (VecRRRLongOp::Umlal16, false) => { + ("umlal", VectorSize::Size32x4, VectorSize::Size16x4) + } + (VecRRRLongOp::Umlal16, true) => { + ("umlal2", VectorSize::Size32x4, VectorSize::Size16x8) + } + (VecRRRLongOp::Umlal32, false) => { + ("umlal", VectorSize::Size64x2, VectorSize::Size32x2) + } + (VecRRRLongOp::Umlal32, true) => { + ("umlal2", VectorSize::Size64x2, VectorSize::Size32x4) + } }; - let rm_size = rn_size; - let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size); - let rn = show_vreg_vector(rn, mb_rru, rn_size); - let rm = show_vreg_vector(rm, mb_rru, rm_size); + let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest_size); + let rn = show_vreg_vector(rn, mb_rru, src_size); + let rm = show_vreg_vector(rm, mb_rru, src_size); format!("{} {}, {}, {}", op, rd, rn, rm) } &Inst::VecMisc { op, rd, rn, size } => { diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index ededece15cd1..12535cf3826c 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1253,6 +1253,153 @@ pub(crate) fn maybe_input_insn_via_conv>( None } +/// Pattern match an extending vector 
multiplication.
+/// Returns a tuple of the opcode to use, the two input registers and whether
+/// it's the 'high half' version of the instruction.
+pub(crate) fn match_vec_long_mul<C: LowerCtx<I = Inst>>(
+    c: &mut C,
+    insn: IRInst,
+    ext_op: Opcode,
+) -> Option<(VecRRRLongOp, regalloc::Reg, regalloc::Reg, bool)> {
+    let inputs = insn_inputs(c, insn);
+    if let Some(lhs) = maybe_input_insn(c, inputs[0], ext_op) {
+        if let Some(rhs) = maybe_input_insn(c, inputs[1], ext_op) {
+            let lhs_input = insn_inputs(c, lhs)[0];
+            let rhs_input = insn_inputs(c, rhs)[0];
+            let rn = put_input_in_reg(c, lhs_input, NarrowValueMode::None);
+            let rm = put_input_in_reg(c, rhs_input, NarrowValueMode::None);
+            let lane_type = c.output_ty(insn, 0).lane_type();
+            match (lane_type, ext_op) {
+                (I16, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull8, rn, rm, false)),
+                (I16, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull8, rn, rm, true)),
+                (I16, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull8, rn, rm, false)),
+                (I16, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull8, rn, rm, true)),
+                (I32, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull16, rn, rm, false)),
+                (I32, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull16, rn, rm, true)),
+                (I32, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull16, rn, rm, false)),
+                (I32, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull16, rn, rm, true)),
+                (I64, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull32, rn, rm, false)),
+                (I64, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull32, rn, rm, true)),
+                (I64, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull32, rn, rm, false)),
+                (I64, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull32, rn, rm, true)),
+                _ => {}
+            };
+        }
+    }
+    None
+}
+
+pub(crate) fn lower_i64x2_mul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst) {
+    let inputs = insn_inputs(c, insn);
+    let outputs = insn_outputs(c, insn);
+    let rd = get_output_reg(c, outputs[0]).regs()[0];
+    let rn = put_input_in_regs(c, inputs[0]).regs()[0];
+    let rm = put_input_in_regs(c, inputs[1]).regs()[0];
+
+    let tmp1 = c.alloc_tmp(I64X2).only_reg().unwrap();
+    let tmp2 = c.alloc_tmp(I64X2).only_reg().unwrap();
+
+    // This I64X2 multiplication is performed with several 32-bit
+    // operations.
+
+    // 64-bit numbers x and y, can be represented as:
+    // x = a + 2^32(b)
+    // y = c + 2^32(d)
+
+    // A 64-bit multiplication is:
+    // x * y = ac + 2^32(ad + bc) + 2^64(bd)
+    // note: `2^64(bd)` can be ignored, the value is too large to fit in
+    // 64 bits.
+
+    // This sequence implements a I64X2 multiply, where the registers
+    // `rn` and `rm` are split up into 32-bit components:
+    // rn = |d|c|b|a|
+    // rm = |h|g|f|e|
+    //
+    // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
+    //
+    // The sequence is:
+    // rev64 rd.4s, rm.4s
+    // mul rd.4s, rd.4s, rn.4s
+    // xtn tmp1.2s, rn.2d
+    // addp rd.4s, rd.4s, rd.4s
+    // xtn tmp2.2s, rm.2d
+    // shll rd.2d, rd.2s, #32
+    // umlal rd.2d, tmp2.2s, tmp1.2s
+
+    // Reverse the 32-bit elements in the 64-bit words.
+    // rd = |g|h|e|f|
+    c.emit(Inst::VecMisc {
+        op: VecMisc2::Rev64,
+        rd,
+        rn: rm,
+        size: VectorSize::Size32x4,
+    });
+
+    // Calculate the high half components.
+    // rd = |dg|ch|be|af|
+    //
+    // Note that this 32-bit multiply of the high half
+    // discards the bits that would overflow, same as
+    // if 64-bit operations were used. Also the Shll
+    // below would shift out the overflow bits anyway.
+ c.emit(Inst::VecRRR { + alu_op: VecALUOp::Mul, + rd, + rn: rd.to_reg(), + rm: rn, + size: VectorSize::Size32x4, + }); + + // Extract the low half components of rn. + // tmp1 = |c|a| + c.emit(Inst::VecRRNarrow { + op: VecRRNarrowOp::Xtn64, + rd: tmp1, + rn, + high_half: false, + }); + + // Sum the respective high half components. + // rd = |dg+ch|be+af||dg+ch|be+af| + c.emit(Inst::VecRRR { + alu_op: VecALUOp::Addp, + rd: rd, + rn: rd.to_reg(), + rm: rd.to_reg(), + size: VectorSize::Size32x4, + }); + + // Extract the low half components of rm. + // tmp2 = |g|e| + c.emit(Inst::VecRRNarrow { + op: VecRRNarrowOp::Xtn64, + rd: tmp2, + rn: rm, + high_half: false, + }); + + // Shift the high half components, into the high half. + // rd = |dg+ch << 32|be+af << 32| + c.emit(Inst::VecRRLong { + op: VecRRLongOp::Shll32, + rd, + rn: rd.to_reg(), + high_half: false, + }); + + // Multiply the low components together, and accumulate with the high + // half. + // rd = |rd[1] + cg|rd[0] + ae| + c.emit(Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal32, + rd, + rn: tmp2.to_reg(), + rm: tmp1.to_reg(), + high_half: false, + }); +} + /// Specifies what [lower_icmp] should do when lowering #[derive(Debug, Clone, PartialEq)] pub(crate) enum IcmpOutput { diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 2ea314b726f5..754e2f7b9501 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -244,174 +244,79 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Imul => { - let lhs = put_input_in_regs(ctx, inputs[0]); - let rhs = put_input_in_regs(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); - - let rd = dst.regs()[0]; - let rn = lhs.regs()[0]; - let rm = rhs.regs()[0]; - let ty = ty.unwrap(); - match ty { - I128 => { - assert_eq!(lhs.len(), 2); - assert_eq!(rhs.len(), 2); - assert_eq!(dst.len(), 2); - - // 128bit mul formula: - // dst_lo = lhs_lo * rhs_lo - // dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo) - // - // We can convert the above formula into the following - // umulh dst_hi, lhs_lo, rhs_lo - // madd dst_hi, lhs_lo, rhs_hi, dst_hi - // madd dst_hi, lhs_hi, rhs_lo, dst_hi - // mul dst_lo, lhs_lo, rhs_lo - - ctx.emit(Inst::AluRRR { - alu_op: ALUOp::UMulH, - rd: dst.regs()[1], - rn: lhs.regs()[0], - rm: rhs.regs()[0], - }); - ctx.emit(Inst::AluRRRR { - alu_op: ALUOp3::MAdd64, - rd: dst.regs()[1], - rn: lhs.regs()[0], - rm: rhs.regs()[1], - ra: dst.regs()[1].to_reg(), - }); - ctx.emit(Inst::AluRRRR { - alu_op: ALUOp3::MAdd64, - rd: dst.regs()[1], - rn: lhs.regs()[1], - rm: rhs.regs()[0], - ra: dst.regs()[1].to_reg(), - }); - ctx.emit(Inst::AluRRRR { - alu_op: ALUOp3::MAdd64, - rd: dst.regs()[0], - rn: lhs.regs()[0], - rm: rhs.regs()[0], - ra: zero_reg(), - }); - } - ty if !ty.is_vector() => { - let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); - ctx.emit(Inst::AluRRRR { - alu_op, - rd, - rn, - rm, - ra: zero_reg(), - }); - } - I64X2 => { - let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); - - // This I64X2 multiplication is performed with several 32-bit - // operations. - - // 64-bit numbers x and y, can be represented as: - // x = a + 2^32(b) - // y = c + 2^32(d) - - // A 64-bit multiplication is: - // x * y = ac + 2^32(ad + bc) + 2^64(bd) - // note: `2^64(bd)` can be ignored, the value is too large to fit in - // 64 bits. 
- - // This sequence implements a I64X2 multiply, where the registers - // `rn` and `rm` are split up into 32-bit components: - // rn = |d|c|b|a| - // rm = |h|g|f|e| - // - // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)| - // - // The sequence is: - // rev64 rd.4s, rm.4s - // mul rd.4s, rd.4s, rn.4s - // xtn tmp1.2s, rn.2d - // addp rd.4s, rd.4s, rd.4s - // xtn tmp2.2s, rm.2d - // shll rd.2d, rd.2s, #32 - // umlal rd.2d, tmp2.2s, tmp1.2s - - // Reverse the 32-bit elements in the 64-bit words. - // rd = |g|h|e|f| - ctx.emit(Inst::VecMisc { - op: VecMisc2::Rev64, - rd, - rn: rm, - size: VectorSize::Size32x4, - }); - - // Calculate the high half components. - // rd = |dg|ch|be|af| - // - // Note that this 32-bit multiply of the high half - // discards the bits that would overflow, same as - // if 64-bit operations were used. Also the Shll - // below would shift out the overflow bits anyway. - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Mul, - rd, - rn: rd.to_reg(), - rm: rn, - size: VectorSize::Size32x4, - }); - - // Extract the low half components of rn. - // tmp1 = |c|a| - ctx.emit(Inst::VecRRNarrow { - op: VecRRNarrowOp::Xtn64, - rd: tmp1, - rn, - high_half: false, - }); - - // Sum the respective high half components. - // rd = |dg+ch|be+af||dg+ch|be+af| - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Addp, - rd: rd, - rn: rd.to_reg(), - rm: rd.to_reg(), - size: VectorSize::Size32x4, - }); - - // Extract the low half components of rm. - // tmp2 = |g|e| - ctx.emit(Inst::VecRRNarrow { - op: VecRRNarrowOp::Xtn64, - rd: tmp2, - rn: rm, - high_half: false, - }); + if ty == I128 { + let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + assert_eq!(lhs.len(), 2); + assert_eq!(rhs.len(), 2); + assert_eq!(dst.len(), 2); - // Shift the high half components, into the high half. - // rd = |dg+ch << 32|be+af << 32| - ctx.emit(Inst::VecRRLong { - op: VecRRLongOp::Shll32, - rd, - rn: rd.to_reg(), - high_half: false, - }); + // 128bit mul formula: + // dst_lo = lhs_lo * rhs_lo + // dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo) + // + // We can convert the above formula into the following + // umulh dst_hi, lhs_lo, rhs_lo + // madd dst_hi, lhs_lo, rhs_hi, dst_hi + // madd dst_hi, lhs_hi, rhs_lo, dst_hi + // mul dst_lo, lhs_lo, rhs_lo - // Multiply the low components together, and accumulate with the high - // half. 
- // rd = |rd[1] + cg|rd[0] + ae| - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Umlal, - rd, - rn: tmp2.to_reg(), - rm: tmp1.to_reg(), - size: VectorSize::Size32x2, - }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::UMulH, + rd: dst.regs()[1], + rn: lhs.regs()[0], + rm: rhs.regs()[0], + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[1], + rn: lhs.regs()[0], + rm: rhs.regs()[1], + ra: dst.regs()[1].to_reg(), + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[1], + rn: lhs.regs()[1], + rm: rhs.regs()[0], + ra: dst.regs()[1].to_reg(), + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[0], + rn: lhs.regs()[0], + rm: rhs.regs()[0], + ra: zero_reg(), + }); + } else if ty.is_vector() { + for ext_op in &[ + Opcode::SwidenLow, + Opcode::SwidenHigh, + Opcode::UwidenLow, + Opcode::UwidenHigh, + ] { + if let Some((alu_op, rn, rm, high_half)) = + match_vec_long_mul(ctx, insn, *ext_op) + { + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::VecRRRLong { + alu_op, + rd, + rn, + rm, + high_half, + }); + return Ok(()); + } } - ty if ty.is_vector() => { + if ty == I64X2 { + lower_i64x2_mul(ctx, insn); + } else { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::VecRRR { alu_op: VecALUOp::Mul, rd, @@ -420,7 +325,18 @@ pub(crate) fn lower_insn_to_regs>( size: VectorSize::from_ty(ty), }); } - _ => panic!("Unable to emit mul for {}", ty), + } else { + let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra: zero_reg(), + }); } } @@ -2740,19 +2656,19 @@ pub(crate) fn lower_insn_to_regs>( // => smull tmp, a, b // smull2 y, a, b // addp y, tmp, y - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Smull, + ctx.emit(Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: tmp, rn: r_a, rm: r_b, - size: VectorSize::Size16x8, + high_half: false, }); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Smull2, + ctx.emit(Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: r_y, rn: r_a, rm: r_b, - size: VectorSize::Size16x8, + high_half: true, }); ctx.emit(Inst::VecRRR { alu_op: VecALUOp::Addp, diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif b/cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif new file mode 100644 index 000000000000..ca9b3e2fae21 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif @@ -0,0 +1,159 @@ +test compile +set unwind_info=false +target aarch64 + +function %fn1(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = swiden_low v0 + v3 = swiden_low v1 + v4 = imul v2, v3 + return v4 +} + +; check-not: sxtl +; check: smull v0.8h, v0.8b, v1.8b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn2(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = swiden_high v0 + v3 = swiden_high v1 + v4 = imul v2, v3 + return v4 +} + +; check-not: sxtl +; check: smull2 v0.8h, v0.16b, v1.16b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn3(i16x8, i16x8) -> i32x4 { +block0(v0: i16x8, v1: i16x8): + v2 = swiden_low v0 + v3 = swiden_low v1 + v4 = imul v2, v3 + return v4 +} + +; check-not: sxtl +; 
check: smull v0.4s, v0.4h, v1.4h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn4(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: sxtl
+; check: smull2 v0.4s, v0.8h, v1.8h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn5(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: sxtl
+; check: smull v0.2d, v0.2s, v1.2s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn6(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: sxtl
+; check: smull2 v0.2d, v0.4s, v1.4s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn7(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull v0.8h, v0.8b, v1.8b
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn8(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull2 v0.8h, v0.16b, v1.16b
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn9(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull v0.4s, v0.4h, v1.4h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn10(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull2 v0.4s, v0.8h, v1.8h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn11(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull v0.2d, v0.2s, v1.2s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn12(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl2
+; check: umull2 v0.2d, v0.4s, v1.4s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 8be5b24d8ec9..864cb10f9d66 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1908,7 +1908,6 @@ pub fn translate_operator(
         }
         Operator::I16x8Q15MulrSatS => {
             let (a, b) = pop2_with_bitcast(state, I16X8, builder);
-            state.push1(builder.ins().sqmul_round_sat(a, b));
             state.push1(builder.ins().sqmul_round_sat(a, b))
         }
         Operator::I16x8ExtMulLowI8x16S => {