From e0b911a4dfda7fe1d16da1fb1d9ee8eaed97acd7 Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Wed, 7 Oct 2020 11:29:55 +0100
Subject: [PATCH] Introduce the Cranelift IR instruction `LoadSplat`

It corresponds to WebAssembly's `load*_splat` operations, which were
previously represented as a combination of `Load` and `Splat`
instructions. However, architectures such as Armv8-A have a single
machine instruction equivalent to the Wasm operations. Generating it
requires merging the `Load` and the `Splat` in the backend, which is
not possible in general because the load may have side effects. The
new IR instruction works around this limitation.

The AArch64 backend leverages the new instruction to improve code
generation.

Copyright (c) 2020, Arm Limited.
---
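As a sketch of the intended effect (illustrative CLIF only; the value
numbers and types below are invented), translating one of the
`load*_splat` operations previously produced a pair of instructions
along the lines of

    v1 = load.i32 v0
    v2 = splat.i32x4 v1

and now produces the single instruction

    v2 = load_splat.i32x4 v0

which the AArch64 backend lowers via the new `VecLoadReplicate`
machine instruction to an `ld1r` (e.g. `ld1r { v2.4s }, [x0]`) instead
of a separate scalar load and a `dup`.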
 .../codegen/meta/src/isa/x86/legalize.rs      |   2 +
 .../codegen/meta/src/shared/instructions.rs   |  19 +++
 .../codegen/src/isa/aarch64/inst/args.rs      |  15 ++
 .../codegen/src/isa/aarch64/inst/emit.rs      | 145 +++++++++++-------
 .../src/isa/aarch64/inst/emit_tests.rs        |  34 +++-
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  30 +++-
 .../codegen/src/isa/aarch64/lower_inst.rs     |  23 +++
 cranelift/codegen/src/isa/x86/enc_tables.rs   |  28 ++++
 cranelift/wasm/src/code_translator.rs         |  16 +-
 9 files changed, 237 insertions(+), 75 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index de78c3b3b7f9..681b3104d552 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -396,6 +396,7 @@ fn define_simd(
     let insertlane = insts.by_name("insertlane");
     let ishl = insts.by_name("ishl");
     let ishl_imm = insts.by_name("ishl_imm");
+    let load_splat = insts.by_name("load_splat");
     let raw_bitcast = insts.by_name("raw_bitcast");
     let scalar_to_vector = insts.by_name("scalar_to_vector");
     let splat = insts.by_name("splat");
@@ -820,6 +821,7 @@ fn define_simd(
     narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
     narrow.custom_legalize(fmin, "expand_minmax_vector");
     narrow.custom_legalize(fmax, "expand_minmax_vector");
+    narrow.custom_legalize(load_splat, "expand_load_splat");
 
     narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
     narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 2c16734f2766..9cb77493c705 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -4409,5 +4409,24 @@ pub(crate) fn define(
         .other_side_effects(true),
     );
 
+    let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address");
+    let a = &Operand::new("a", TxN);
+
+    ig.push(
+        Inst::new(
+            "load_splat",
+            r#"
+        Load an element from memory at ``p + Offset`` and return a vector
+        whose lanes are all set to that element.
+
+        This is equivalent to ``load`` followed by ``splat``.
+        "#,
+            &formats.load,
+        )
+        .operands_in(vec![MemFlags, p, Offset])
+        .operands_out(vec![a])
+        .can_load(true),
+    );
+
     ig.build()
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index f85c1028ff2e..95bf4bb63fb8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -680,4 +680,19 @@ impl VectorSize {
             _ => *self,
         }
     }
+
+    /// Return the encoding bits that are used by some SIMD instructions
+    /// for a particular operand size.
+    pub fn enc_size(&self) -> (u32, u32) {
+        let q = self.is_128bits() as u32;
+        let size = match self.lane_size() {
+            ScalarSize::Size8 => 0b00,
+            ScalarSize::Size16 => 0b01,
+            ScalarSize::Size32 => 0b10,
+            ScalarSize::Size64 => 0b11,
+            _ => unreachable!(),
+        };
+
+        (q, size)
+    }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index d422fdc24f36..124fd36c87e7 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
     (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
 }
 
+fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(size & 0b11, size);
+    0b0_0_0011010_10_00000_110_0_00_00000_00000
+        | q << 30
+        | size << 10
+        | machreg_to_gpr(rn) << 5
+        | machreg_to_vec(rt.to_reg())
+}
+
 fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
 }
@@ -1380,14 +1390,7 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
             &Inst::VecMisc { op, rd, rn, size } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
-                let q = if size.is_128bits() { 1 } else { 0 };
+                let (q, enc_size) = size.enc_size();
                 let (u, bits_12_16, size) = match op {
                     VecMisc2::Not => (0b1, 0b00101, 0b00),
                     VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1756,13 +1759,7 @@ impl MachInstEmit for Inst {
                 alu_op,
                 size,
             } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
+                let (q, enc_size) = size.enc_size();
                 let is_float = match alu_op {
                     VecALUOp::Fcmeq
                     | VecALUOp::Fcmgt
@@ -1776,6 +1773,7 @@ impl MachInstEmit for Inst {
                     _ => false,
                 };
                 let enc_float_size = match (is_float, size) {
+                    (true, VectorSize::Size32x2) => 0b0,
                     (true, VectorSize::Size32x4) => 0b0,
                     (true, VectorSize::Size64x2) => 0b1,
                     (true, _) => unimplemented!(),
@@ -1783,58 +1781,73 @@ impl MachInstEmit for Inst {
                 };
 
                 let (top11, bit15_10) = match alu_op {
-                    VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
-                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
-                    VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
-                    VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
+                    VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
+                    VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
+                    VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
+                    VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
                     // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
-                    VecALUOp::And => (0b010_01110_00_1, 0b000111),
-                    VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
-                    VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
-                    VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
-                    VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
-                    VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
-                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
-                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::And => (0b000_01110_00_1, 0b000111),
+                    VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
+                    VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
+                    VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
+                    VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
+                    VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
+                    VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
                     VecALUOp::Mul => {
                         debug_assert_ne!(size, VectorSize::Size64x2);
-                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
+                        (0b000_01110_00_1 | enc_size << 1, 0b100111)
                     }
-                    VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
-                    VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
-                    VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
-                    VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
-                    VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
-                    VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
-                    VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
-                    VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
+                    VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
+                    VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
+                    VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
+                    VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
+                    VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
+                    VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
+                    VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
+                    VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
                     VecALUOp::Umlal => {
                         debug_assert!(!size.is_128bits());
                         (0b001_01110_00_1 | enc_size << 1, 0b100000)
                     }
                 };
                 let top11 = if is_float {
-                    top11 | enc_float_size << 1
+                    top11 | (q << 9) | enc_float_size << 1
                 } else {
-                    top11
+                    top11 | (q << 9)
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
+            &Inst::VecLoadReplicate {
+                rd,
+                rn,
+                size,
+                srcloc,
+            } => {
+                let (q, size) = size.enc_size();
+
+                if let Some(srcloc) = srcloc {
+                    // Register the offset at which the actual load instruction starts.
+                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                }
+
+                sink.put4(enc_ldst_vec(q, size, rn, rd));
+            }
             &Inst::MovToNZCV { rn } => {
                 sink.put4(0xd51b4200 | machreg_to_gpr(rn));
             }
@@ -2119,9 +2132,12 @@ impl MachInstEmit for Inst {
                     inst.emit(sink, emit_info, state);
                 }
 
-                let (reg, offset) = match mem {
-                    AMode::Unscaled(r, simm9) => (r, simm9.value()),
-                    AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+                let (reg, index_reg, offset) = match mem {
+                    AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+                    AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+                    AMode::UnsignedOffset(r, uimm12scaled) => {
+                        (r, None, uimm12scaled.value() as i32)
+                    }
                     _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
                 };
                 let abs_offset = if offset < 0 {
@@ -2135,9 +2151,22 @@ impl MachInstEmit for Inst {
                     ALUOp::Add64
                 };
 
-                if offset == 0 {
-                    let mov = Inst::mov(rd, reg);
-                    mov.emit(sink, emit_info, state);
+                if let Some((idx, extendop)) = index_reg {
+                    let add = Inst::AluRRRExtend {
+                        alu_op: ALUOp::Add64,
+                        rd,
+                        rn: reg,
+                        rm: idx,
+                        extendop,
+                    };
+
+                    add.emit(sink, emit_info, state);
+                } else if offset == 0 {
+                    if reg != rd.to_reg() {
+                        let mov = Inst::mov(rd, reg);
+
+                        mov.emit(sink, emit_info, state);
+                    }
                 } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
                     let add = Inst::AluRRImm12 {
                         alu_op,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 6d981c2eaa93..48707610ffb6 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(28),
             rn: vreg(12),
             rm: vreg(4),
-            size: VectorSize::Size32x4,
+            size: VectorSize::Size32x2,
         },
-        "9CE5244E",
-        "fcmeq v28.4s, v12.4s, v4.4s",
+        "9CE5240E",
+        "fcmeq v28.2s, v12.2s, v4.2s",
     ));
 
     insns.push((
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(6),
             rn: vreg(9),
             rm: vreg(8),
-            size: VectorSize::Size8x16,
+            size: VectorSize::Size8x8,
         },
-        "2665286E",
-        "umax v6.16b, v9.16b, v8.16b",
+        "2665282E",
+        "umax v6.8b, v9.8b, v8.8b",
     ));
 
     insns.push((
@@ -3507,6 +3507,28 @@ fn test_aarch64_binemit() {
         "tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
     ));
 
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(31),
+            rn: xreg(0),
+            srcloc: None,
+            size: VectorSize::Size64x2,
+        },
+        "1FCC404D",
+        "ld1r { v31.2d }, [x0]",
+    ));
+
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(0),
+            rn: xreg(25),
+            srcloc: None,
+            size: VectorSize::Size8x8,
+        },
+        "20C3400D",
+        "ld1r { v0.8b }, [x25]",
+    ));
+
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 544d04c23cec..e9c0f15425a2 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -975,6 +975,14 @@ pub enum Inst {
         is_extension: bool,
     },
 
+    /// Load an element and replicate to all lanes of a vector.
+    VecLoadReplicate {
+        rd: Writable<Reg>,
+        rn: Reg,
+        size: VectorSize,
+        srcloc: Option<SourceLoc>,
+    },
+
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
     MovToNZCV {
         rn: Reg,
@@ -1609,7 +1617,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 collector.add_def(rd);
             }
         }
-
+        &Inst::VecLoadReplicate { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
             collector.add_use(rn);
             collector.add_use(rm);
@@ -1762,8 +1773,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
         &Inst::LoadExtName { rd, .. } => {
             collector.add_def(rd);
         }
-        &Inst::LoadAddr { rd, mem: _ } => {
+        &Inst::LoadAddr { rd, ref mem } => {
             collector.add_def(rd);
+            memarg_regs(mem, collector);
         }
         &Inst::VirtualSPOffsetAdj { .. } => {}
         &Inst::EmitIsland { .. } => {}
@@ -2189,6 +2201,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                 map_def(mapper, rd);
             }
         }
+        &mut Inst::VecLoadReplicate {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuCmp32 {
             ref mut rn,
             ref mut rm,
@@ -3412,6 +3432,12 @@ impl Inst {
                 let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
                 format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
            }
+            &Inst::VecLoadReplicate { rd, rn, size, .. } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = rn.show_rru(mb_rru);
+
+                format!("ld1r {{ {} }}, [{}]", rd, rn)
+            }
             &Inst::MovToNZCV { rn } => {
                 let rn = rn.show_rru(mb_rru);
                 format!("msr nzcv, {}", rn)
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index fc28cb35818e..ecdcb9c6d1d2 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::LoadSplat => {
+            let off = ctx.data(insn).load_store_offset().unwrap();
+            let ty = ty.unwrap();
+            let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
+            let memflags = ctx.memflags(insn).expect("memory flags");
+            let rd = get_output_reg(ctx, outputs[0]);
+            let size = VectorSize::from_ty(ty);
+            let srcloc = if memflags.notrap() {
+                None
+            } else {
+                Some(ctx.srcloc(insn))
+            };
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+
+            ctx.emit(Inst::LoadAddr { rd: tmp, mem });
+            ctx.emit(Inst::VecLoadReplicate {
+                rd,
+                rn: tmp.to_reg(),
+                size,
+                srcloc,
+            });
+        }
+
         Opcode::Store
         | Opcode::Istore8
        | Opcode::Istore16
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 72890cffd9f0..976f1581e3cf 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -1892,3 +1892,31 @@ fn expand_tls_value(
         unreachable!();
     }
 }
+
+fn expand_load_splat(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+
+    pos.use_srcloc(inst);
+
+    let (ptr, offset, flags) = match pos.func.dfg[inst] {
+        ir::InstructionData::Load {
+            opcode: ir::Opcode::LoadSplat,
+            arg,
+            offset,
+            flags,
+        } => (arg, offset, flags),
+        _ => panic!(
+            "Expected load_splat: {}",
+            pos.func.dfg.display_inst(inst, None)
+        ),
+    };
+    let ty = pos.func.dfg.ctrl_typevar(inst);
+    let load = pos.ins().load(ty.lane_type(), flags, ptr, offset);
+
+    pos.func.dfg.replace(inst).splat(ty, load);
+}
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 7c827802ba4a..ef1804cf12b5 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1380,19 +1380,17 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::V128Load16Splat { memarg }
         | Operator::V128Load32Splat { memarg }
         | Operator::V128Load64Splat { memarg } => {
-            // TODO: For spec compliance, this is initially implemented as a combination of `load +
-            // splat` but could be implemented eventually as a single instruction (`load_splat`).
-            // See https://github.com/bytecodealliance/wasmtime/issues/1175.
-            translate_load(
+            let opcode = ir::Opcode::LoadSplat;
+            let result_ty = type_of(op);
+            let (flags, base, offset) = prepare_load(
                 memarg,
-                ir::Opcode::Load,
-                type_of(op).lane_type(),
+                mem_op_size(opcode, result_ty.lane_type()),
                 builder,
                 state,
                 environ,
             )?;
-            let splatted = builder.ins().splat(type_of(op), state.pop1());
-            state.push1(splatted)
+            let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
+            state.push1(dfg.first_result(load))
         }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
@@ -2040,7 +2038,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
         ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
         ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
         ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
-        ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
+        ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
         _ => panic!("unknown size of mem op for {:?}", opcode),
     }
 }
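
On x86, `expand_load_splat` legalizes the new instruction back into the
load-plus-splat pair, so the code generated there is the same as
before. Schematically, in the same illustrative CLIF notation as above:

    v1 = load_splat.i32x4 v0

becomes

    v2 = load.i32 v0
    v1 = splat.i32x4 v2

where the original result value `v1` is reused, matching
`pos.func.dfg.replace(inst).splat(ty, load)`.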