diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 2c16734f2766..7f3d5564f3d2 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -843,6 +843,24 @@ pub(crate) fn define(
         .can_load(true),
     );
 
+    let a = &Operand::new("a", TxN);
+
+    ig.push(
+        Inst::new(
+            "load_splat",
+            r#"
+        Load an element from memory at ``p + Offset`` and return a vector
+        whose lanes are all set to that element.
+
+        This is equivalent to ``load`` followed by ``splat``.
+        "#,
+            &formats.load,
+        )
+        .operands_in(vec![MemFlags, p, Offset])
+        .operands_out(vec![a])
+        .can_load(true),
+    );
+
     ig.push(
         Inst::new(
             "store",
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index f85c1028ff2e..95bf4bb63fb8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -680,4 +680,19 @@ impl VectorSize {
             _ => *self,
         }
     }
+
+    /// Return the encoding bits that are used by some SIMD instructions
+    /// for a particular operand size.
+    pub fn enc_size(&self) -> (u32, u32) {
+        let q = self.is_128bits() as u32;
+        let size = match self.lane_size() {
+            ScalarSize::Size8 => 0b00,
+            ScalarSize::Size16 => 0b01,
+            ScalarSize::Size32 => 0b10,
+            ScalarSize::Size64 => 0b11,
+            _ => unreachable!(),
+        };
+
+        (q, size)
+    }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index abcb822b1ac2..4a9376b860bb 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
     (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
 }
 
+fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(size & 0b11, size);
+    0b0_0_0011010_10_00000_110_0_00_00000_00000
+        | q << 30
+        | size << 10
+        | machreg_to_gpr(rn) << 5
+        | machreg_to_vec(rt.to_reg())
+}
+
 fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
 }
@@ -1364,14 +1374,7 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
             &Inst::VecMisc { op, rd, rn, size } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
-                let q = if size.is_128bits() { 1 } else { 0 };
+                let (q, enc_size) = size.enc_size();
                 let (u, bits_12_16, size) = match op {
                     VecMisc2::Not => (0b1, 0b00101, 0b00),
                     VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1740,13 +1743,7 @@ impl MachInstEmit for Inst {
                 alu_op,
                 size,
             } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
+                let (_, enc_size) = size.enc_size();
                 let is_float = match alu_op {
                     VecALUOp::Fcmeq
                     | VecALUOp::Fcmgt
@@ -1819,6 +1816,21 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
+            &Inst::VecLoadReplicate {
+                rd,
+                rn,
+                srcloc,
+                size,
+            } => {
+                let (q, size) = size.enc_size();
+
+                if let Some(srcloc) = srcloc {
+                    // Register the offset at which the actual load instruction starts.
+                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                }
+
+                sink.put4(enc_ldst_vec(q, size, rn, rd));
+            }
             &Inst::MovToNZCV { rn } => {
                 sink.put4(0xd51b4200 | machreg_to_gpr(rn));
             }
@@ -2103,9 +2115,12 @@ impl MachInstEmit for Inst {
                     inst.emit(sink, flags, state);
                 }
 
-                let (reg, offset) = match mem {
-                    AMode::Unscaled(r, simm9) => (r, simm9.value()),
-                    AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+                let (reg, index_reg, offset) = match mem {
+                    AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+                    AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+                    AMode::UnsignedOffset(r, uimm12scaled) => {
+                        (r, None, uimm12scaled.value() as i32)
+                    }
                     _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
                 };
                 let abs_offset = if offset < 0 {
@@ -2119,9 +2134,22 @@ impl MachInstEmit for Inst {
                     ALUOp::Add64
                 };
 
-                if offset == 0 {
-                    let mov = Inst::mov(rd, reg);
-                    mov.emit(sink, flags, state);
+                if let Some((idx, extendop)) = index_reg {
+                    let add = Inst::AluRRRExtend {
+                        alu_op: ALUOp::Add64,
+                        rd,
+                        rn: reg,
+                        rm: idx,
+                        extendop,
+                    };
+
+                    add.emit(sink, flags, state);
+                } else if offset == 0 {
+                    if reg != rd.to_reg() {
+                        let mov = Inst::mov(rd, reg);
+
+                        mov.emit(sink, flags, state);
+                    }
                 } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
                     let add = Inst::AluRRImm12 {
                         alu_op,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 1a29ba96591a..442cf2a7aea3 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3508,6 +3508,28 @@ fn test_aarch64_binemit() {
         "tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
     ));
 
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(31),
+            rn: xreg(0),
+            srcloc: None,
+            size: VectorSize::Size64x2,
+        },
+        "1FCC404D",
+        "ld1r { v31.2d }, [x0]",
+    ));
+
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(0),
+            rn: xreg(25),
+            srcloc: None,
+            size: VectorSize::Size8x8,
+        },
+        "20C3400D",
+        "ld1r { v0.8b }, [x25]",
+    ));
+
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 544d04c23cec..1d8893f6465b 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -975,6 +975,13 @@ pub enum Inst {
         is_extension: bool,
     },
 
+    VecLoadReplicate {
+        rd: Writable<Reg>,
+        rn: Reg,
+        srcloc: Option<SourceLoc>,
+        size: VectorSize,
+    },
+
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
     MovToNZCV {
         rn: Reg,
@@ -1609,7 +1616,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 collector.add_def(rd);
             }
         }
-
+        &Inst::VecLoadReplicate { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
             collector.add_use(rn);
             collector.add_use(rm);
@@ -1762,8 +1772,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
         &Inst::LoadExtName { rd, .. } => {
             collector.add_def(rd);
         }
-        &Inst::LoadAddr { rd, mem: _ } => {
+        &Inst::LoadAddr { rd, ref mem } => {
             collector.add_def(rd);
+            memarg_regs(mem, collector);
         }
         &Inst::VirtualSPOffsetAdj { .. } => {}
         &Inst::EmitIsland { .. } => {}
@@ -2189,6 +2200,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                 map_def(mapper, rd);
            }
        }
+        &mut Inst::VecLoadReplicate {
+            ref mut rd,
+            ref mut rn,
+            ..
+ } => { + map_def(mapper, rd); + map_use(mapper, rn); + } &mut Inst::FpuCmp32 { ref mut rn, ref mut rm, @@ -3412,6 +3431,12 @@ impl Inst { let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16); format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm) } + &Inst::VecLoadReplicate { rd, rn, size, .. } => { + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = rn.show_rru(mb_rru); + + format!("ld1r {{ {} }}, [{}]", rd, rn) + } &Inst::MovToNZCV { rn } => { let rn = rn.show_rru(mb_rru); format!("msr nzcv, {}", rn) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index fc28cb35818e..f49dcddc0c9e 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs>( } } + Opcode::LoadSplat => { + let off = ctx.data(insn).load_store_offset().unwrap(); + let ty = ty.unwrap(); + let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off); + let memflags = ctx.memflags(insn).expect("memory flags"); + let rd = get_output_reg(ctx, outputs[0]); + let size = VectorSize::from_ty(ty); + let srcloc = if memflags.notrap() { + None + } else { + Some(ctx.srcloc(insn)) + }; + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + + ctx.emit(Inst::LoadAddr { rd: tmp, mem }); + ctx.emit(Inst::VecLoadReplicate { + rd, + rn: tmp.to_reg(), + srcloc, + size, + }); + } + Opcode::Store | Opcode::Istore8 | Opcode::Istore16 diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index f1ef84917758..8c40a4660418 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1376,19 +1376,33 @@ pub fn translate_operator( | Operator::V128Load16Splat { memarg } | Operator::V128Load32Splat { memarg } | Operator::V128Load64Splat { memarg } => { - // TODO: For spec compliance, this is initially implemented as a combination of `load + - // splat` but could be implemented eventually as a single instruction (`load_splat`). - // See https://github.com/bytecodealliance/wasmtime/issues/1175. - translate_load( - memarg, - ir::Opcode::Load, - type_of(op).lane_type(), - builder, - state, - environ, - )?; - let splatted = builder.ins().splat(type_of(op), state.pop1()); - state.push1(splatted) + if cfg!(target_arch = "aarch64") { + let opcode = ir::Opcode::LoadSplat; + let result_ty = type_of(op); + let (flags, base, offset) = prepare_load( + memarg, + mem_op_size(opcode, result_ty.lane_type()), + builder, + state, + environ, + )?; + let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base); + state.push1(dfg.first_result(load)) + } else { + // TODO: For spec compliance, this is initially implemented as a combination of `load + + // splat` but could be implemented eventually as a single instruction (`load_splat`). + // See https://github.com/bytecodealliance/wasmtime/issues/1175. 
+                translate_load(
+                    memarg,
+                    ir::Opcode::Load,
+                    type_of(op).lane_type(),
+                    builder,
+                    state,
+                    environ,
+                )?;
+                let splatted = builder.ins().splat(type_of(op), state.pop1());
+                state.push1(splatted)
+            }
         }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
@@ -2036,7 +2050,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
         ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
         ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
         ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
-        ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
+        ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
         _ => panic!("unknown size of mem op for {:?}", opcode),
     }
 }
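
A note for reviewers: the `"1FCC404D"`/`"20C3400D"` byte strings in the new emit tests can be cross-checked against `enc_ldst_vec` by hand. Below is a minimal standalone sketch of the same LD1R bit layout, not the backend code itself: plain register numbers stand in for `Reg`/`Writable<Reg>`, and the `(q, size)` arguments are what `VectorSize::enc_size` returns for the two tested arrangements.

```rust
// Standalone mirror of the LD1R bit layout used by enc_ldst_vec; plain
// u32 register numbers stand in for the backend's Reg/Writable<Reg>.
fn enc_ld1r(q: u32, size: u32, rn: u32, rt: u32) -> u32 {
    assert!(q <= 0b1 && size <= 0b11 && rn < 32 && rt < 32);
    0b0_0_0011010_10_00000_110_0_00_00000_00000 | q << 30 | size << 10 | rn << 5 | rt
}

fn main() {
    // VectorSize::Size64x2.enc_size() == (1, 0b11); ld1r { v31.2d }, [x0]
    assert_eq!(enc_ld1r(1, 0b11, 0, 31).to_le_bytes(), [0x1F, 0xCC, 0x40, 0x4D]); // "1FCC404D"
    // VectorSize::Size8x8.enc_size() == (0, 0b00); ld1r { v0.8b }, [x25]
    assert_eq!(enc_ld1r(0, 0b00, 25, 0).to_le_bytes(), [0x20, 0xC3, 0x40, 0x0D]); // "20C3400D"
    println!("LD1R encodings match the emit tests");
}
```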
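On the IR side, the translation now produces one of two shapes. Here is a hedged sketch of both against cranelift-frontend; `load_splat` is assumed to be the `InstBuilder` method generated from the new meta definition above (mirroring `load`'s signature), and the `I32X4`/`MemFlags::new()` choices are purely illustrative.

```rust
use cranelift_codegen::ir::{types::{I32, I32X4}, InstBuilder, MemFlags, Value};
use cranelift_frontend::FunctionBuilder;

// Pre-existing lowering of v128.load32_splat: a scalar load, then a splat.
fn load_then_splat(b: &mut FunctionBuilder, addr: Value) -> Value {
    let lane = b.ins().load(I32, MemFlags::new(), addr, 0);
    b.ins().splat(I32X4, lane)
}

// With this patch (on AArch64 hosts): a single IR instruction that the
// backend selects to ld1r. `load_splat` is the builder method assumed to
// be generated from the new definition in instructions.rs.
fn fused_load_splat(b: &mut FunctionBuilder, addr: Value) -> Value {
    b.ins().load_splat(I32X4, MemFlags::new(), addr, 0)
}
```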
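The `LoadAddr` changes are easy to skim past: the pseudo-instruction gains an `AMode::RegExtended` case (which `lower_address` can produce for the new opcode), and the `offset == 0` path no longer emits a redundant `mov rd, rd`. A simplified decision tree under those assumptions, with local stand-ins for the backend types:

```rust
// Illustrative-only summary of LoadAddr materialization after this patch;
// this enum is a local stand-in for the backend's AMode, not the real type.
enum AMode {
    RegExtended,         // base + extended index register
    Unscaled(i32),       // base + simm9
    UnsignedOffset(u32), // base + scaled uimm12
}

fn load_addr_plan(mem: &AMode, base_is_dest: bool) -> &'static str {
    let offset = match mem {
        // New case: fold the index register in with a single extending add.
        AMode::RegExtended => return "add rd, rn, rm, <extend>",
        AMode::Unscaled(simm9) => i64::from(*simm9),
        AMode::UnsignedOffset(uimm12) => i64::from(*uimm12),
    };
    if offset == 0 {
        // Also new: the mov is skipped when it would copy rd onto itself.
        if base_is_dest { "(nothing)" } else { "mov rd, rn" }
    } else if offset.unsigned_abs() < (1 << 12) {
        // Imm12::maybe_from_u64 also accepts a shifted-by-12 form; omitted here.
        "add/sub rd, rn, #imm12"
    } else {
        "mov rd, #abs_offset; add/sub rd, rn, rd"
    }
}

fn main() {
    assert_eq!(load_addr_plan(&AMode::Unscaled(0), true), "(nothing)");
    assert_eq!(load_addr_plan(&AMode::RegExtended, false), "add rd, rn, rm, <extend>");
}
```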