Introduce the Cranelift IR instruction LoadSplat
It corresponds to WebAssembly's `load*_splat` operations, which
were previously represented as a combination of `Load` and `Splat`
instructions. However, some architectures, such as Armv8-A, have
a single machine instruction equivalent to the Wasm operations.
To generate it, the backend would have to merge the `Load` and
the `Splat`, which is not possible because the load may have side
effects (for example, a trap). The new IR instruction works around
this limitation.

The AArch64 backend leverages the new instruction to improve code
generation.

Copyright (c) 2020, Arm Limited.
akirilov-arm committed Oct 7, 2020
1 parent 84ac3fe commit 9b14e57
Showing 7 changed files with 182 additions and 37 deletions.
18 changes: 18 additions & 0 deletions cranelift/codegen/meta/src/shared/instructions.rs
@@ -843,6 +843,24 @@ pub(crate) fn define(
.can_load(true),
);

let a = &Operand::new("a", TxN);

ig.push(
Inst::new(
"load_splat",
r#"
Load an element from memory at ``p + Offset`` and return a vector
whose lanes are all set to that element.
This is equivalent to ``load`` followed by ``splat``.
"#,
&formats.load,
)
.operands_in(vec![MemFlags, p, Offset])
.operands_out(vec![a])
.can_load(true),
);

ig.push(
Inst::new(
"store",
15 changes: 15 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -680,4 +680,19 @@ impl VectorSize {
_ => *self,
}
}

/// Return the Q bit and the `size` field used by the encodings of
/// some SIMD instructions for this particular operand size.
pub fn enc_size(&self) -> (u32, u32) {
let q = self.is_128bits() as u32;
let size = match self.lane_size() {
ScalarSize::Size8 => 0b00,
ScalarSize::Size16 => 0b01,
ScalarSize::Size32 => 0b10,
ScalarSize::Size64 => 0b11,
_ => unreachable!(),
};

(q, size)
}
}
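
(An illustrative, standalone model of the same mapping; the bit values
follow the Armv8-A Advanced SIMD encodings, and this free function is
not part of the patch:)

// Model of `VectorSize::enc_size`: Q selects a 64- or 128-bit vector,
// and `size` encodes the lane width.
fn enc_size_model(lane_bits: u32, is_128_bits: bool) -> (u32, u32) {
    let q = is_128_bits as u32;
    let size = match lane_bits {
        8 => 0b00,
        16 => 0b01,
        32 => 0b10,
        64 => 0b11,
        _ => unreachable!("unsupported lane size"),
    };
    (q, size)
}

fn main() {
    assert_eq!(enc_size_model(64, true), (1, 0b11)); // e.g. a v31.2d operand
    assert_eq!(enc_size_model(8, false), (0, 0b00)); // e.g. a v0.8b operand
}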
70 changes: 49 additions & 21 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
(op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
}

fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
debug_assert_eq!(q & 0b1, q);
debug_assert_eq!(size & 0b11, size);
0b0_0_0011010_10_00000_110_0_00_00000_00000
| q << 30
| size << 10
| machreg_to_gpr(rn) << 5
| machreg_to_vec(rt.to_reg())
}
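
(A quick sanity check of this bit layout, assuming register numbers
x0 = 0 and v31 = 31; the expected word corresponds to the
`ld1r { v31.2d }, [x0]` test added below:)

fn main() {
    // ld1r { v31.2d }, [x0]: Q = 1, size = 0b11, Rn = x0, Rt = v31.
    let (q, size, rn, rt) = (1u32, 0b11u32, 0u32, 31u32);
    let insn = 0b0_0_0011010_10_00000_110_0_00_00000_00000u32
        | q << 30
        | size << 10
        | rn << 5
        | rt;
    assert_eq!(insn, 0x4D40CC1F);
}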

fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
(top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
}
@@ -1364,14 +1374,7 @@ impl MachInstEmit for Inst {
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
}
&Inst::VecMisc { op, rd, rn, size } => {
let enc_size = match size.lane_size() {
ScalarSize::Size8 => 0b00,
ScalarSize::Size16 => 0b01,
ScalarSize::Size32 => 0b10,
ScalarSize::Size64 => 0b11,
_ => unreachable!(),
};
let q = if size.is_128bits() { 1 } else { 0 };
let (q, enc_size) = size.enc_size();
let (u, bits_12_16, size) = match op {
VecMisc2::Not => (0b1, 0b00101, 0b00),
VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1740,13 +1743,7 @@ impl MachInstEmit for Inst {
alu_op,
size,
} => {
let enc_size = match size.lane_size() {
ScalarSize::Size8 => 0b00,
ScalarSize::Size16 => 0b01,
ScalarSize::Size32 => 0b10,
ScalarSize::Size64 => 0b11,
_ => unreachable!(),
};
let (_, enc_size) = size.enc_size();
let is_float = match alu_op {
VecALUOp::Fcmeq
| VecALUOp::Fcmgt
@@ -1819,6 +1816,21 @@
};
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate {
rd,
rn,
srcloc,
size,
} => {
let (q, size) = size.enc_size();

if let Some(srcloc) = srcloc {
// Register the offset at which the actual load instruction starts.
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}

sink.put4(enc_ldst_vec(q, size, rn, rd));
}
&Inst::MovToNZCV { rn } => {
sink.put4(0xd51b4200 | machreg_to_gpr(rn));
}
@@ -2103,9 +2115,12 @@ impl MachInstEmit for Inst {
inst.emit(sink, flags, state);
}

let (reg, offset) = match mem {
AMode::Unscaled(r, simm9) => (r, simm9.value()),
AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
let (reg, index_reg, offset) = match mem {
AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
AMode::UnsignedOffset(r, uimm12scaled) => {
(r, None, uimm12scaled.value() as i32)
}
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
@@ -2119,9 +2134,22 @@
ALUOp::Add64
};

if offset == 0 {
let mov = Inst::mov(rd, reg);
mov.emit(sink, flags, state);
if let Some((idx, extendop)) = index_reg {
let add = Inst::AluRRRExtend {
alu_op: ALUOp::Add64,
rd,
rn: reg,
rm: idx,
extendop,
};

add.emit(sink, flags, state);
} else if offset == 0 {
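// The address is already in a register; emit the move only when the destination differs.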
if reg != rd.to_reg() {
let mov = Inst::mov(rd, reg);

mov.emit(sink, flags, state);
}
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,
22 changes: 22 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3508,6 +3508,28 @@ fn test_aarch64_binemit() {
"tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
));

insns.push((
Inst::VecLoadReplicate {
rd: writable_vreg(31),
rn: xreg(0),
srcloc: None,
size: VectorSize::Size64x2,
},
"1FCC404D",
"ld1r { v31.2d }, [x0]",
));

insns.push((
Inst::VecLoadReplicate {
rd: writable_vreg(0),
rn: xreg(25),
srcloc: None,
size: VectorSize::Size8x8,
},
"20C3400D",
"ld1r { v0.8b }, [x25]",
));
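
(The expected hex strings store the instruction word's bytes in memory
order, i.e. little-endian; a sketch of the decode for the first case:)

fn main() {
    let word = u32::from_le_bytes([0x1F, 0xCC, 0x40, 0x4D]); // "1FCC404D"
    assert_eq!(word, 0x4D40CC1F); // ld1r { v31.2d }, [x0]
}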

insns.push((
Inst::Extend {
rd: writable_xreg(1),
29 changes: 27 additions & 2 deletions cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -975,6 +975,13 @@ pub enum Inst {
is_extension: bool,
},

VecLoadReplicate {
rd: Writable<Reg>,
rn: Reg,
srcloc: Option<SourceLoc>,
size: VectorSize,
},

/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
MovToNZCV {
rn: Reg,
@@ -1609,7 +1616,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
}
}

&Inst::VecLoadReplicate { rd, rn, .. } => {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
collector.add_use(rn);
collector.add_use(rm);
@@ -1762,8 +1772,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::LoadExtName { rd, .. } => {
collector.add_def(rd);
}
&Inst::LoadAddr { rd, mem: _ } => {
&Inst::LoadAddr { rd, ref mem } => {
collector.add_def(rd);
memarg_regs(mem, collector);
}
&Inst::VirtualSPOffsetAdj { .. } => {}
&Inst::EmitIsland { .. } => {}
@@ -2189,6 +2200,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_def(mapper, rd);
}
}
&mut Inst::VecLoadReplicate {
ref mut rd,
ref mut rn,
..
} => {
map_def(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::FpuCmp32 {
ref mut rn,
ref mut rm,
@@ -3412,6 +3431,12 @@ impl Inst {
let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
}
&Inst::VecLoadReplicate { rd, rn, size, .. } => {
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
let rn = rn.show_rru(mb_rru);

format!("ld1r {{ {} }}, [{}]", rd, rn)
}
&Inst::MovToNZCV { rn } => {
let rn = rn.show_rru(mb_rru);
format!("msr nzcv, {}", rn)
23 changes: 23 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}

Opcode::LoadSplat => {
let off = ctx.data(insn).load_store_offset().unwrap();
let ty = ty.unwrap();
let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
let memflags = ctx.memflags(insn).expect("memory flags");
let rd = get_output_reg(ctx, outputs[0]);
let size = VectorSize::from_ty(ty);
let srcloc = if memflags.notrap() {
None
} else {
Some(ctx.srcloc(insn))
};
let tmp = ctx.alloc_tmp(RegClass::I64, I64);

ctx.emit(Inst::LoadAddr { rd: tmp, mem });
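// LD1R supports only a plain base-register addressing mode, hence the full address is materialized into `tmp` above.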
ctx.emit(Inst::VecLoadReplicate {
rd,
rn: tmp.to_reg(),
srcloc,
size,
});
}

Opcode::Store
| Opcode::Istore8
| Opcode::Istore16
42 changes: 28 additions & 14 deletions cranelift/wasm/src/code_translator.rs
@@ -1376,19 +1376,33 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
| Operator::V128Load16Splat { memarg }
| Operator::V128Load32Splat { memarg }
| Operator::V128Load64Splat { memarg } => {
// TODO: For spec compliance, this is initially implemented as a combination of `load +
// splat` but could be implemented eventually as a single instruction (`load_splat`).
// See https://github.com/bytecodealliance/wasmtime/issues/1175.
translate_load(
memarg,
ir::Opcode::Load,
type_of(op).lane_type(),
builder,
state,
environ,
)?;
let splatted = builder.ins().splat(type_of(op), state.pop1());
state.push1(splatted)
if cfg!(target_arch = "aarch64") {
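// N.B.: `cfg!` tests the architecture the translator itself was compiled for (the host), which coincides with the translation target in the usual JIT setup.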
let opcode = ir::Opcode::LoadSplat;
let result_ty = type_of(op);
let (flags, base, offset) = prepare_load(
memarg,
mem_op_size(opcode, result_ty.lane_type()),
builder,
state,
environ,
)?;
let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
state.push1(dfg.first_result(load))
} else {
// TODO: For spec compliance, this is initially implemented as a combination of `load +
// splat` but could be implemented eventually as a single instruction (`load_splat`).
// See https://github.com/bytecodealliance/wasmtime/issues/1175.
translate_load(
memarg,
ir::Opcode::Load,
type_of(op).lane_type(),
builder,
state,
environ,
)?;
let splatted = builder.ins().splat(type_of(op), state.pop1());
state.push1(splatted)
}
}
Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
let vector = pop1_with_bitcast(state, type_of(op), builder);
@@ -2036,7 +2050,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
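// For `LoadSplat`, the `ty` passed in is the lane type, so this yields the size of the single element loaded.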
_ => panic!("unknown size of mem op for {:?}", opcode),
}
}