From 124c458d7ea1724b48255f994a0477575a9b2863 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Fri, 6 Nov 2020 16:12:49 -0800 Subject: [PATCH] AArch64 SIMD: pattern-match load+splat into `LD1R` instruction. --- .../codegen/src/isa/aarch64/inst/args.rs | 13 ++ cranelift/codegen/src/isa/aarch64/inst/mod.rs | 11 ++ cranelift/codegen/src/isa/aarch64/lower.rs | 45 +++++ .../codegen/src/isa/aarch64/lower_inst.rs | 167 ++++++++---------- cranelift/codegen/src/isa/x64/lower.rs | 12 +- cranelift/codegen/src/machinst/inst_common.rs | 20 +++ cranelift/codegen/src/machinst/lower.rs | 7 +- .../filetests/filetests/isa/aarch64/simd.clif | 66 +++++++ 8 files changed, 239 insertions(+), 102 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 7bd181c86b03..66f0d071d48c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -209,6 +209,19 @@ impl AMode { pub fn label(label: MemLabel) -> AMode { AMode::Label(label) } + + /// Does the address resolve to just a register value, with no offset or + /// other computation? + pub fn is_reg(&self) -> Option { + match self { + &AMode::UnsignedOffset(r, uimm12) if uimm12.value() == 0 => Some(r), + &AMode::Unscaled(r, imm9) if imm9.value() == 0 => Some(r), + &AMode::RegOffset(r, off, _) if off == 0 => Some(r), + &AMode::FPOffset(off, _) if off == 0 => Some(fp_reg()), + &AMode::SPOffset(off, _) if off == 0 => Some(stack_reg()), + _ => None, + } + } } /// A memory argument to a load/store-pair. diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index c7112299f88f..857bdc98ac98 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -1482,6 +1482,17 @@ impl Inst { } } } + + /// Generate a LoadAddr instruction (load address of an amode into + /// register). 
Elides when possible (when amode is just a register). Returns + /// destination register: either `rd` or a register directly from the amode. + pub fn gen_load_addr(rd: Writable, mem: AMode) -> (Reg, Option) { + if let Some(r) = mem.is_reg() { + (r, None) + } else { + (rd.to_reg(), Some(Inst::LoadAddr { rd, mem })) + } + } } //============================================================================= diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 2ff739bfb954..d6555b01c8aa 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1157,6 +1157,51 @@ pub(crate) fn normalize_bool_result>( } } +/// This is target-word-size dependent. And it excludes booleans and reftypes. +pub(crate) fn is_valid_atomic_transaction_ty(ty: Type) -> bool { + match ty { + I8 | I16 | I32 | I64 => true, + _ => false, + } +} + +fn load_op_to_ty(op: Opcode) -> Option { + match op { + Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => Some(I8), + Opcode::Sload16 | Opcode::Uload16 | Opcode::Sload16Complex | Opcode::Uload16Complex => { + Some(I16) + } + Opcode::Sload32 | Opcode::Uload32 | Opcode::Sload32Complex | Opcode::Uload32Complex => { + Some(I32) + } + Opcode::Load | Opcode::LoadComplex => None, + Opcode::Sload8x8 | Opcode::Uload8x8 => Some(I8X8), + Opcode::Sload16x4 | Opcode::Uload16x4 => Some(I16X4), + Opcode::Sload32x2 | Opcode::Uload32x2 => Some(I32X2), + _ => None, + } +} + +/// Helper to lower a load instruction; this is used in several places, because +/// a load can sometimes be merged into another operation. 
+pub(crate) fn lower_load, F: FnMut(&mut C, Writable, Type, AMode)>( + ctx: &mut C, + ir_inst: IRInst, + inputs: &[InsnInput], + output: InsnOutput, + mut f: F, +) { + let op = ctx.data(ir_inst).opcode(); + + let elem_ty = load_op_to_ty(op).unwrap_or_else(|| ctx.output_ty(ir_inst, 0)); + + let off = ctx.data(ir_inst).load_store_offset().unwrap(); + let mem = lower_address(ctx, elem_ty, &inputs[..], off); + let rd = get_output_reg(ctx, output); + + f(ctx, rd, elem_ty, mem); +} + //============================================================================= // Lowering-backend trait implementation. diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 8b72bf59e6c4..4e0991cee0d4 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -17,30 +17,17 @@ use regalloc::{RegClass, Writable}; use alloc::boxed::Box; use alloc::vec::Vec; use core::convert::TryFrom; -use smallvec::SmallVec; use super::lower::*; -/// This is target-word-size dependent. And it excludes booleans and reftypes. -fn is_valid_atomic_transaction_ty(ty: Type) -> bool { - match ty { - I8 | I16 | I32 | I64 => true, - _ => false, - } -} - /// Actually codegen an instruction's results into registers. 
pub(crate) fn lower_insn_to_regs>( ctx: &mut C, insn: IRInst, ) -> CodegenResult<()> { let op = ctx.data(insn).opcode(); - let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) - .map(|i| InsnInput { insn, input: i }) - .collect(); - let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) - .map(|i| InsnOutput { insn, output: i }) - .collect(); + let inputs = insn_inputs(ctx, insn); + let outputs = insn_outputs(ctx, insn); let ty = if outputs.len() > 0 { Some(ctx.output_ty(insn, 0)) } else { @@ -1128,25 +1115,6 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Uload16x4 | Opcode::Sload32x2 | Opcode::Uload32x2 => { - let off = ctx.data(insn).load_store_offset().unwrap(); - let elem_ty = match op { - Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { - I8 - } - Opcode::Sload16 - | Opcode::Uload16 - | Opcode::Sload16Complex - | Opcode::Uload16Complex => I16, - Opcode::Sload32 - | Opcode::Uload32 - | Opcode::Sload32Complex - | Opcode::Uload32Complex => I32, - Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), - Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8, - Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4, - Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2, - _ => unreachable!(), - }; let sign_extend = match op { Opcode::Sload8 | Opcode::Sload8Complex @@ -1156,65 +1124,52 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Sload32Complex => true, _ => false, }; - let is_float = ty_has_float_or_vec_representation(elem_ty); - let mem = lower_address(ctx, elem_ty, &inputs[..], off); - let rd = get_output_reg(ctx, outputs[0]); - - ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) { - (1, _, _) => Inst::ULoad8 { rd, mem }, - (8, false, _) => Inst::ULoad8 { rd, mem }, - (8, true, _) => Inst::SLoad8 { rd, mem }, - (16, false, _) => Inst::ULoad16 { rd, mem }, - (16, true, _) => Inst::SLoad16 { rd, mem }, - (32, false, false) => Inst::ULoad32 { rd, mem }, - (32, true, false) => Inst::SLoad32 { rd, mem }, - (32, _, 
true) => Inst::FpuLoad32 { rd, mem }, - (64, _, false) => Inst::ULoad64 { rd, mem }, - // Note that we treat some of the vector loads as scalar floating-point loads, - // which is correct in a little endian environment. - (64, _, true) => Inst::FpuLoad64 { rd, mem }, - (128, _, _) => Inst::FpuLoad128 { rd, mem }, - _ => panic!("Unsupported size in load"), - }); - - let vec_extend = match op { - Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8), - Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8), - Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16), - Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16), - Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32), - Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32), - _ => None, - }; - - if let Some(t) = vec_extend { - ctx.emit(Inst::VecExtend { - t, - rd, - rn: rd.to_reg(), - high_half: false, - }); - } - } + lower_load( + ctx, + insn, + &inputs[..], + outputs[0], + |ctx, rd, elem_ty, mem| { + let is_float = ty_has_float_or_vec_representation(elem_ty); + ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) { + (1, _, _) => Inst::ULoad8 { rd, mem }, + (8, false, _) => Inst::ULoad8 { rd, mem }, + (8, true, _) => Inst::SLoad8 { rd, mem }, + (16, false, _) => Inst::ULoad16 { rd, mem }, + (16, true, _) => Inst::SLoad16 { rd, mem }, + (32, false, false) => Inst::ULoad32 { rd, mem }, + (32, true, false) => Inst::SLoad32 { rd, mem }, + (32, _, true) => Inst::FpuLoad32 { rd, mem }, + (64, _, false) => Inst::ULoad64 { rd, mem }, + // Note that we treat some of the vector loads as scalar floating-point loads, + // which is correct in a little endian environment. 
+ (64, _, true) => Inst::FpuLoad64 { rd, mem }, + (128, _, _) => Inst::FpuLoad128 { rd, mem }, + _ => panic!("Unsupported size in load"), + }); - /* - Opcode::LoadSplat => { - let off = ctx.data(insn).load_store_offset().unwrap(); - let ty = ty.unwrap(); - let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off); - let rd = get_output_reg(ctx, outputs[0]); - let size = VectorSize::from_ty(ty); - let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let vec_extend = match op { + Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8), + Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8), + Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16), + Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16), + Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32), + Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32), + _ => None, + }; - ctx.emit(Inst::LoadAddr { rd: tmp, mem }); - ctx.emit(Inst::VecLoadReplicate { - rd, - rn: tmp.to_reg(), - size, - }); + if let Some(t) = vec_extend { + ctx.emit(Inst::VecExtend { + t, + rd, + rn: rd.to_reg(), + high_half: false, + }); + } + }, + ); } - */ Opcode::Store | Opcode::Istore8 @@ -1998,6 +1953,40 @@ pub(crate) fn lower_insn_to_regs>( maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce) { lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size); + } else if let Some((_, insn)) = maybe_input_insn_multi( + ctx, + inputs[0], + &[ + Opcode::Uload8, + Opcode::Sload8, + Opcode::Uload16, + Opcode::Sload16, + Opcode::Uload32, + Opcode::Sload32, + Opcode::Load, + ], + ) { + ctx.sink_inst(insn); + let load_inputs = insn_inputs(ctx, insn); + let load_outputs = insn_outputs(ctx, insn); + lower_load( + ctx, + insn, + &load_inputs[..], + load_outputs[0], + |ctx, _rd, _elem_ty, mem| { + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem); + if let Some(addr_inst) = addr_inst { + ctx.emit(addr_inst); + } + ctx.emit(Inst::VecLoadReplicate { + rd, + rn: addr, + size, + }); + }, + ); } else { let input_ty = 
ctx.input_ty(insn, 0); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index af4bf020d809..614ae5b76888 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -3355,21 +3355,13 @@ fn lower_insn_to_regs>( } } - Opcode::Splat | Opcode::LoadSplat => { + Opcode::Splat => { let ty = ty.unwrap(); assert_eq!(ty.bits(), 128); let src_ty = ctx.input_ty(insn, 0); assert!(src_ty.bits() < 128); - let src = match op { - Opcode::Splat => input_to_reg_mem(ctx, inputs[0]), - Opcode::LoadSplat => { - let offset = ctx.data(insn).load_store_offset().unwrap(); - let amode = lower_to_amode(ctx, inputs[0], offset); - RegMem::mem(amode) - } - _ => unreachable!(), - }; + let src = input_to_reg_mem(ctx, inputs[0]); let dst = get_output_reg(ctx, outputs[0]); // We know that splat will overwrite all of the lanes of `dst` but it takes several diff --git a/cranelift/codegen/src/machinst/inst_common.rs b/cranelift/codegen/src/machinst/inst_common.rs index 1ff6faedd4bd..ff2c2ae73731 100644 --- a/cranelift/codegen/src/machinst/inst_common.rs +++ b/cranelift/codegen/src/machinst/inst_common.rs @@ -1,6 +1,8 @@ //! A place to park MachInst::Inst fragments which are common across multiple architectures. +use super::{LowerCtx, VCodeInst}; use crate::ir::{self, Inst as IRInst}; +use smallvec::SmallVec; //============================================================================ // Instruction input "slots". 
@@ -22,6 +24,24 @@ pub(crate) struct InsnOutput { pub(crate) output: usize, } +pub(crate) fn insn_inputs>( + ctx: &C, + insn: IRInst, +) -> SmallVec<[InsnInput; 4]> { + (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect() +} + +pub(crate) fn insn_outputs>( + ctx: &C, + insn: IRInst, +) -> SmallVec<[InsnOutput; 4]> { + (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect() +} + //============================================================================ // Atomic instructions. diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index eb7ab89511ce..0d9dba658b56 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -147,9 +147,10 @@ pub trait LowerCtx { /// Emit a machine instruction that is a safepoint. fn emit_safepoint(&mut self, mach_inst: Self::I); /// Indicate that the side-effect of an instruction has been sunk to the - /// current scan location. This can only be done to an instruction with no - /// uses of its result register(s), because it will cause the instruction - /// not to be codegen'd at its original location. + /// current scan location. This should only be done when the instruction's + /// original results are not used (i.e., `put_input_in_reg` is not invoked + /// for the input produced by the sunk instruction), otherwise the + /// side-effect will occur twice. fn sink_inst(&mut self, ir_inst: Inst); /// Retrieve constant data given a handle. 
fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData; diff --git a/cranelift/filetests/filetests/isa/aarch64/simd.clif b/cranelift/filetests/filetests/isa/aarch64/simd.clif index 3e47bbbda2e2..2b85114211d7 100644 --- a/cranelift/filetests/filetests/isa/aarch64/simd.clif +++ b/cranelift/filetests/filetests/isa/aarch64/simd.clif @@ -47,3 +47,69 @@ block0: ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret + +function %f4(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ld1r { v0.16b }, [x0] +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f5(i64, i64) -> i8x16, i8x16 { +block0(v0: i64, v1: i64): + v2 = load.i8 v0 + v3 = load.i8 v1 + v4 = splat.i8x16 v2 + v5 = splat.i8x16 v3 + return v4, v5 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ld1r { v0.16b }, [x0] +; nextln: ld1r { v1.16b }, [x1] +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f6(i64, i64) -> i8x16, i8x16 { +block0(v0: i64, v1: i64): + v2 = load.i8 v0 + v3 = load.i8 v1 + v4 = splat.i8x16 v3 + v5 = splat.i8x16 v2 + return v4, v5 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldrb w0, [x0] +; nextln: ld1r { v0.16b }, [x1] +; nextln: dup v1.16b, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f7(i64, i64) -> i8x16, i8x16 { +block0(v0: i64, v1: i64): + v2 = load.i8 v0 + v3 = splat.i8x16 v2 + v4 = splat.i8x16 v2 + return v3, v4 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldrb w0, [x0] +; nextln: dup v0.16b, w0 +; nextln: dup v1.16b, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret