From 9249851bc658f76d709602b7976f138fc8bff7cc Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Fri, 6 Nov 2020 16:12:49 -0800
Subject: [PATCH] AArch64 SIMD: pattern-match load+splat into `LD1R` instruction.

---
 .../codegen/src/isa/aarch64/inst/args.rs      |  13 +
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  11 +
 .../codegen/src/isa/aarch64/lower_inst.rs     | 224 ++++++++++--------
 cranelift/codegen/src/machinst/inst_common.rs |  20 ++
 .../filetests/filetests/isa/aarch64/simd.clif |  49 ++++
 5 files changed, 220 insertions(+), 97 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 7bd181c86b03..66f0d071d48c 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -209,6 +209,19 @@ impl AMode {
     pub fn label(label: MemLabel) -> AMode {
         AMode::Label(label)
     }
+
+    /// Does the address resolve to just a register value, with no offset or
+    /// other computation?
+    pub fn is_reg(&self) -> Option<Reg> {
+        match self {
+            &AMode::UnsignedOffset(r, uimm12) if uimm12.value() == 0 => Some(r),
+            &AMode::Unscaled(r, imm9) if imm9.value() == 0 => Some(r),
+            &AMode::RegOffset(r, off, _) if off == 0 => Some(r),
+            &AMode::FPOffset(off, _) if off == 0 => Some(fp_reg()),
+            &AMode::SPOffset(off, _) if off == 0 => Some(stack_reg()),
+            _ => None,
+        }
+    }
 }
 
 /// A memory argument to a load/store-pair.
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index d99f1b45d045..0a0d70c804da 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -1557,6 +1557,17 @@ impl Inst {
             }
         }
     }
+
+    /// Generate a LoadAddr instruction (load address of an amode into
+    /// register). Elides when possible (when amode is just a register). Returns
+    /// destination register: either `rd` or a register directly from the amode.
+    pub fn gen_load_addr(rd: Writable<Reg>, mem: AMode) -> (Reg, Option<Inst>) {
+        if let Some(r) = mem.is_reg() {
+            (r, None)
+        } else {
+            (rd.to_reg(), Some(Inst::LoadAddr { rd, mem }))
+        }
+    }
 }
 
 //=============================================================================
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 860166e88733..b1e7cd214840 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -4,7 +4,7 @@ use crate::binemit::CodeOffset;
 use crate::ir::condcodes::FloatCC;
 use crate::ir::types::*;
 use crate::ir::Inst as IRInst;
-use crate::ir::{InstructionData, Opcode, TrapCode};
+use crate::ir::{InstructionData, Opcode, SourceLoc, TrapCode};
 use crate::machinst::lower::*;
 use crate::machinst::*;
 use crate::{CodegenError, CodegenResult};
@@ -12,12 +12,11 @@ use crate::{CodegenError, CodegenResult};
 use crate::isa::aarch64::abi::*;
 use crate::isa::aarch64::inst::*;
 
-use regalloc::{RegClass, Writable};
+use regalloc::{Reg, RegClass, Writable};
 
 use alloc::boxed::Box;
 use alloc::vec::Vec;
 use core::convert::TryFrom;
-use smallvec::SmallVec;
 
 use super::lower::*;
 
@@ -29,18 +28,60 @@ fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
     }
 }
 
+fn load_op_to_ty(op: Opcode) -> Option<Type> {
+    match op {
+        Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => Some(I8),
+        Opcode::Sload16 | Opcode::Uload16 | Opcode::Sload16Complex | Opcode::Uload16Complex => {
+            Some(I16)
+        }
+        Opcode::Sload32 | Opcode::Uload32 | Opcode::Sload32Complex | Opcode::Uload32Complex => {
+            Some(I32)
+        }
+        Opcode::Load | Opcode::LoadComplex => None,
+        Opcode::Sload8x8 | Opcode::Uload8x8 => Some(I8X8),
+        Opcode::Sload16x4 | Opcode::Uload16x4 => Some(I16X4),
+        Opcode::Sload32x2 | Opcode::Uload32x2 => Some(I32X2),
+        _ => None,
+    }
+}
+
+/// Helper to lower a load instruction; this is used in several places, because
+/// a load can sometimes be merged into another operation.
+fn lower_load<
+    C: LowerCtx<I = Inst>,
+    F: FnMut(&mut C, Writable<Reg>, Type, AMode, Option<SourceLoc>),
+>(
+    ctx: &mut C,
+    ir_inst: IRInst,
+    inputs: &[InsnInput],
+    output: InsnOutput,
+    mut f: F,
+) {
+    let op = ctx.data(ir_inst).opcode();
+
+    let elem_ty = load_op_to_ty(op).unwrap_or_else(|| ctx.output_ty(ir_inst, 0));
+
+    let off = ctx.data(ir_inst).load_store_offset().unwrap();
+    let mem = lower_address(ctx, elem_ty, &inputs[..], off);
+    let rd = get_output_reg(ctx, output);
+    let memflags = ctx.memflags(ir_inst).expect("memory flags");
+    let srcloc = if !memflags.notrap() {
+        Some(ctx.srcloc(ir_inst))
+    } else {
+        None
+    };
+
+    f(ctx, rd, elem_ty, mem, srcloc);
+}
+
 /// Actually codegen an instruction's results into registers.
 pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     insn: IRInst,
 ) -> CodegenResult<()> {
     let op = ctx.data(insn).opcode();
-    let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
-        .map(|i| InsnInput { insn, input: i })
-        .collect();
-    let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
-        .map(|i| InsnOutput { insn, output: i })
-        .collect();
+    let inputs = insn_inputs(ctx, insn);
+    let outputs = insn_outputs(ctx, insn);
     let ty = if outputs.len() > 0 {
         Some(ctx.output_ty(insn, 0))
     } else {
@@ -1128,25 +1169,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Uload16x4
         | Opcode::Sload32x2
         | Opcode::Uload32x2 => {
-            let off = ctx.data(insn).load_store_offset().unwrap();
-            let elem_ty = match op {
-                Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
-                    I8
-                }
-                Opcode::Sload16
-                | Opcode::Uload16
-                | Opcode::Sload16Complex
-                | Opcode::Uload16Complex => I16,
-                Opcode::Sload32
-                | Opcode::Uload32
-                | Opcode::Sload32Complex
-                | Opcode::Uload32Complex => I32,
-                Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
-                Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8,
-                Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4,
-                Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2,
-                _ => unreachable!(),
-            };
             let sign_extend = match op {
                 Opcode::Sload8
                 | Opcode::Sload8Complex
@@ -1156,79 +1178,52 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 | Opcode::Sload32Complex => true,
                 _ => false,
             };
 
-            let is_float = ty_has_float_or_vec_representation(elem_ty);
-
-            let mem = lower_address(ctx, elem_ty, &inputs[..], off);
-            let rd = get_output_reg(ctx, outputs[0]);
-
-            let memflags = ctx.memflags(insn).expect("memory flags");
-            let srcloc = if !memflags.notrap() {
-                Some(ctx.srcloc(insn))
-            } else {
-                None
-            };
-            ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
-                (1, _, _) => Inst::ULoad8 { rd, mem, srcloc },
-                (8, false, _) => Inst::ULoad8 { rd, mem, srcloc },
-                (8, true, _) => Inst::SLoad8 { rd, mem, srcloc },
-                (16, false, _) => Inst::ULoad16 { rd, mem, srcloc },
-                (16, true, _) => Inst::SLoad16 { rd, mem, srcloc },
-                (32, false, false) => Inst::ULoad32 { rd, mem, srcloc },
-                (32, true, false) => Inst::SLoad32 { rd, mem, srcloc },
-                (32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc },
-                (64, _, false) => Inst::ULoad64 { rd, mem, srcloc },
-                // Note that we treat some of the vector loads as scalar floating-point loads,
-                // which is correct in a little endian environment.
-                (64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc },
-                (128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc },
-                _ => panic!("Unsupported size in load"),
-            });
-
-            let vec_extend = match op {
-                Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
-                Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
-                Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
-                Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
-                Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
-                Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
-                _ => None,
-            };
-
-            if let Some(t) = vec_extend {
-                ctx.emit(Inst::VecExtend {
-                    t,
-                    rd,
-                    rn: rd.to_reg(),
-                    high_half: false,
-                });
-            }
-        }
+            lower_load(
+                ctx,
+                insn,
+                &inputs[..],
+                outputs[0],
+                |ctx, rd, elem_ty, mem, srcloc| {
+                    let is_float = ty_has_float_or_vec_representation(elem_ty);
+                    ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
+                        (1, _, _) => Inst::ULoad8 { rd, mem, srcloc },
+                        (8, false, _) => Inst::ULoad8 { rd, mem, srcloc },
+                        (8, true, _) => Inst::SLoad8 { rd, mem, srcloc },
+                        (16, false, _) => Inst::ULoad16 { rd, mem, srcloc },
+                        (16, true, _) => Inst::SLoad16 { rd, mem, srcloc },
+                        (32, false, false) => Inst::ULoad32 { rd, mem, srcloc },
+                        (32, true, false) => Inst::SLoad32 { rd, mem, srcloc },
+                        (32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc },
+                        (64, _, false) => Inst::ULoad64 { rd, mem, srcloc },
+                        // Note that we treat some of the vector loads as scalar floating-point loads,
+                        // which is correct in a little endian environment.
+                        (64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc },
+                        (128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc },
+                        _ => panic!("Unsupported size in load"),
+                    });
 
-        /*
-        Opcode::LoadSplat => {
-            let off = ctx.data(insn).load_store_offset().unwrap();
-            let ty = ty.unwrap();
-            let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
-            let memflags = ctx.memflags(insn).expect("memory flags");
-            let rd = get_output_reg(ctx, outputs[0]);
-            let size = VectorSize::from_ty(ty);
-            let srcloc = if memflags.notrap() {
-                None
-            } else {
-                Some(ctx.srcloc(insn))
-            };
-            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+                    let vec_extend = match op {
+                        Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
+                        Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
+                        Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
+                        Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
+                        Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
+                        Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
+                        _ => None,
+                    };
 
-            ctx.emit(Inst::LoadAddr { rd: tmp, mem });
-            ctx.emit(Inst::VecLoadReplicate {
-                rd,
-                rn: tmp.to_reg(),
-                size,
-                srcloc,
-            });
+                    if let Some(t) = vec_extend {
+                        ctx.emit(Inst::VecExtend {
+                            t,
+                            rd,
+                            rn: rd.to_reg(),
+                            high_half: false,
+                        });
+                    }
+                },
+            );
         }
-        */
 
         Opcode::Store
         | Opcode::Istore8
@@ -2057,6 +2052,41 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
             {
                 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
+            } else if let Some((_, insn)) = maybe_input_insn_multi(
+                ctx,
+                inputs[0],
+                &[
+                    Opcode::Uload8,
+                    Opcode::Sload8,
+                    Opcode::Uload16,
+                    Opcode::Sload16,
+                    Opcode::Uload32,
+                    Opcode::Sload32,
+                    Opcode::Load,
+                ],
+            ) {
+                ctx.sink_inst(insn);
+                let load_inputs = insn_inputs(ctx, insn);
+                let load_outputs = insn_outputs(ctx, insn);
+                lower_load(
+                    ctx,
+                    insn,
+                    &load_inputs[..],
+                    load_outputs[0],
+                    |ctx, _rd, _elem_ty, mem, srcloc| {
+                        let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+                        let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
+                        if let Some(addr_inst) = addr_inst {
+                            ctx.emit(addr_inst);
+                        }
+                        ctx.emit(Inst::VecLoadReplicate {
+                            rd,
+                            rn: addr,
+                            size,
+                            srcloc,
+                        });
+                    },
+                );
             } else {
                 let input_ty = ctx.input_ty(insn, 0);
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
diff --git a/cranelift/codegen/src/machinst/inst_common.rs b/cranelift/codegen/src/machinst/inst_common.rs
index 1ff6faedd4bd..ff2c2ae73731 100644
--- a/cranelift/codegen/src/machinst/inst_common.rs
+++ b/cranelift/codegen/src/machinst/inst_common.rs
@@ -1,6 +1,8 @@
 //! A place to park MachInst::Inst fragments which are common across multiple architectures.
 
+use super::{LowerCtx, VCodeInst};
 use crate::ir::{self, Inst as IRInst};
+use smallvec::SmallVec;
 
 //============================================================================
 // Instruction input "slots".
@@ -22,6 +24,24 @@ pub(crate) struct InsnOutput {
     pub(crate) output: usize,
 }
 
+pub(crate) fn insn_inputs<I: VCodeInst, C: LowerCtx<I = I>>(
+    ctx: &C,
+    insn: IRInst,
+) -> SmallVec<[InsnInput; 4]> {
+    (0..ctx.num_inputs(insn))
+        .map(|i| InsnInput { insn, input: i })
+        .collect()
+}
+
+pub(crate) fn insn_outputs<I: VCodeInst, C: LowerCtx<I = I>>(
+    ctx: &C,
+    insn: IRInst,
+) -> SmallVec<[InsnOutput; 4]> {
+    (0..ctx.num_outputs(insn))
+        .map(|i| InsnOutput { insn, output: i })
+        .collect()
+}
+
 //============================================================================
 // Atomic instructions.
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd.clif b/cranelift/filetests/filetests/isa/aarch64/simd.clif
index 3e47bbbda2e2..f6a3a2334776 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd.clif
@@ -47,3 +47,52 @@ block0:
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f4(i64) -> i8x16 {
+block0(v0: i64):
+    v1 = load.i8 v0
+    v2 = splat.i8x16 v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ld1r { v0.16b }, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f5(i64, i64) -> i8x16, i8x16 {
+block0(v0: i64, v1: i64):
+    v2 = load.i8 v0
+    v3 = load.i8 v1
+    v4 = splat.i8x16 v2
+    v5 = splat.i8x16 v3
+    return v4, v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ld1r { v0.16b }, [x0]
+; nextln: ld1r { v1.16b }, [x1]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f6(i64, i64) -> i8x16, i8x16 {
+block0(v0: i64, v1: i64):
+    v2 = load.i8 v0
+    v3 = load.i8 v1
+    v4 = splat.i8x16 v3
+    v5 = splat.i8x16 v2
+    return v4, v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldrb w0, [x0]
+; nextln: ld1r { v0.16b }, [x1]
+; nextln: dup v1.16b, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret