AArch64 SIMD: pattern-match load+splat into LD1R instruction.
cfallin committed Nov 7, 2020
1 parent b0cb320 commit 9249851
Showing 5 changed files with 220 additions and 97 deletions.
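For context, an illustrative example (not part of the commit; registers and types chosen arbitrarily): a scalar load feeding a `splat`, such as `v1 = load.i32 p` followed by `v2 = splat.i32x4 v1`, previously lowered to something like `ldr w1, [x0]` plus `dup v2.4s, w1`. With this change the load is sunk into the splat and the pair becomes a single replicating load, `ld1r { v2.4s }, [x0]`.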
13 changes: 13 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -209,6 +209,19 @@ impl AMode {
pub fn label(label: MemLabel) -> AMode {
AMode::Label(label)
}

/// Does the address resolve to just a register value, with no offset or
/// other computation?
pub fn is_reg(&self) -> Option<Reg> {
match self {
&AMode::UnsignedOffset(r, uimm12) if uimm12.value() == 0 => Some(r),
&AMode::Unscaled(r, imm9) if imm9.value() == 0 => Some(r),
&AMode::RegOffset(r, off, _) if off == 0 => Some(r),
&AMode::FPOffset(off, _) if off == 0 => Some(fp_reg()),
&AMode::SPOffset(off, _) if off == 0 => Some(stack_reg()),
_ => None,
}
}
}

/// A memory argument to a load/store-pair.
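A small usage sketch of the new helper, using only shapes visible in the match arms above (the `I64` access type here is illustrative, and the example assumes `Reg` supports equality/debug formatting as used elsewhere in the backend):

    // FP-relative with a zero offset is "just a register": no address arithmetic needed.
    assert_eq!(AMode::FPOffset(0, I64).is_reg(), Some(fp_reg()));
    // A non-zero offset still requires materializing the address into a register.
    assert_eq!(AMode::FPOffset(16, I64).is_reg(), None);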
11 changes: 11 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -1557,6 +1557,17 @@ impl Inst {
}
}
}

/// Generate a LoadAddr instruction (load the address of an amode into a
/// register). Elides the instruction when possible (when the amode is just a register). Returns the
/// destination register: either `rd` or a register directly from the amode.
pub fn gen_load_addr(rd: Writable<Reg>, mem: AMode) -> (Reg, Option<Inst>) {
if let Some(r) = mem.is_reg() {
(r, None)
} else {
(rd.to_reg(), Some(Inst::LoadAddr { rd, mem }))
}
}
}

//=============================================================================
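A usage sketch mirroring the call site added in lower_inst.rs below; the temporary is written only when the address actually needs to be computed:

    let tmp = ctx.alloc_tmp(RegClass::I64, I64);
    let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
    if let Some(addr_inst) = addr_inst {
        // Emitted only when `mem` is not already a plain base register.
        ctx.emit(addr_inst);
    }
    // `addr` now names a register holding the effective address of `mem`.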
224 changes: 127 additions & 97 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -4,20 +4,19 @@ use crate::binemit::CodeOffset;
use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::ir::{InstructionData, Opcode, SourceLoc, TrapCode};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::{CodegenError, CodegenResult};

use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;

use regalloc::{RegClass, Writable};
use regalloc::{Reg, RegClass, Writable};

use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;
use smallvec::SmallVec;

use super::lower::*;

@@ -29,18 +28,60 @@ fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
}
}

fn load_op_to_ty(op: Opcode) -> Option<Type> {
match op {
Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => Some(I8),
Opcode::Sload16 | Opcode::Uload16 | Opcode::Sload16Complex | Opcode::Uload16Complex => {
Some(I16)
}
Opcode::Sload32 | Opcode::Uload32 | Opcode::Sload32Complex | Opcode::Uload32Complex => {
Some(I32)
}
Opcode::Load | Opcode::LoadComplex => None,
Opcode::Sload8x8 | Opcode::Uload8x8 => Some(I8X8),
Opcode::Sload16x4 | Opcode::Uload16x4 => Some(I16X4),
Opcode::Sload32x2 | Opcode::Uload32x2 => Some(I32X2),
_ => None,
}
}

/// Helper to lower a load instruction; this is used in several places, because
/// a load can sometimes be merged into another operation.
fn lower_load<
C: LowerCtx<I = Inst>,
F: FnMut(&mut C, Writable<Reg>, Type, AMode, Option<SourceLoc>),
>(
ctx: &mut C,
ir_inst: IRInst,
inputs: &[InsnInput],
output: InsnOutput,
mut f: F,
) {
let op = ctx.data(ir_inst).opcode();

let elem_ty = load_op_to_ty(op).unwrap_or_else(|| ctx.output_ty(ir_inst, 0));

let off = ctx.data(ir_inst).load_store_offset().unwrap();
let mem = lower_address(ctx, elem_ty, &inputs[..], off);
let rd = get_output_reg(ctx, output);
let memflags = ctx.memflags(ir_inst).expect("memory flags");
let srcloc = if !memflags.notrap() {
Some(ctx.srcloc(ir_inst))
} else {
None
};

f(ctx, rd, elem_ty, mem, srcloc);
}
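For illustration, a minimal caller (the real call sites appear further down): the closure receives the destination register, element type, resolved address, and optional source location, and picks the concrete load instruction. The body shown is hypothetical.

    lower_load(ctx, insn, &inputs[..], outputs[0], |ctx, rd, _elem_ty, mem, srcloc| {
        // A hypothetical lowering that always emits a plain 64-bit load.
        ctx.emit(Inst::ULoad64 { rd, mem, srcloc });
    });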

/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
) -> CodegenResult<()> {
let op = ctx.data(insn).opcode();
let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
.map(|i| InsnInput { insn, input: i })
.collect();
let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
.map(|i| InsnOutput { insn, output: i })
.collect();
let inputs = insn_inputs(ctx, insn);
let outputs = insn_outputs(ctx, insn);
let ty = if outputs.len() > 0 {
Some(ctx.output_ty(insn, 0))
} else {
@@ -1128,25 +1169,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Uload16x4
| Opcode::Sload32x2
| Opcode::Uload32x2 => {
let off = ctx.data(insn).load_store_offset().unwrap();
let elem_ty = match op {
Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
I8
}
Opcode::Sload16
| Opcode::Uload16
| Opcode::Sload16Complex
| Opcode::Uload16Complex => I16,
Opcode::Sload32
| Opcode::Uload32
| Opcode::Sload32Complex
| Opcode::Uload32Complex => I32,
Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8,
Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4,
Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2,
_ => unreachable!(),
};
let sign_extend = match op {
Opcode::Sload8
| Opcode::Sload8Complex
@@ -1156,79 +1178,52 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Sload32Complex => true,
_ => false,
};
let is_float = ty_has_float_or_vec_representation(elem_ty);

let mem = lower_address(ctx, elem_ty, &inputs[..], off);
let rd = get_output_reg(ctx, outputs[0]);

let memflags = ctx.memflags(insn).expect("memory flags");
let srcloc = if !memflags.notrap() {
Some(ctx.srcloc(insn))
} else {
None
};

ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
(1, _, _) => Inst::ULoad8 { rd, mem, srcloc },
(8, false, _) => Inst::ULoad8 { rd, mem, srcloc },
(8, true, _) => Inst::SLoad8 { rd, mem, srcloc },
(16, false, _) => Inst::ULoad16 { rd, mem, srcloc },
(16, true, _) => Inst::SLoad16 { rd, mem, srcloc },
(32, false, false) => Inst::ULoad32 { rd, mem, srcloc },
(32, true, false) => Inst::SLoad32 { rd, mem, srcloc },
(32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc },
(64, _, false) => Inst::ULoad64 { rd, mem, srcloc },
// Note that we treat some of the vector loads as scalar floating-point loads,
// which is correct in a little endian environment.
(64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc },
(128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc },
_ => panic!("Unsupported size in load"),
});

let vec_extend = match op {
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
_ => None,
};

if let Some(t) = vec_extend {
ctx.emit(Inst::VecExtend {
t,
rd,
rn: rd.to_reg(),
high_half: false,
});
}
}
lower_load(
ctx,
insn,
&inputs[..],
outputs[0],
|ctx, rd, elem_ty, mem, srcloc| {
let is_float = ty_has_float_or_vec_representation(elem_ty);
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
(1, _, _) => Inst::ULoad8 { rd, mem, srcloc },
(8, false, _) => Inst::ULoad8 { rd, mem, srcloc },
(8, true, _) => Inst::SLoad8 { rd, mem, srcloc },
(16, false, _) => Inst::ULoad16 { rd, mem, srcloc },
(16, true, _) => Inst::SLoad16 { rd, mem, srcloc },
(32, false, false) => Inst::ULoad32 { rd, mem, srcloc },
(32, true, false) => Inst::SLoad32 { rd, mem, srcloc },
(32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc },
(64, _, false) => Inst::ULoad64 { rd, mem, srcloc },
// Note that we treat some of the vector loads as scalar floating-point loads,
// which is correct in a little endian environment.
(64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc },
(128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc },
_ => panic!("Unsupported size in load"),
});

let vec_extend = match op {
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
_ => None,
};

if let Some(t) = vec_extend {
ctx.emit(Inst::VecExtend {
t,
rd,
rn: rd.to_reg(),
high_half: false,
});
}
},
);
}

/*
Opcode::LoadSplat => {
let off = ctx.data(insn).load_store_offset().unwrap();
let ty = ty.unwrap();
let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
let memflags = ctx.memflags(insn).expect("memory flags");
let rd = get_output_reg(ctx, outputs[0]);
let size = VectorSize::from_ty(ty);
let srcloc = if memflags.notrap() {
None
} else {
Some(ctx.srcloc(insn))
};
let tmp = ctx.alloc_tmp(RegClass::I64, I64);

ctx.emit(Inst::LoadAddr { rd: tmp, mem });
ctx.emit(Inst::VecLoadReplicate {
rd,
rn: tmp.to_reg(),
size,
srcloc,
});
}
*/

Opcode::Store
| Opcode::Istore8
@@ -2057,6 +2052,41 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
{
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
inputs[0],
&[
Opcode::Uload8,
Opcode::Sload8,
Opcode::Uload16,
Opcode::Sload16,
Opcode::Uload32,
Opcode::Sload32,
Opcode::Load,
],
) {
ctx.sink_inst(insn);
let load_inputs = insn_inputs(ctx, insn);
let load_outputs = insn_outputs(ctx, insn);
lower_load(
ctx,
insn,
&load_inputs[..],
load_outputs[0],
|ctx, _rd, _elem_ty, mem, srcloc| {
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
if let Some(addr_inst) = addr_inst {
ctx.emit(addr_inst);
}
ctx.emit(Inst::VecLoadReplicate {
rd,
rn: addr,
size,
srcloc,
});
},
);
} else {
let input_ty = ctx.input_ty(insn, 0);
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
20 changes: 20 additions & 0 deletions cranelift/codegen/src/machinst/inst_common.rs
@@ -1,6 +1,8 @@
//! A place to park MachInst::Inst fragments which are common across multiple architectures.
use super::{LowerCtx, VCodeInst};
use crate::ir::{self, Inst as IRInst};
use smallvec::SmallVec;

//============================================================================
// Instruction input "slots".
@@ -22,6 +24,24 @@ pub(crate) struct InsnOutput {
pub(crate) output: usize,
}

pub(crate) fn insn_inputs<I: VCodeInst, C: LowerCtx<I = I>>(
ctx: &C,
insn: IRInst,
) -> SmallVec<[InsnInput; 4]> {
(0..ctx.num_inputs(insn))
.map(|i| InsnInput { insn, input: i })
.collect()
}

pub(crate) fn insn_outputs<I: VCodeInst, C: LowerCtx<I = I>>(
ctx: &C,
insn: IRInst,
) -> SmallVec<[InsnOutput; 4]> {
(0..ctx.num_outputs(insn))
.map(|i| InsnOutput { insn, output: i })
.collect()
}

//============================================================================
// Atomic instructions.
