From 48c77b744820495957b78115a6399bf92b32a31c Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Fri, 15 Jul 2022 18:39:25 +0100 Subject: [PATCH] cranelift: Implement scalar FMA on x86 x86 does not have dedicated instructions for scalar FMA, so lower it to a libcall, which seems to be what LLVM does. --- cranelift/codegen/src/ir/libcall.rs | 65 ++++++++++++++++++- cranelift/codegen/src/isa/aarch64/lower.rs | 2 +- .../codegen/src/isa/aarch64/lower/isle.rs | 14 +++- .../codegen/src/isa/aarch64/lower_inst.rs | 4 +- cranelift/codegen/src/isa/s390x/lower.rs | 12 +++- cranelift/codegen/src/isa/s390x/lower/isle.rs | 27 ++++++-- cranelift/codegen/src/isa/x64/inst.isle | 11 ++++ cranelift/codegen/src/isa/x64/lower.isle | 8 +++ cranelift/codegen/src/isa/x64/lower.rs | 50 ++++---------- cranelift/codegen/src/isa/x64/lower/isle.rs | 35 +++++++++- cranelift/codegen/src/machinst/isle.rs | 4 ++ .../filetests/filetests/runtests/fma.clif | 1 + cranelift/module/src/lib.rs | 2 + 13 files changed, 181 insertions(+), 54 deletions(-) diff --git a/cranelift/codegen/src/ir/libcall.rs b/cranelift/codegen/src/ir/libcall.rs index 5dbbd7232df5..5735c851dc3f 100644 --- a/cranelift/codegen/src/ir/libcall.rs +++ b/cranelift/codegen/src/ir/libcall.rs @@ -1,6 +1,9 @@ //! Naming well-known routines in the runtime library. 
-use crate::ir::{types, ExternalName, FuncRef, Function, Opcode, Type}; +use crate::ir::{ + types, AbiParam, ArgumentPurpose, ExternalName, FuncRef, Function, Opcode, Signature, Type, +}; +use crate::isa::CallConv; use core::fmt; use core::str::FromStr; #[cfg(feature = "enable-serde")] @@ -50,6 +53,10 @@ pub enum LibCall { NearestF32, /// nearest.f64 NearestF64, + /// fma.f32 + FmaF32, + /// fma.f64 + FmaF64, /// libc.memcpy Memcpy, /// libc.memset @@ -91,6 +98,8 @@ impl FromStr for LibCall { "TruncF64" => Ok(Self::TruncF64), "NearestF32" => Ok(Self::NearestF32), "NearestF64" => Ok(Self::NearestF64), + "FmaF32" => Ok(Self::FmaF32), + "FmaF64" => Ok(Self::FmaF64), "Memcpy" => Ok(Self::Memcpy), "Memset" => Ok(Self::Memset), "Memmove" => Ok(Self::Memmove), @@ -124,6 +133,7 @@ impl LibCall { Opcode::Floor => Self::FloorF32, Opcode::Trunc => Self::TruncF32, Opcode::Nearest => Self::NearestF32, + Opcode::Fma => Self::FmaF32, _ => return None, }, types::F64 => match opcode { @@ -131,6 +141,7 @@ impl LibCall { Opcode::Floor => Self::FloorF64, Opcode::Trunc => Self::TruncF64, Opcode::Nearest => Self::NearestF64, + Opcode::Fma => Self::FmaF64, _ => return None, }, _ => return None, @@ -157,6 +168,8 @@ impl LibCall { TruncF64, NearestF32, NearestF64, + FmaF32, + FmaF64, Memcpy, Memset, Memmove, @@ -164,6 +177,56 @@ impl LibCall { ElfTlsGetAddr, ] } + + /// Get a [Signature] for the function targeted by this [LibCall]. 
+ pub fn signature(&self, call_conv: CallConv) -> Signature { + use types::*; + let mut sig = Signature::new(call_conv); + + match self { + LibCall::UdivI64 + | LibCall::SdivI64 + | LibCall::UremI64 + | LibCall::SremI64 + | LibCall::IshlI64 + | LibCall::UshrI64 + | LibCall::SshrI64 => { + sig.params.push(AbiParam::new(I64)); + sig.params.push(AbiParam::new(I64)); + sig.returns.push(AbiParam::new(I64)); + } + LibCall::CeilF32 | LibCall::FloorF32 | LibCall::TruncF32 | LibCall::NearestF32 => { + sig.params.push(AbiParam::new(F32)); + sig.returns.push(AbiParam::new(F32)); + } + LibCall::TruncF64 | LibCall::FloorF64 | LibCall::CeilF64 | LibCall::NearestF64 => { + sig.params.push(AbiParam::new(F64)); + sig.returns.push(AbiParam::new(F64)); + } + LibCall::FmaF32 | LibCall::FmaF64 => { + let ty = if *self == LibCall::FmaF32 { F32 } else { F64 }; + + sig.params.push(AbiParam::new(ty)); + sig.params.push(AbiParam::new(ty)); + sig.params.push(AbiParam::new(ty)); + sig.returns.push(AbiParam::new(ty)); + } + LibCall::Probestack + | LibCall::Memcpy + | LibCall::Memset + | LibCall::Memmove + | LibCall::Memcmp + | LibCall::ElfTlsGetAddr => unimplemented!(), + } + + if call_conv.extends_baldrdash() { + // Adds the special VMContext parameter to the signature. + sig.params + .push(AbiParam::special(I64, ArgumentPurpose::VMContext)); + } + + sig + } } /// Get a function reference for the probestack function in `func`. 
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 5999fbe87506..6c31e3150c76 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1551,7 +1551,7 @@ impl LowerBackend for AArch64Backend { type MInst = Inst; fn lower>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { - lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.isa_flags) + lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.triple, &self.flags, &self.isa_flags) } fn lower_branch_group>( diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index e6d51db79687..0e80e7c06e4a 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -28,6 +28,7 @@ use crate::{ use std::boxed::Box; use std::convert::TryFrom; use std::vec::Vec; +use target_lexicon::Triple; type BoxCallInfo = Box; type BoxCallIndInfo = Box; @@ -38,6 +39,7 @@ type BoxExternalName = Box; /// The main entry point for lowering with ISLE. 
pub(crate) fn lower( lower_ctx: &mut C, + triple: &Triple, flags: &Flags, isa_flags: &IsaFlags, outputs: &[InsnOutput], @@ -46,9 +48,15 @@ pub(crate) fn lower( where C: LowerCtx, { - lower_common(lower_ctx, flags, isa_flags, outputs, inst, |cx, insn| { - generated_code::constructor_lower(cx, insn) - }) + lower_common( + lower_ctx, + triple, + flags, + isa_flags, + outputs, + inst, + |cx, insn| generated_code::constructor_lower(cx, insn), + ) } pub struct ExtendedValue { diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index b407ef3dd92a..18bd0b8c6828 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -16,11 +16,13 @@ use crate::{CodegenError, CodegenResult}; use alloc::boxed::Box; use alloc::vec::Vec; use core::convert::TryFrom; +use target_lexicon::Triple; /// Actually codegen an instruction's results into registers. pub(crate) fn lower_insn_to_regs>( ctx: &mut C, insn: IRInst, + triple: &Triple, flags: &Flags, isa_flags: &aarch64_settings::Flags, ) -> CodegenResult<()> { @@ -33,7 +35,7 @@ pub(crate) fn lower_insn_to_regs>( None }; - if let Ok(()) = super::lower::isle::lower(ctx, flags, isa_flags, &outputs, insn) { + if let Ok(()) = super::lower::isle::lower(ctx, triple, flags, isa_flags, &outputs, insn) { return Ok(()); } diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs index 2c87621aae32..3c8e0ad9a414 100644 --- a/cranelift/codegen/src/isa/s390x/lower.rs +++ b/cranelift/codegen/src/isa/s390x/lower.rs @@ -30,9 +30,14 @@ impl LowerBackend for S390xBackend { None }; - if let Ok(()) = - super::lower::isle::lower(ctx, &self.flags, &self.isa_flags, &outputs, ir_inst) - { + if let Ok(()) = super::lower::isle::lower( + ctx, + &self.triple, + &self.flags, + &self.isa_flags, + &outputs, + ir_inst, + ) { return Ok(()); } @@ -294,6 +299,7 @@ impl LowerBackend for S390xBackend { // the second branch 
(if any) by emitting a two-way conditional branch. if let Ok(()) = super::lower::isle::lower_branch( ctx, + &self.triple, &self.flags, &self.isa_flags, branches[0], diff --git a/cranelift/codegen/src/isa/s390x/lower/isle.rs b/cranelift/codegen/src/isa/s390x/lower/isle.rs index 2d41c6a88adc..4ccb54d7eb0e 100644 --- a/cranelift/codegen/src/isa/s390x/lower/isle.rs +++ b/cranelift/codegen/src/isa/s390x/lower/isle.rs @@ -25,6 +25,7 @@ use std::boxed::Box; use std::cell::Cell; use std::convert::TryFrom; use std::vec::Vec; +use target_lexicon::Triple; type BoxCallInfo = Box; type BoxCallIndInfo = Box; @@ -36,6 +37,7 @@ type VecMInstBuilder = Cell>; /// The main entry point for lowering with ISLE. pub(crate) fn lower( lower_ctx: &mut C, + triple: &Triple, flags: &Flags, isa_flags: &IsaFlags, outputs: &[InsnOutput], @@ -44,14 +46,21 @@ pub(crate) fn lower( where C: LowerCtx, { - lower_common(lower_ctx, flags, isa_flags, outputs, inst, |cx, insn| { - generated_code::constructor_lower(cx, insn) - }) + lower_common( + lower_ctx, + triple, + flags, + isa_flags, + outputs, + inst, + |cx, insn| generated_code::constructor_lower(cx, insn), + ) } /// The main entry point for branch lowering with ISLE. 
pub(crate) fn lower_branch( lower_ctx: &mut C, + triple: &Triple, flags: &Flags, isa_flags: &IsaFlags, branch: Inst, @@ -60,9 +69,15 @@ pub(crate) fn lower_branch( where C: LowerCtx, { - lower_common(lower_ctx, flags, isa_flags, &[], branch, |cx, insn| { - generated_code::constructor_lower_branch(cx, insn, &targets.to_vec()) - }) + lower_common( + lower_ctx, + triple, + flags, + isa_flags, + &[], + branch, + |cx, insn| generated_code::constructor_lower_branch(cx, insn, &targets.to_vec()), + ) } impl generated_code::Context for IsleContext<'_, C, Flags, IsaFlags, 6> diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 0df0f53c5b7a..081134c3dd46 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -3023,3 +3023,14 @@ (decl synthetic_amode_to_xmm_mem (SyntheticAmode) XmmMem) (rule (synthetic_amode_to_xmm_mem amode) (synthetic_amode_to_reg_mem amode)) + + +;;;; Helpers for Emitting LibCalls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(type LibCall extern + (enum + FmaF32 + FmaF64)) + +(decl libcall_3_ret_1 (LibCall Reg Reg Reg) Reg) +(extern constructor libcall_3_ret_1 libcall_3_ret_1) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 848794f85e6c..a4229c25df37 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2542,6 +2542,14 @@ (rule (lower (has_type $F64X2 (fmax_pseudo x y))) (x64_maxpd y x)) +;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $F32 (fma x y z))) + (libcall_3_ret_1 (LibCall.FmaF32) x y z)) +(rule (lower (has_type $F64 (fma x y z))) + (libcall_3_ret_1 (LibCall.FmaF64) x y z)) + + ;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; In order to load a value from memory to a GPR register, we may need to extend diff --git 
a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 07bf0c6e74f7..4a1b7fef8b4f 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -6,8 +6,7 @@ pub(super) mod isle; use crate::data_value::DataValue; use crate::ir::{ condcodes::{CondCode, FloatCC, IntCC}, - types, AbiParam, ArgumentPurpose, ExternalName, Inst as IRInst, InstructionData, LibCall, - Opcode, Signature, Type, + types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type, }; use crate::isa::x64::abi::*; use crate::isa::x64::inst::args::*; @@ -575,35 +574,13 @@ fn emit_fcmp>( cond_result } -fn make_libcall_sig>( - ctx: &mut C, - insn: IRInst, - call_conv: CallConv, - ptr_ty: Type, -) -> Signature { - let mut sig = Signature::new(call_conv); - for i in 0..ctx.num_inputs(insn) { - sig.params.push(AbiParam::new(ctx.input_ty(insn, i))); - } - for i in 0..ctx.num_outputs(insn) { - sig.returns.push(AbiParam::new(ctx.output_ty(insn, i))); - } - if call_conv.extends_baldrdash() { - // Adds the special VMContext parameter to the signature. - sig.params - .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext)); - } - sig -} - fn emit_vm_call>( ctx: &mut C, flags: &Flags, triple: &Triple, libcall: LibCall, - insn: IRInst, - inputs: SmallVec<[InsnInput; 4]>, - outputs: SmallVec<[InsnOutput; 2]>, + inputs: &[Reg], + outputs: &[Writable], ) -> CodegenResult<()> { let extname = ExternalName::LibCall(libcall); @@ -615,7 +592,7 @@ fn emit_vm_call>( // TODO avoid recreating signatures for every single Libcall function. 
let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple)); - let sig = make_libcall_sig(ctx, insn, call_conv, types::I64); + let sig = libcall.signature(call_conv); let caller_conv = ctx.abi().call_conv(); let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv, flags)?; @@ -626,8 +603,7 @@ fn emit_vm_call>( assert_eq!(inputs.len() + vm_context, abi.num_args()); for (i, input) in inputs.iter().enumerate() { - let arg_reg = put_input_in_reg(ctx, *input); - abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); + abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(*input)); } if call_conv.extends_baldrdash() { let vm_context_vreg = ctx @@ -638,8 +614,7 @@ fn emit_vm_call>( abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); - abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); + abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(*output)); } abi.emit_stack_post_adjust(ctx); @@ -825,7 +800,7 @@ fn lower_insn_to_regs>( None }; - if let Ok(()) = isle::lower(ctx, flags, isa_flags, &outputs, insn) { + if let Ok(()) = isle::lower(ctx, triple, flags, isa_flags, &outputs, insn) { return Ok(()); } @@ -893,7 +868,8 @@ fn lower_insn_to_regs>( | Opcode::Fmin | Opcode::Fmax | Opcode::FminPseudo - | Opcode::FmaxPseudo => implemented_in_isle(ctx), + | Opcode::FmaxPseudo + | Opcode::Fma => implemented_in_isle(ctx), Opcode::Icmp => { implemented_in_isle(ctx); @@ -2103,7 +2079,11 @@ fn lower_insn_to_regs>( ty, op ), }; - emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?; + + let input = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + emit_vm_call(ctx, flags, triple, libcall, &[input], &[dst])?; } } @@ -2917,8 +2897,6 @@ fn lower_insn_to_regs>( Opcode::Cls => unimplemented!("Cls not supported"), - Opcode::Fma => unimplemented!("Fma not supported"), - Opcode::BorNot | 
Opcode::BxorNot => { unimplemented!("or-not / xor-not opcodes not implemented"); } diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 9b068b1ebaaf..0b3e2a78c1db 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -10,6 +10,8 @@ use generated_code::MInst; // Types that the generated ISLE code uses via `use super::*`. use super::{is_int_or_ref_ty, is_mergeable_load, lower_to_amode}; +use crate::ir::LibCall; +use crate::isa::x64::lower::emit_vm_call; use crate::{ ir::{ condcodes::{FloatCC, IntCC}, @@ -31,6 +33,7 @@ use crate::{ }; use std::boxed::Box; use std::convert::TryFrom; +use target_lexicon::Triple; type BoxCallInfo = Box; @@ -43,6 +46,7 @@ pub struct SinkableLoad { /// The main entry point for lowering with ISLE. pub(crate) fn lower( lower_ctx: &mut C, + triple: &Triple, flags: &Flags, isa_flags: &IsaFlags, outputs: &[InsnOutput], @@ -51,9 +55,15 @@ pub(crate) fn lower( where C: LowerCtx, { - lower_common(lower_ctx, flags, isa_flags, outputs, inst, |cx, insn| { - generated_code::constructor_lower(cx, insn) - }) + lower_common( + lower_ctx, + triple, + flags, + isa_flags, + outputs, + inst, + |cx, insn| generated_code::constructor_lower(cx, insn), + ) } impl generated_code::Context for IsleContext<'_, C, Flags, IsaFlags, 6> @@ -573,6 +583,25 @@ where fn atomic_rmw_op_to_mach_atomic_rmw_op(&mut self, op: &AtomicRmwOp) -> MachAtomicRmwOp { MachAtomicRmwOp::from(*op) } + + #[inline] + fn libcall_3_ret_1(&mut self, libcall: &LibCall, a: Reg, b: Reg, c: Reg) -> Reg { + let call_conv = self.lower_ctx.abi().call_conv(); + let ret_ty = libcall.signature(call_conv).returns[0].value_type; + let output_reg = self.lower_ctx.alloc_tmp(ret_ty).only_reg().unwrap(); + + emit_vm_call( + self.lower_ctx, + self.flags, + self.triple, + libcall.clone(), + &[a, b, c], + &[output_reg], + ) + .expect("Failed to emit LibCall"); + + output_reg.to_reg() + } } // Since x64 
doesn't have 8x16 shifts and we must use a 16x8 shift instead, we diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 28005863df33..6fd2d0743af8 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -4,6 +4,7 @@ use alloc::boxed::Box; use alloc::vec::Vec; use smallvec::SmallVec; use std::cell::Cell; +use target_lexicon::Triple; pub use super::MachLabel; pub use crate::ir::{ @@ -741,6 +742,7 @@ where [(C::I, bool); N]: smallvec::Array, { pub lower_ctx: &'a mut C, + pub triple: &'a Triple, pub flags: &'a F, pub isa_flags: &'a I, } @@ -752,6 +754,7 @@ where /// lowering. pub(crate) fn lower_common( lower_ctx: &mut C, + triple: &Triple, flags: &F, isa_flags: &I, outputs: &[InsnOutput], @@ -767,6 +770,7 @@ where // internal heap allocations. let mut isle_ctx = IsleContext { lower_ctx, + triple, flags, isa_flags, }; diff --git a/cranelift/filetests/filetests/runtests/fma.clif b/cranelift/filetests/filetests/runtests/fma.clif index e9429f4b51d4..ff0c9d05f6a9 100644 --- a/cranelift/filetests/filetests/runtests/fma.clif +++ b/cranelift/filetests/filetests/runtests/fma.clif @@ -2,6 +2,7 @@ test interpret test run target aarch64 target s390x +target x86_64 function %fma_f32(f32, f32, f32) -> f32 { block0(v0: f32, v1: f32, v2: f32): diff --git a/cranelift/module/src/lib.rs b/cranelift/module/src/lib.rs index 82562ae5df64..5b307b60a9a1 100644 --- a/cranelift/module/src/lib.rs +++ b/cranelift/module/src/lib.rs @@ -70,6 +70,8 @@ pub fn default_libcall_names() -> Box String + Send + Syn ir::LibCall::TruncF64 => "trunc".to_owned(), ir::LibCall::NearestF32 => "nearbyintf".to_owned(), ir::LibCall::NearestF64 => "nearbyint".to_owned(), + ir::LibCall::FmaF32 => "fmaf".to_owned(), + ir::LibCall::FmaF64 => "fma".to_owned(), ir::LibCall::Memcpy => "memcpy".to_owned(), ir::LibCall::Memset => "memset".to_owned(), ir::LibCall::Memmove => "memmove".to_owned(),