From 3b2685c4a7b957843031a252b518dc08ea6970ff Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 17 Nov 2021 12:44:25 +0100 Subject: [PATCH] Make have_fma a Julia intrinsic. --- base/compiler/optimize.jl | 1 + base/compiler/tfuncs.jl | 3 ++- base/floatfuncs.jl | 7 ++----- src/intrinsics.cpp | 21 +++++++++++++++++++++ src/intrinsics.h | 2 ++ src/jl_exported_funcs.inc | 1 + src/julia_internal.h | 1 + src/llvm-cpufeatures.cpp | 5 ++++- src/runtime_intrinsics.c | 7 +++++++ 9 files changed, 41 insertions(+), 7 deletions(-) diff --git a/base/compiler/optimize.jl b/base/compiler/optimize.jl index 38db05a08a875..23333b30cdce1 100644 --- a/base/compiler/optimize.jl +++ b/base/compiler/optimize.jl @@ -416,6 +416,7 @@ function is_pure_intrinsic_infer(f::IntrinsicFunction) f === Intrinsics.arraylen || # this one is volatile f === Intrinsics.sqrt_llvm || # this one may differ at runtime (by a few ulps) f === Intrinsics.sqrt_llvm_fast || # this one may differ at runtime (by a few ulps) + f === Intrinsics.have_fma || # this one depends on the runtime environment f === Intrinsics.cglobal) # cglobal lookup answer changes at runtime end diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index 0b571a087647b..4444242bafc9c 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -10,7 +10,7 @@ const _NAMEDTUPLE_NAME = NamedTuple.body.body.name const INT_INF = typemax(Int) # integer infinity -const N_IFUNC = reinterpret(Int32, arraylen) + 1 +const N_IFUNC = reinterpret(Int32, have_fma) + 1 const T_IFUNC = Vector{Tuple{Int, Int, Any}}(undef, N_IFUNC) const T_IFUNC_COST = Vector{Int}(undef, N_IFUNC) const T_FFUNC_KEY = Vector{Any}() @@ -214,6 +214,7 @@ cglobal_tfunc(@nospecialize(fptr)) = Ptr{Cvoid} cglobal_tfunc(@nospecialize(fptr), @nospecialize(t)) = (isType(t) ? Ptr{t.parameters[1]} : Ptr) cglobal_tfunc(@nospecialize(fptr), t::Const) = (isa(t.val, Type) ? 
Ptr{t.val} : Ptr) add_tfunc(Core.Intrinsics.cglobal, 1, 2, cglobal_tfunc, 5) +add_tfunc(Core.Intrinsics.have_fma, 1, 1, @nospecialize(x)->Bool, 1) function ifelse_tfunc(@nospecialize(cnd), @nospecialize(x), @nospecialize(y)) if isa(cnd, Const) diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl index ed306edcbb3a0..602e3178889e5 100644 --- a/base/floatfuncs.jl +++ b/base/floatfuncs.jl @@ -409,11 +409,8 @@ fma_llvm(x::Float64, y::Float64, z::Float64) = fma_float(x, y, z) # Disable LLVM's fma if it is incorrect, e.g. because LLVM falls back # onto a broken system libm; if so, use a software emulated fma -have_fma(::Type) = false -have_fma(::Type{Float32}) = ccall("extern julia.cpu.have_fma.f32", llvmcall, Int, ()) == 1 -have_fma(::Type{Float64}) = ccall("extern julia.cpu.have_fma.f64", llvmcall, Int, ()) == 1 -fma(x::Float32, y::Float32, z::Float32) = have_fma(Float32) ? fma_llvm(x,y,z) : fma_emulated(x,y,z) -fma(x::Float64, y::Float64, z::Float64) = have_fma(Float64) ? fma_llvm(x,y,z) : fma_emulated(x,y,z) +fma(x::Float32, y::Float32, z::Float32) = Core.Intrinsics.have_fma(Float32) ? fma_llvm(x,y,z) : fma_emulated(x,y,z) +fma(x::Float64, y::Float64, z::Float64) = Core.Intrinsics.have_fma(Float64) ? fma_llvm(x,y,z) : fma_emulated(x,y,z) function fma(a::Float16, b::Float16, c::Float16) Float16(muladd(Float32(a), Float32(b), Float32(c))) #don't use fma if the hardware doesn't have it. 
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 2f60b39b4c6e7..6fe51cc8b6b22 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -1146,6 +1146,27 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return mark_julia_type(ctx, ans, false, x.typ); } + case have_fma: { + assert(nargs == 1); + const jl_cgval_t &x = argv[0]; + if (!x.constant || !jl_is_datatype(x.constant)) + return emit_runtime_call(ctx, f, argv, nargs); + jl_datatype_t *dt = (jl_datatype_t*) x.constant; + + // select the appropriate overloaded intrinsic + std::string intr_name = "julia.cpu.have_fma."; + if (dt == jl_float32_type) + intr_name += "f32"; + else if (dt == jl_float64_type) + intr_name += "f64"; + else + return emit_runtime_call(ctx, f, argv, nargs); + + FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, T_int1); + auto ret = ctx.builder.CreateCall(intr); + return mark_julia_type(ctx, ret, false, jl_bool_type); + } + default: { assert(nargs >= 1 && "invalid nargs for intrinsic call"); const jl_cgval_t &xinfo = argv[0]; diff --git a/src/intrinsics.h b/src/intrinsics.h index 52988a313c990..bb67460bbb31f 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -103,6 +103,8 @@ ALIAS(llvmcall, llvmcall) \ /* object access */ \ ADD_I(arraylen, 1) \ + /* cpu feature tests */ \ + ADD_I(have_fma, 1) \ /* hidden intrinsics */ \ ADD_HIDDEN(cglobal_auto, 1) diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index cff8f35a63817..1462dab81cb2b 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -32,6 +32,7 @@ XX(jl_array_grow_end) \ XX(jl_array_isassigned) \ XX(jl_arraylen) \ + XX(jl_have_fma) \ XX(jl_array_ptr) \ XX(jl_array_ptr_1d_append) \ XX(jl_array_ptr_1d_push) \ diff --git a/src/julia_internal.h b/src/julia_internal.h index b8a09288d4548..ee46eb88297af 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1240,6 +1240,7 @@ JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, 
jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_arraylen(jl_value_t *a); +JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *a); JL_DLLEXPORT int jl_stored_inline(jl_value_t *el_type); JL_DLLEXPORT jl_value_t *(jl_array_data_owner)(jl_array_t *a); JL_DLLEXPORT int jl_array_isassigned(jl_array_t *a, size_t i); diff --git a/src/llvm-cpufeatures.cpp b/src/llvm-cpufeatures.cpp index 47e85cc379b34..934c0a135d82f 100644 --- a/src/llvm-cpufeatures.cpp +++ b/src/llvm-cpufeatures.cpp @@ -5,7 +5,10 @@ // specific CPU features. // // The following intrinsics are supported: -// - julia.cpu.have_fma.$typ: returns 1 if the platform supports hardware-accelerated FMA +// - julia.cpu.have_fma.$typ: returns 1 if the platform supports hardware-accelerated FMA. +// +// Some of these intrinsics are overloaded, i.e., they are suffixed with a type name. +// To extend support, make sure codegen (in intrinsics.cpp) knows how to emit them. // // XXX: can / do we want to make this a codegen pass to enable querying TargetPassConfig // instead of using the global target machine? diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 5b1862ac3f282..51bb3df36dbaf 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1349,3 +1349,10 @@ JL_DLLEXPORT jl_value_t *jl_arraylen(jl_value_t *a) JL_TYPECHK(arraylen, array, a); return jl_box_long(jl_array_len((jl_array_t*)a)); } + +JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *typ) +{ + JL_TYPECHK(have_fma, datatype, typ); + // TODO: run-time feature check? + return jl_false; +}