From 3b2685c4a7b957843031a252b518dc08ea6970ff Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 17 Nov 2021 12:44:25 +0100
Subject: [PATCH] Make have_fma a Julia intrinsic.

---
 base/compiler/optimize.jl |  1 +
 base/compiler/tfuncs.jl   |  3 ++-
 base/floatfuncs.jl        |  7 ++-----
 src/intrinsics.cpp        | 21 +++++++++++++++++++++
 src/intrinsics.h          |  2 ++
 src/jl_exported_funcs.inc |  1 +
 src/julia_internal.h      |  1 +
 src/llvm-cpufeatures.cpp  |  5 ++++-
 src/runtime_intrinsics.c  |  7 +++++++
 9 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/base/compiler/optimize.jl b/base/compiler/optimize.jl
index 38db05a08a875..23333b30cdce1 100644
--- a/base/compiler/optimize.jl
+++ b/base/compiler/optimize.jl
@@ -416,6 +416,7 @@ function is_pure_intrinsic_infer(f::IntrinsicFunction)
              f === Intrinsics.arraylen ||   # this one is volatile
              f === Intrinsics.sqrt_llvm ||  # this one may differ at runtime (by a few ulps)
              f === Intrinsics.sqrt_llvm_fast ||  # this one may differ at runtime (by a few ulps)
+             f === Intrinsics.have_fma ||  # this one depends on the runtime environment
              f === Intrinsics.cglobal)  # cglobal lookup answer changes at runtime
 end
 
diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl
index 0b571a087647b..4444242bafc9c 100644
--- a/base/compiler/tfuncs.jl
+++ b/base/compiler/tfuncs.jl
@@ -10,7 +10,7 @@ const _NAMEDTUPLE_NAME = NamedTuple.body.body.name
 
 const INT_INF = typemax(Int) # integer infinity
 
-const N_IFUNC = reinterpret(Int32, arraylen) + 1
+const N_IFUNC = reinterpret(Int32, have_fma) + 1
 const T_IFUNC = Vector{Tuple{Int, Int, Any}}(undef, N_IFUNC)
 const T_IFUNC_COST = Vector{Int}(undef, N_IFUNC)
 const T_FFUNC_KEY = Vector{Any}()
@@ -214,6 +214,7 @@ cglobal_tfunc(@nospecialize(fptr)) = Ptr{Cvoid}
 cglobal_tfunc(@nospecialize(fptr), @nospecialize(t)) = (isType(t) ? Ptr{t.parameters[1]} : Ptr)
 cglobal_tfunc(@nospecialize(fptr), t::Const) = (isa(t.val, Type) ? Ptr{t.val} : Ptr)
 add_tfunc(Core.Intrinsics.cglobal, 1, 2, cglobal_tfunc, 5)
+add_tfunc(Core.Intrinsics.have_fma, 1, 1, @nospecialize(x)->Bool, 1)
 
 function ifelse_tfunc(@nospecialize(cnd), @nospecialize(x), @nospecialize(y))
     if isa(cnd, Const)
diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index ed306edcbb3a0..602e3178889e5 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -409,11 +409,8 @@ fma_llvm(x::Float64, y::Float64, z::Float64) = fma_float(x, y, z)
 
 # Disable LLVM's fma if it is incorrect, e.g. because LLVM falls back
 # onto a broken system libm; if so, use a software emulated fma
-have_fma(::Type) = false
-have_fma(::Type{Float32}) = ccall("extern julia.cpu.have_fma.f32", llvmcall, Int, ()) == 1
-have_fma(::Type{Float64}) = ccall("extern julia.cpu.have_fma.f64", llvmcall, Int, ()) == 1
-fma(x::Float32, y::Float32, z::Float32) = have_fma(Float32) ? fma_llvm(x,y,z) : fma_emulated(x,y,z)
-fma(x::Float64, y::Float64, z::Float64) = have_fma(Float64) ? fma_llvm(x,y,z) : fma_emulated(x,y,z)
+fma(x::Float32, y::Float32, z::Float32) = Core.Intrinsics.have_fma(Float32) ? fma_llvm(x,y,z) : fma_emulated(x,y,z)
+fma(x::Float64, y::Float64, z::Float64) = Core.Intrinsics.have_fma(Float64) ? fma_llvm(x,y,z) : fma_emulated(x,y,z)
 
 function fma(a::Float16, b::Float16, c::Float16)
     Float16(muladd(Float32(a), Float32(b), Float32(c))) #don't use fma if the hardware doesn't have it.
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index 2f60b39b4c6e7..6fe51cc8b6b22 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -1146,6 +1146,27 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar
         return mark_julia_type(ctx, ans, false, x.typ);
     }
 
+    case have_fma: { // have_fma(T): emit a call to julia.cpu.have_fma.<suffix> when T is a constant Float32/Float64
+        assert(nargs == 1);
+        const jl_cgval_t &x = argv[0];
+        if (!x.constant || !jl_is_datatype(x.constant)) // argument is not a compile-time constant datatype
+            return emit_runtime_call(ctx, f, argv, nargs);
+        jl_datatype_t *dt = (jl_datatype_t*) x.constant;
+
+        // select the appropriate overloaded intrinsic (its name is suffixed with the type)
+        std::string intr_name = "julia.cpu.have_fma.";
+        if (dt == jl_float32_type)
+            intr_name += "f32";
+        else if (dt == jl_float64_type)
+            intr_name += "f64";
+        else // no intrinsic variant exists for this type; defer to the runtime call
+            return emit_runtime_call(ctx, f, argv, nargs);
+
+        FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, T_int1);
+        auto ret = ctx.builder.CreateCall(intr); // zero-argument call; presumably resolved later by the pass in llvm-cpufeatures.cpp
+        return mark_julia_type(ctx, ret, false, jl_bool_type);
+    }
+
     default: {
         assert(nargs >= 1 && "invalid nargs for intrinsic call");
         const jl_cgval_t &xinfo = argv[0];
diff --git a/src/intrinsics.h b/src/intrinsics.h
index 52988a313c990..bb67460bbb31f 100644
--- a/src/intrinsics.h
+++ b/src/intrinsics.h
@@ -103,6 +103,8 @@
     ALIAS(llvmcall, llvmcall) \
     /*  object access */ \
     ADD_I(arraylen, 1) \
+    /*  cpu feature tests */ \
+    ADD_I(have_fma, 1) \
     /*  hidden intrinsics */ \
     ADD_HIDDEN(cglobal_auto, 1)
 
diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc
index cff8f35a63817..1462dab81cb2b 100644
--- a/src/jl_exported_funcs.inc
+++ b/src/jl_exported_funcs.inc
@@ -32,6 +32,7 @@
     XX(jl_array_grow_end) \
     XX(jl_array_isassigned) \
     XX(jl_arraylen) \
+    XX(jl_have_fma) \
     XX(jl_array_ptr) \
     XX(jl_array_ptr_1d_append) \
     XX(jl_array_ptr_1d_push) \
diff --git a/src/julia_internal.h b/src/julia_internal.h
index b8a09288d4548..ee46eb88297af 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -1240,6 +1240,7 @@ JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b);
 
 JL_DLLEXPORT jl_value_t *jl_arraylen(jl_value_t *a);
+JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *a);
 JL_DLLEXPORT int jl_stored_inline(jl_value_t *el_type);
 JL_DLLEXPORT jl_value_t *(jl_array_data_owner)(jl_array_t *a);
 JL_DLLEXPORT int jl_array_isassigned(jl_array_t *a, size_t i);
diff --git a/src/llvm-cpufeatures.cpp b/src/llvm-cpufeatures.cpp
index 47e85cc379b34..934c0a135d82f 100644
--- a/src/llvm-cpufeatures.cpp
+++ b/src/llvm-cpufeatures.cpp
@@ -5,7 +5,10 @@
 // specific CPU features.
 //
 // The following intrinsics are supported:
-// - julia.cpu.have_fma.$typ: returns 1 if the platform supports hardware-accelerated FMA
+// - julia.cpu.have_fma.$typ: returns 1 if the platform supports hardware-accelerated FMA.
+//
+// Some of these intrinsics are overloaded, i.e., they are suffixed with a type name.
+// To extend support, make sure codegen (in intrinsics.cpp) knows how to emit them.
 //
 // XXX: can / do we want to make this a codegen pass to enable querying TargetPassConfig
 //      instead of using the global target machine?
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index 5b1862ac3f282..51bb3df36dbaf 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -1349,3 +1349,10 @@ JL_DLLEXPORT jl_value_t *jl_arraylen(jl_value_t *a)
     JL_TYPECHK(arraylen, array, a);
     return jl_box_long(jl_array_len((jl_array_t*)a));
 }
+
+JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *typ)
+{
+    JL_TYPECHK(have_fma, datatype, typ);
+    // TODO: run-time feature check? For now this path always reports no FMA support.
+    return jl_false;
+}