Improve use of intrinsics for calculation

boostorg · May 25, 2022 · 49881be · 49881be
1 parent 720573a
commit 49881be
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 6 deletions.
diff --git a/doc/sf/ccmath.qbk b/doc/sf/ccmath.qbk
@@ -184,6 +184,7 @@ All of the following functions require C++17 or greater.
 
         template <typename Real>
         inline constexpr Real fma(Real x, Real y, Real z) noexcept
+        Requires compiling with fma flag
 
         template <typename Arithmetic1, typename Arithmetic2, typename Arithmetic3>
         inline constepxr Promoted fma(Arithmetic1 x, Arithmetic2 y, Arithmetic3 z) noexcept

diff --git a/include/boost/math/ccmath/fma.hpp b/include/boost/math/ccmath/fma.hpp
@@ -13,16 +13,19 @@
 #include <boost/math/ccmath/isinf.hpp>
 #include <boost/math/ccmath/isnan.hpp>
 
+#if __has_include("immintrin.h") && defined(__X86_64__) || defined(__amd64__)
+#   include "immintrin.h"
+#   define BOOST_MATH_HAS_IMMINTRIN_H
+#endif
+
 namespace boost::math::ccmath {
 
 namespace detail {
 
 template <typename T>
 inline constexpr T fma_imp(const T x, const T y, const T z) noexcept
 {
-    #if __GNUC__ < 10
-        return (x * y) + z;
-    #else
+    #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && !defined(__INTEL_LLVM_COMPILER)
     if constexpr (std::is_same_v<T, float>)
     {
         return __builtin_fmaf(x, y, z);
@@ -35,11 +38,23 @@ inline constexpr T fma_imp(const T x, const T y, const T z) noexcept
     {
         return __builtin_fmal(x, y, z);
     }
-    else // e.g. Boost.Multiprecision types where no built-in exists
+    #elif defined(BOOST_MATH_HAS_IMMINTRIN_H)
+    if constexpr (std::is_same_v<T, float>)
+    {
+        return static_cast<float>(_mm_fmadd_ps(x, y, z));
+    }
+    else if constexpr (std::is_same_v<T, double>)
+    {
+        return static_cast<double>(_mm_fmadd_pd(x, y, z));
+    }
+    else if constexpr (std::is_same_v<T, long double>)
     {
-        return (x * y) + z;
+        return static_cast<long double>(_mm256_fmadd_pd(x, y, z));
     }
     #endif
+
+    // If we can't use compiler intrinsics hope that -fma flag optimizes this call to fma instruction
+    return (x * y) + z;
 }
 
 } // Namespace detail

diff --git a/test/ccmath_fma_test.cpp b/test/ccmath_fma_test.cpp
@@ -17,6 +17,7 @@
 #include <boost/multiprecision/float128.hpp>
 #endif
 
+#if !defined(BOOST_MATH_NO_CONSTEXPR_DETECTION) && !defined(BOOST_MATH_USING_BUILTIN_CONSTANT_P)
 template <typename T>
 constexpr void test()
 {
@@ -49,7 +50,6 @@ constexpr void test()
     }
 }
 
-#if !defined(BOOST_MATH_NO_CONSTEXPR_DETECTION) && !defined(BOOST_MATH_USING_BUILTIN_CONSTANT_P)
 int main()
 {
     test<float>();