shibatch · shibatch · Sep 16, 2020 · Sep 15, 2020 · Sep 15, 2020 · Sep 15, 2020
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -22,6 +22,8 @@ option(ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)
 
 option(DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF)
 
+option(ENABLE_CUDA "Enable CUDA" OFF)
+
 cmake_minimum_required(VERSION 3.4.3)
 
 # Set to NEW when updating cmake_minimum_required to VERSION >= 3.7.2
@@ -36,14 +38,20 @@ endif()
 enable_testing()
 
 set(SLEEF_VERSION_MAJOR 3)
-set(SLEEF_VERSION_MINOR 5)
-set(SLEEF_VERSION_PATCHLEVEL 1)
+set(SLEEF_VERSION_MINOR 6)
+set(SLEEF_VERSION_PATCHLEVEL 0)
 set(SLEEF_VERSION ${SLEEF_VERSION_MAJOR}.${SLEEF_VERSION_MINOR}.${SLEEF_VERSION_PATCHLEVEL})
 set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
 
-project(SLEEF
-	VERSION ${SLEEF_VERSION}
-	LANGUAGES C)
+if (ENABLE_CUDA)
+  project(SLEEF
+    VERSION ${SLEEF_VERSION}
+    LANGUAGES C CUDA)
+else()
+  project(SLEEF
+    VERSION ${SLEEF_VERSION}
+    LANGUAGES C)
+endif()
 
 # For specifying installation directories
 include(GNUInstallDirs)

diff --git a/Configure.cmake b/Configure.cmake
@@ -1,6 +1,7 @@
 include(CheckCCompilerFlag)
 include(CheckCSourceCompiles)
 include(CheckTypeSize)
+include(CheckLanguage)
 
 if (BUILD_STATIC_TEST_BINS)
   set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
@@ -264,21 +265,22 @@ command_arguments(RENAME_PARAMS_NEON32          cinz_ 2 4 neon)
 command_arguments(RENAME_PARAMS_NEON32VFPV4     finz_ 2 4 neonvfpv4)
 command_arguments(RENAME_PARAMS_VSX             finz_ 2 4 vsx)
 command_arguments(RENAME_PARAMS_VSXNOFMA        cinz_ 2 4 vsxnofma)
-command_arguments(RENAME_PARAMS_ZVECTOR2             finz_ 2 4 zvector2)
-command_arguments(RENAME_PARAMS_ZVECTOR2NOFMA        cinz_ 2 4 zvector2nofma)
+command_arguments(RENAME_PARAMS_ZVECTOR2        finz_ 2 4 zvector2)
+command_arguments(RENAME_PARAMS_ZVECTOR2NOFMA   cinz_ 2 4 zvector2nofma)
 command_arguments(RENAME_PARAMS_PUREC_SCALAR    cinz_ 1 1 purec)
 command_arguments(RENAME_PARAMS_PURECFMA_SCALAR finz_ 1 1 purecfma)
+command_arguments(RENAME_PARAMS_CUDA            finz_ 1 1 cuda)
 # The vector length parameters in SVE, for SP and DP, are chosen for
 # the smallest SVE vector size (128-bit). The name is generated using
 # the "x" token of VLA SVE vector functions.
 command_arguments(RENAME_PARAMS_SVE             finz_ x x sve)
 command_arguments(RENAME_PARAMS_SVENOFMA        cinz_ x x svenofma)
 
-command_arguments(RENAME_PARAMS_GNUABI_SSE2    sse2 b 2 4 _mm128d _mm128 _mm128i _mm128i __SSE2__)
-command_arguments(RENAME_PARAMS_GNUABI_AVX     avx c 4 8 __m256d __m256 __m128i "struct { __m128i x, y$<SEMICOLON> }" __AVX__)
-command_arguments(RENAME_PARAMS_GNUABI_AVX2    avx2 d 4 8 __m256d __m256 __m128i __m256i __AVX2__)
-command_arguments(RENAME_PARAMS_GNUABI_AVX512F avx512f e 8 16 __m512d __m512 __m256i __m512i __AVX512F__)
-command_arguments(RENAME_PARAMS_GNUABI_ADVSIMD advsimd n 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON)
+command_arguments(RENAME_PARAMS_GNUABI_SSE2     sse2 b 2 4 _mm128d _mm128 _mm128i _mm128i __SSE2__)
+command_arguments(RENAME_PARAMS_GNUABI_AVX      avx c 4 8 __m256d __m256 __m128i "struct { __m128i x, y$<SEMICOLON> }" __AVX__)
+command_arguments(RENAME_PARAMS_GNUABI_AVX2     avx2 d 4 8 __m256d __m256 __m128i __m256i __AVX2__)
+command_arguments(RENAME_PARAMS_GNUABI_AVX512F  avx512f e 8 16 __m512d __m512 __m256i __m512i __AVX512F__)
+command_arguments(RENAME_PARAMS_GNUABI_ADVSIMD  advsimd n 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON)
 # The vector length parameters in SVE, for SP and DP, are chosen for
 # the smallest SVE vector size (128-bit). The name is generated using
 # the "x" token of VLA SVE vector functions.
@@ -714,6 +716,14 @@ if (ENFORCE_ZVECTOR2 AND NOT COMPILER_SUPPORTS_ZVECTOR2)
   message(FATAL_ERROR "ENFORCE_ZVECTOR2 is specified and that feature is disabled or not supported by the compiler")
 endif()
 
+# CUDA
+
+option(ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
+
+if (ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER)
+  message(FATAL_ERROR "ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler")
+endif()
+
 # OpenMP
 
 option(DISABLE_OPENMP "Disable OPENMP" OFF)

diff --git a/appveyor.yml b/appveyor.yml
@@ -30,9 +30,9 @@ build_script:
   - if "%DO_TEST%" == "TRUE" call p.bat
   - mkdir build
   - cd build
-  - cmake -G"Visual Studio 16 2019" .. -DRUNNING_ON_APPVEYOR=TRUE -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE -DBUILD_INLINE_HEADERS=TRUE  -DENFORCE_SSE2=TRUE -DENFORCE_SSE4=TRUE -DENFORCE_AVX=TRUE -DENFORCE_FMA4=TRUE -DENFORCE_AVX2=TRUE -DENFORCE_AVX512F=TRUE %ENV_BUILD_STATIC%
+  - cmake -G"Visual Studio 16 2019" .. -DRUNNING_ON_APPVEYOR=TRUE -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE -DBUILD_INLINE_HEADERS=TRUE -DENFORCE_SSE2=TRUE -DENFORCE_SSE4=TRUE -DENFORCE_AVX=TRUE -DENFORCE_FMA4=TRUE -DENFORCE_AVX2=TRUE -DENFORCE_AVX512F=TRUE %ENV_BUILD_STATIC%
   - cmake --build . --target install --config Release
-  - if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 4 -C Release)
+  - if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 16 -C Release)
   - cd "%BUILDFOLDER%"
   - echo PATH %ORGPATH%;c:\Cygwin64\bin;c:\Cygwin64\usr\bin;%CD%\build-cygwin\bin;%CD%\build-clang\bin > q.bat
   - powershell -Command "(gc q.bat) -replace ' ;', ';' | Out-File -encoding ASCII p.bat"
@@ -42,7 +42,7 @@ build_script:
   - cmake -G Ninja .. -DRUNNING_ON_APPVEYOR=TRUE -DCMAKE_C_COMPILER:PATH="C:\Program Files\LLVM\bin\clang.exe" -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE -DBUILD_INLINE_HEADERS=TRUE -DBUILD_QUAD=TRUE  -DENFORCE_SSE2=TRUE -DENFORCE_SSE4=TRUE -DENFORCE_AVX=TRUE -DENFORCE_FMA4=TRUE -DENFORCE_AVX2=TRUE -DENFORCE_AVX512F=TRUE %ENV_BUILD_STATIC%
   - ninja
 test_script:
-  - if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 4 -C Release)
+  - if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 16 -C Release)
 artifacts:
 - path: build\install\**\*
   name: SLEEFWindowsx64
diff --git a/src/arch/helperpurec_scalar.h b/src/arch/helperpurec_scalar.h
@@ -48,13 +48,13 @@
 #define ENABLE_SP
 //@#define ENABLE_SP
 
-#if CONFIG == 2
+#if CONFIG == 2 || CONFIG == 3
 #define ENABLE_FMA_DP
 //@#define ENABLE_FMA_DP
 #define ENABLE_FMA_SP
 //@#define ENABLE_FMA_SP
 
-#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__)
+#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) || CONFIG == 3
 #ifndef FP_FAST_FMA
 #define FP_FAST_FMA
 //@#define FP_FAST_FMA
@@ -68,11 +68,12 @@
 #if (!defined(FP_FAST_FMA) || !defined(FP_FAST_FMAF)) && !defined(SLEEF_GENHEADER)
 #error FP_FAST_FMA or FP_FAST_FMAF not defined
 #endif
+
 #define ISANAME "Pure C scalar with FMA"
 
-#else // #if CONFIG == 2
+#else // #if CONFIG == 2 || CONFIG == 3
 #define ISANAME "Pure C scalar"
-#endif // #if CONFIG == 2
+#endif // #if CONFIG == 2 || CONFIG == 3
 
 #define LOG2VECTLENDP 0
 //@#define LOG2VECTLENDP 0
@@ -86,7 +87,7 @@
 #define ACCURATE_SQRT
 //@#define ACCURATE_SQRT
 
-#if defined(__SSE4_1__) || defined(__aarch64__)
+#if defined(__SSE4_1__) || defined(__aarch64__) || CONFIG == 3
 #define FULL_FP_ROUNDING
 //@#define FULL_FP_ROUNDING
 #endif
@@ -191,7 +192,7 @@ static INLINE vint vtruncate_vi_vd(vdouble vd) { return (int32_t)TRUNC(vd); }
 #else
 static INLINE vint vrint_vi_vd(vdouble a) {
   a += a > 0 ? 0.5 : -0.5;
-  versatileVector v = { .d = a }; v.x -= 1 & (int)a;
+  versatileVector v; v.d = a; v.x -= 1 & (int)a;
   return (int32_t)v.d;
 }
 static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
@@ -215,7 +216,7 @@ static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
 static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
 static INLINE vdouble vrec_vd_vd(vdouble x)               { return 1 / x; }
 
-static INLINE vdouble vabs_vd_vd(vdouble d) { versatileVector v = { .d = d }; v.x &= 0x7fffffffffffffffULL; return v.d; }
+static INLINE vdouble vabs_vd_vd(vdouble d) { versatileVector v; v.d = d; v.x &= 0x7fffffffffffffffULL; return v.d; }
 static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
 
 static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return x > y ? x : y; }
@@ -298,7 +299,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (int32_t)TRUNCF(vf); }
 #else
 static INLINE vint2 vrint_vi2_vf(vfloat a) {
   a += a > 0 ? 0.5f : -0.5f;
-  versatileVector v = { .f = a }; v.u[0] -= 1 & (int)a;
+  versatileVector v; v.f = a; v.u[0] -= 1 & (int)a;
   return (int32_t)v.f;
 }
 static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
@@ -323,7 +324,7 @@ static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
 static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
 static INLINE vfloat vrec_vf_vf   (vfloat x)           { return 1 / x; }
 
-static INLINE vfloat vabs_vf_vf(vfloat x) { versatileVector v = { .f = x }; v.i[0] &= 0x7fffffff; return v.f; }
+static INLINE vfloat vabs_vf_vf(vfloat x) { versatileVector v; v.f = x; v.i[0] &= 0x7fffffff; return v.f; }
 static INLINE vfloat vneg_vf_vf(vfloat x) { return -x; }
 
 static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return x > y ? x : y; }
@@ -351,9 +352,9 @@ static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y)  { return x <= y ? ~(uint
 static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y)  { return x >  y ? ~(uint32_t)0 : 0; }
 static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y)  { return x >= y ? ~(uint32_t)0 : 0; }
 
-static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v = { .i2 = x }, w = { .i2 = y }; v.i[0] += w.i[0]; v.i[1] += w.i[1]; return v.i2; }
-static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v = { .i2 = x }, w = { .i2 = y }; v.i[0] -= w.i[0]; v.i[1] -= w.i[1]; return v.i2; }
-static INLINE vint2 vneg_vi2_vi2(vint2 x)              { versatileVector v = { .i2 = x }; v.i[0] = -v.i[0]; v.i[1] = -v.i[1]; return v.i2; }
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v, w; v.i2 = x; w.i2 = y; v.i[0] += w.i[0]; v.i[1] += w.i[1]; return v.i2; }
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v, w; v.i2 = x; w.i2 = y; v.i[0] -= w.i[0]; v.i[1] -= w.i[1]; return v.i2; }
+static INLINE vint2 vneg_vi2_vi2(vint2 x)              { versatileVector v; v.i2 = x; v.i[0] = -v.i[0]; v.i[1] = -v.i[1]; return v.i2; }
 
 static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y)    { return x & y; }
 static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
@@ -374,9 +375,9 @@ static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2
 static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vcast_vm_vo(x) & y; }
 static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~vcast_vm_vo(x); }
 
-static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { versatileVector v = { .i2 = x }; v.u[0] <<= c; v.u[1] <<= c; return v.i2; }
-static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { versatileVector v = { .i2 = x }; v.u[0] >>= c; v.u[1] >>= c; return v.i2; }
-static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { versatileVector v = { .i2 = x }; v.i[0] >>= c; v.i[1] >>= c; return v.i2; }
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { versatileVector v; v.i2 = x; v.u[0] <<= c; v.u[1] <<= c; return v.i2; }
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { versatileVector v; v.i2 = x; v.u[0] >>= c; v.u[1] >>= c; return v.i2; }
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { versatileVector v; v.i2 = x; v.i[0] >>= c; v.i[1] >>= c; return v.i2; }
 
 static INLINE vopmask visinf_vo_vf (vfloat d) { return (d == SLEEF_INFINITYf || d == -SLEEF_INFINITYf) ? ~(uint32_t)0 : 0; }
 static INLINE vopmask vispinf_vo_vf(vfloat d) { return d == SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }

diff --git a/src/common/misc.h b/src/common/misc.h
@@ -287,13 +287,21 @@ typedef union {
 
 #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
 
+#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
+#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
+#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
+
+#if defined(SLEEF_GENHEADER)
+
+#define INLINE SLEEF_ALWAYS_INLINE
+#define CONST SLEEF_CONST
+#define EXPORT SLEEF_INLINE
+#define NOEXPORT
+
+#else // #if defined(SLEEF_GENHEADER)
+
 #define INLINE __forceinline
 #define CONST
-#define RESTRICT
-#define ALIGNED(x)
-#define LIKELY(condition) (condition)
-#define UNLIKELY(condition) (condition)
-
 #ifndef SLEEF_STATIC_LIBS
 #define EXPORT __declspec(dllexport)
 #define NOEXPORT
@@ -302,6 +310,13 @@ typedef union {
 #define NOEXPORT
 #endif
 
+#endif // #if defined(SLEEF_GENHEADER)
+
+#define RESTRICT
+#define ALIGNED(x)
+#define LIKELY(condition) (condition)
+#define UNLIKELY(condition) (condition)
+
 #if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
 #include <x86intrin.h>
 #endif

diff --git a/src/libm-tester/CMakeLists.txt b/src/libm-tester/CMakeLists.txt
@@ -91,6 +91,17 @@ set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
 add_test_iut(${TARGET_IUT})
 set(IUT_LIST ${TARGET_IUT})
 
+# Compile executable 'iutcuda'
+if (BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
+  add_executable(iutcuda iutcuda.cu)
+  set_target_properties(iutcuda PROPERTIES LINKER_LANGUAGE CUDA)
+  target_compile_options(iutcuda PRIVATE "--fmad=false")
+  target_include_directories(iutcuda PRIVATE ${PROJECT_BINARY_DIR}/inline)
+  add_dependencies(iutcuda ${TARGET_INLINE_HEADERS})
+  add_test_iut(iutcuda)
+  list(APPEND IUT_LIST iutcuda)
+endif()
+
 set(IUT_SRC iutsimd.c iutsimdmain.c testerutil.c)
 
 # Add vector extension `iut`s
@@ -150,7 +161,7 @@ macro(test_extension SIMD)
 	SIMD_SUFFIX=_${LCSIMD}_sleef
 	)
       target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/inline)
-      target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT} ${TARGET_LIBINLINE})
+      target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT})
       add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
       set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
       add_test_iut(${IUTINAME})