Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cuda support #337

Merged
merged 12 commits into from
Sep 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ option(ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)

option(DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF)

option(ENABLE_CUDA "Enable CUDA" OFF)

cmake_minimum_required(VERSION 3.4.3)

# Set to NEW when updating cmake_minimum_required to VERSION >= 3.7.2
Expand All @@ -36,14 +38,20 @@ endif()
enable_testing()

set(SLEEF_VERSION_MAJOR 3)
set(SLEEF_VERSION_MINOR 5)
set(SLEEF_VERSION_PATCHLEVEL 1)
set(SLEEF_VERSION_MINOR 6)
set(SLEEF_VERSION_PATCHLEVEL 0)
set(SLEEF_VERSION ${SLEEF_VERSION_MAJOR}.${SLEEF_VERSION_MINOR}.${SLEEF_VERSION_PATCHLEVEL})
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})

project(SLEEF
VERSION ${SLEEF_VERSION}
LANGUAGES C)
if (ENABLE_CUDA)
project(SLEEF
VERSION ${SLEEF_VERSION}
LANGUAGES C CUDA)
else()
project(SLEEF
VERSION ${SLEEF_VERSION}
LANGUAGES C)
endif()

# For specifying installation directories
include(GNUInstallDirs)
Expand Down
24 changes: 17 additions & 7 deletions Configure.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include(CheckCCompilerFlag)
include(CheckCSourceCompiles)
include(CheckTypeSize)
include(CheckLanguage)

if (BUILD_STATIC_TEST_BINS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
Expand Down Expand Up @@ -264,21 +265,22 @@ command_arguments(RENAME_PARAMS_NEON32 cinz_ 2 4 neon)
command_arguments(RENAME_PARAMS_NEON32VFPV4 finz_ 2 4 neonvfpv4)
command_arguments(RENAME_PARAMS_VSX finz_ 2 4 vsx)
command_arguments(RENAME_PARAMS_VSXNOFMA cinz_ 2 4 vsxnofma)
command_arguments(RENAME_PARAMS_ZVECTOR2 finz_ 2 4 zvector2)
command_arguments(RENAME_PARAMS_ZVECTOR2NOFMA cinz_ 2 4 zvector2nofma)
command_arguments(RENAME_PARAMS_ZVECTOR2 finz_ 2 4 zvector2)
command_arguments(RENAME_PARAMS_ZVECTOR2NOFMA cinz_ 2 4 zvector2nofma)
command_arguments(RENAME_PARAMS_PUREC_SCALAR cinz_ 1 1 purec)
command_arguments(RENAME_PARAMS_PURECFMA_SCALAR finz_ 1 1 purecfma)
command_arguments(RENAME_PARAMS_CUDA finz_ 1 1 cuda)
# The vector length parameters in SVE, for SP and DP, are chosen for
# the smallest SVE vector size (128-bit). The name is generated using
# the "x" token of VLA SVE vector functions.
command_arguments(RENAME_PARAMS_SVE finz_ x x sve)
command_arguments(RENAME_PARAMS_SVENOFMA cinz_ x x svenofma)

command_arguments(RENAME_PARAMS_GNUABI_SSE2 sse2 b 2 4 _mm128d _mm128 _mm128i _mm128i __SSE2__)
command_arguments(RENAME_PARAMS_GNUABI_AVX avx c 4 8 __m256d __m256 __m128i "struct { __m128i x, y$<SEMICOLON> }" __AVX__)
command_arguments(RENAME_PARAMS_GNUABI_AVX2 avx2 d 4 8 __m256d __m256 __m128i __m256i __AVX2__)
command_arguments(RENAME_PARAMS_GNUABI_AVX512F avx512f e 8 16 __m512d __m512 __m256i __m512i __AVX512F__)
command_arguments(RENAME_PARAMS_GNUABI_ADVSIMD advsimd n 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON)
command_arguments(RENAME_PARAMS_GNUABI_SSE2 sse2 b 2 4 _mm128d _mm128 _mm128i _mm128i __SSE2__)
command_arguments(RENAME_PARAMS_GNUABI_AVX avx c 4 8 __m256d __m256 __m128i "struct { __m128i x, y$<SEMICOLON> }" __AVX__)
command_arguments(RENAME_PARAMS_GNUABI_AVX2 avx2 d 4 8 __m256d __m256 __m128i __m256i __AVX2__)
command_arguments(RENAME_PARAMS_GNUABI_AVX512F avx512f e 8 16 __m512d __m512 __m256i __m512i __AVX512F__)
command_arguments(RENAME_PARAMS_GNUABI_ADVSIMD advsimd n 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON)
# The vector length parameters in SVE, for SP and DP, are chosen for
# the smallest SVE vector size (128-bit). The name is generated using
# the "x" token of VLA SVE vector functions.
Expand Down Expand Up @@ -714,6 +716,14 @@ if (ENFORCE_ZVECTOR2 AND NOT COMPILER_SUPPORTS_ZVECTOR2)
message(FATAL_ERROR "ENFORCE_ZVECTOR2 is specified and that feature is disabled or not supported by the compiler")
endif()

# CUDA

option(ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)

if (ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler")
endif()

# OpenMP

option(DISABLE_OPENMP "Disable OPENMP" OFF)
Expand Down
6 changes: 3 additions & 3 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ build_script:
- if "%DO_TEST%" == "TRUE" call p.bat
- mkdir build
- cd build
- cmake -G"Visual Studio 16 2019" .. -DRUNNING_ON_APPVEYOR=TRUE -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE -DBUILD_INLINE_HEADERS=TRUE -DENFORCE_SSE2=TRUE -DENFORCE_SSE4=TRUE -DENFORCE_AVX=TRUE -DENFORCE_FMA4=TRUE -DENFORCE_AVX2=TRUE -DENFORCE_AVX512F=TRUE %ENV_BUILD_STATIC%
- cmake -G"Visual Studio 16 2019" .. -DRUNNING_ON_APPVEYOR=TRUE -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE -DBUILD_QUAD=TRUE -DBUILD_INLINE_HEADERS=TRUE -DENFORCE_SSE2=TRUE -DENFORCE_SSE4=TRUE -DENFORCE_AVX=TRUE -DENFORCE_FMA4=TRUE -DENFORCE_AVX2=TRUE -DENFORCE_AVX512F=TRUE %ENV_BUILD_STATIC%
- cmake --build . --target install --config Release
- if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 4 -C Release)
- if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 16 -C Release)
- cd "%BUILDFOLDER%"
- echo PATH %ORGPATH%;c:\Cygwin64\bin;c:\Cygwin64\usr\bin;%CD%\build-cygwin\bin;%CD%\build-clang\bin > q.bat
- powershell -Command "(gc q.bat) -replace ' ;', ';' | Out-File -encoding ASCII p.bat"
Expand All @@ -42,7 +42,7 @@ build_script:
- cmake -G Ninja .. -DRUNNING_ON_APPVEYOR=TRUE -DCMAKE_C_COMPILER:PATH="C:\Program Files\LLVM\bin\clang.exe" -DCMAKE_INSTALL_PREFIX=install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_SHOW_ERROR_LOG=1 -DENFORCE_TESTER3=TRUE -DBUILD_INLINE_HEADERS=TRUE -DBUILD_QUAD=TRUE -DENFORCE_SSE2=TRUE -DENFORCE_SSE4=TRUE -DENFORCE_AVX=TRUE -DENFORCE_FMA4=TRUE -DENFORCE_AVX2=TRUE -DENFORCE_AVX512F=TRUE %ENV_BUILD_STATIC%
- ninja
test_script:
- if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 4 -C Release)
- if "%DO_TEST%" == "TRUE" (ctest --output-on-failure -j 16 -C Release)
artifacts:
- path: build\install\**\*
name: SLEEFWindowsx64
31 changes: 16 additions & 15 deletions src/arch/helperpurec_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@
#define ENABLE_SP
//@#define ENABLE_SP

#if CONFIG == 2
#if CONFIG == 2 || CONFIG == 3
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP

#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__)
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) || CONFIG == 3
#ifndef FP_FAST_FMA
#define FP_FAST_FMA
//@#define FP_FAST_FMA
Expand All @@ -68,11 +68,12 @@
#if (!defined(FP_FAST_FMA) || !defined(FP_FAST_FMAF)) && !defined(SLEEF_GENHEADER)
#error FP_FAST_FMA or FP_FAST_FMAF not defined
#endif

#define ISANAME "Pure C scalar with FMA"

#else // #if CONFIG == 2
#else // #if CONFIG == 2 || CONFIG == 3
#define ISANAME "Pure C scalar"
#endif // #if CONFIG == 2
#endif // #if CONFIG == 2 || CONFIG == 3

#define LOG2VECTLENDP 0
//@#define LOG2VECTLENDP 0
Expand All @@ -86,7 +87,7 @@
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#if defined(__SSE4_1__) || defined(__aarch64__)
#if defined(__SSE4_1__) || defined(__aarch64__) || CONFIG == 3
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#endif
Expand Down Expand Up @@ -191,7 +192,7 @@ static INLINE vint vtruncate_vi_vd(vdouble vd) { return (int32_t)TRUNC(vd); }
#else
static INLINE vint vrint_vi_vd(vdouble a) {
a += a > 0 ? 0.5 : -0.5;
versatileVector v = { .d = a }; v.x -= 1 & (int)a;
versatileVector v; v.d = a; v.x -= 1 & (int)a;
return (int32_t)v.d;
}
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
Expand All @@ -215,7 +216,7 @@ static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }

static INLINE vdouble vabs_vd_vd(vdouble d) { versatileVector v = { .d = d }; v.x &= 0x7fffffffffffffffULL; return v.d; }
static INLINE vdouble vabs_vd_vd(vdouble d) { versatileVector v; v.d = d; v.x &= 0x7fffffffffffffffULL; return v.d; }
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }

static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return x > y ? x : y; }
Expand Down Expand Up @@ -298,7 +299,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (int32_t)TRUNCF(vf); }
#else
static INLINE vint2 vrint_vi2_vf(vfloat a) {
a += a > 0 ? 0.5f : -0.5f;
versatileVector v = { .f = a }; v.u[0] -= 1 & (int)a;
versatileVector v; v.f = a; v.u[0] -= 1 & (int)a;
return (int32_t)v.f;
}
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
Expand All @@ -323,7 +324,7 @@ static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
static INLINE vfloat vrec_vf_vf (vfloat x) { return 1 / x; }

static INLINE vfloat vabs_vf_vf(vfloat x) { versatileVector v = { .f = x }; v.i[0] &= 0x7fffffff; return v.f; }
static INLINE vfloat vabs_vf_vf(vfloat x) { versatileVector v; v.f = x; v.i[0] &= 0x7fffffff; return v.f; }
static INLINE vfloat vneg_vf_vf(vfloat x) { return -x; }

static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return x > y ? x : y; }
Expand Down Expand Up @@ -351,9 +352,9 @@ static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return x <= y ? ~(uint
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return x > y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return x >= y ? ~(uint32_t)0 : 0; }

static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v = { .i2 = x }, w = { .i2 = y }; v.i[0] += w.i[0]; v.i[1] += w.i[1]; return v.i2; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v = { .i2 = x }, w = { .i2 = y }; v.i[0] -= w.i[0]; v.i[1] -= w.i[1]; return v.i2; }
static INLINE vint2 vneg_vi2_vi2(vint2 x) { versatileVector v = { .i2 = x }; v.i[0] = -v.i[0]; v.i[1] = -v.i[1]; return v.i2; }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v, w; v.i2 = x; w.i2 = y; v.i[0] += w.i[0]; v.i[1] += w.i[1]; return v.i2; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { versatileVector v, w; v.i2 = x; w.i2 = y; v.i[0] -= w.i[0]; v.i[1] -= w.i[1]; return v.i2; }
static INLINE vint2 vneg_vi2_vi2(vint2 x) { versatileVector v; v.i2 = x; v.i[0] = -v.i[0]; v.i[1] = -v.i[1]; return v.i2; }

static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
Expand All @@ -374,9 +375,9 @@ static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vcast_vm_vo(x) & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~vcast_vm_vo(x); }

static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { versatileVector v = { .i2 = x }; v.u[0] <<= c; v.u[1] <<= c; return v.i2; }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { versatileVector v = { .i2 = x }; v.u[0] >>= c; v.u[1] >>= c; return v.i2; }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { versatileVector v = { .i2 = x }; v.i[0] >>= c; v.i[1] >>= c; return v.i2; }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { versatileVector v; v.i2 = x; v.u[0] <<= c; v.u[1] <<= c; return v.i2; }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { versatileVector v; v.i2 = x; v.u[0] >>= c; v.u[1] >>= c; return v.i2; }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { versatileVector v; v.i2 = x; v.i[0] >>= c; v.i[1] >>= c; return v.i2; }

static INLINE vopmask visinf_vo_vf (vfloat d) { return (d == SLEEF_INFINITYf || d == -SLEEF_INFINITYf) ? ~(uint32_t)0 : 0; }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return d == SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
Expand Down
25 changes: 20 additions & 5 deletions src/common/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,13 +287,21 @@ typedef union {

#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)

#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'

#if defined(SLEEF_GENHEADER)

#define INLINE SLEEF_ALWAYS_INLINE
#define CONST SLEEF_CONST
#define EXPORT SLEEF_INLINE
#define NOEXPORT

#else // #if defined(SLEEF_GENHEADER)

#define INLINE __forceinline
#define CONST
#define RESTRICT
#define ALIGNED(x)
#define LIKELY(condition) (condition)
#define UNLIKELY(condition) (condition)

#ifndef SLEEF_STATIC_LIBS
#define EXPORT __declspec(dllexport)
#define NOEXPORT
Expand All @@ -302,6 +310,13 @@ typedef union {
#define NOEXPORT
#endif

#endif // #if defined(SLEEF_GENHEADER)

#define RESTRICT
#define ALIGNED(x)
#define LIKELY(condition) (condition)
#define UNLIKELY(condition) (condition)

#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
#include <x86intrin.h>
#endif
Expand Down
13 changes: 12 additions & 1 deletion src/libm-tester/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
add_test_iut(${TARGET_IUT})
set(IUT_LIST ${TARGET_IUT})

# Compile executable 'iutcuda'
if (BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
add_executable(iutcuda iutcuda.cu)
set_target_properties(iutcuda PROPERTIES LINKER_LANGUAGE CUDA)
target_compile_options(iutcuda PRIVATE "--fmad=false")
target_include_directories(iutcuda PRIVATE ${PROJECT_BINARY_DIR}/inline)
add_dependencies(iutcuda ${TARGET_INLINE_HEADERS})
add_test_iut(iutcuda)
list(APPEND IUT_LIST iutcuda)
endif()

set(IUT_SRC iutsimd.c iutsimdmain.c testerutil.c)

# Add vector extension `iut`s
Expand Down Expand Up @@ -150,7 +161,7 @@ macro(test_extension SIMD)
SIMD_SUFFIX=_${LCSIMD}_sleef
)
target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/inline)
target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT} ${TARGET_LIBINLINE})
target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT})
add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
add_test_iut(${IUTINAME})
Expand Down
Loading