Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/trws/clang cuda #1350

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions include/RAJA/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,21 @@ namespace RAJA {

#if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
#define RAJA_CUDA_ACTIVE

// Compiler numeric identifiers
#define RAJA_CUDA_COMPILER_NVCC -1
#define RAJA_CUDA_COMPILER_CLANG 1
#define RAJA_CUDA_COMPILER_NVCXX 2

#if defined(__NVCOMPILER_CUDA__) // it's NVC++
#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_NVCXX
#elif defined(__NVCC__) // it's NVCC
#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_NVCC
#elif defined(__clang__) && defined(__CUDA__) // clang cuda
#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_CLANG
#else
#error Unknown compiler is claiming to be a CUDA compiler
#endif
#endif // RAJA_ENABLE_CUDA && __CUDACC__

#if defined(RAJA_ENABLE_HIP) && defined(__HIPCC__)
Expand Down
51 changes: 50 additions & 1 deletion include/RAJA/policy/cuda/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,47 @@
#include "RAJA/policy/cuda/policy.hpp"
#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"

#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
__Mask, __Type) \
inline __device__ int __FnName(unsigned int __mask, int __val, \
__Type __offset, int __width = warpSize) { \
return __IntIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
__Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \
}

__MAKE_SYNC_SHUFFLES(__shfl_sync_fixed, __nvvm_shfl_sync_idx_i32,
__nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync_fixed, __nvvm_shfl_sync_bfly_i32,
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
#endif

namespace RAJA
{

Expand Down Expand Up @@ -243,7 +284,11 @@ RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var, int la
template <>
RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(unsigned long long var, int laneMask)
{
#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
return ::__shfl_xor_sync_fixed(0xffffffffu, var, laneMask);
#else
return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
#endif
}

template <>
Expand Down Expand Up @@ -310,7 +355,11 @@ RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var, int srcLan
template <>
RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(unsigned long long var, int srcLane)
{
#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
return ::__shfl_sync_fixed(0xffffffffu, var, srcLane);
#else
return ::__shfl_sync(0xffffffffu, var, srcLane);
#endif
}

template <>
Expand Down Expand Up @@ -389,7 +438,7 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
T temp = val;

for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) {
T rhs = __shfl_xor_sync(0xffffffff, temp, i);
T rhs = shfl_xor_sync(temp, i);
Combiner{}(temp, rhs);
}

Expand Down
18 changes: 13 additions & 5 deletions test/integration/plugin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,21 @@ foreach( BACKEND ${PLUGIN_BACKENDS} )

configure_file( test-plugin-workgroup.cpp.in
test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp )
if(${BACKEND} STREQUAL "Hip")
raja_add_test( NAME test-plugin-workgroup-Known-Hip-Failure-${DISPATCHER}-${BACKEND}
SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp
plugin_to_test.cpp )

raja_add_test( NAME test-plugin-workgroup-${DISPATCHER}-${BACKEND}
SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp
plugin_to_test.cpp )
target_include_directories(test-plugin-workgroup-Known-Hip-Failure-${DISPATCHER}-${BACKEND}.exe
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
else()
raja_add_test( NAME test-plugin-workgroup-${DISPATCHER}-${BACKEND}
SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp
plugin_to_test.cpp )

target_include_directories(test-plugin-workgroup-${DISPATCHER}-${BACKEND}.exe
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
target_include_directories(test-plugin-workgroup-${DISPATCHER}-${BACKEND}.exe
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
endif()

endforeach()
endforeach()
Expand Down
3 changes: 2 additions & 1 deletion test/unit/algorithm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ macro(RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS SORT_BACKEND_in SORT_SIZE_in UTIL_
test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp )

#Some tests are known to fail for Hip, mark those tests (Will not be run in Gitlab CI)
if(${SORT_BACKEND} STREQUAL "Hip" AND (${UTIL_SORT} STREQUAL "Heap" OR
if((${SORT_BACKEND} STREQUAL "Hip" OR RAJA_ENABLE_CLANG_CUDA)
AND (${UTIL_SORT} STREQUAL "Heap" OR
${UTIL_SORT} STREQUAL "Insertion" OR
${UTIL_SORT} STREQUAL "Intro"))
raja_add_test( NAME test-algorithm-util-sort-Known-Hip-Failure-${UTIL_SORT}-${SORT_BACKEND}
Expand Down