From 0c8e021b4d92e3cd89955b13786c32e4b45b17e9 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 9 Jun 2023 03:47:48 -0700 Subject: [PATCH 1/3] This PR adds support to __half and nb_bfloat16 to myAtomicReduce --- cpp/include/raft/util/cuda_utils.cuh | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 0523dcc81c..534f481764 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -79,6 +79,38 @@ DI void myAtomicReduce(float* address, float val, ReduceLambda op) } while (assumed != old); } +template +DI void myAtomicReduce(__half* address, __half val, ReduceLambda op) +{ +#if (__CUDA_ARCH__ >= 530) + unsigned short int* address_as_uint = (unsigned short int*)address; + unsigned short int old = *address_as_uint, assumed; + do { + assumed = old; + old = atomicCAS(address_as_uint, assumed, __half_as_ushort(op(val, __ushort_as_half(assumed)))); + } while (assumed != old); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(__half) != sizeof(__half), "__half is only supported on __CUDA_ARCH__ >= 530"); +#endif +} + +template +DI void myAtomicReduce(nv_bfloat16* address, nv_bfloat16 val, ReduceLambda op) +{ +#if (__CUDA_ARCH__ >= 800) + unsigned short int* address_as_uint = (unsigned short int*)address; + unsigned short int old = *address_as_uint, assumed; + do { + assumed = old; + old = atomicCAS(address_as_uint, assumed, __bfloat16_as_ushort(op(val, __ushort_as_bfloat16(assumed)))); + } while (assumed != old); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(nv_bfloat16) != sizeof(nv_bfloat16), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); +#endif +} + template DI void myAtomicReduce(int* address, int val, ReduceLambda op) { From 3dab53229defa782dac1a618ad0f885dc2525bbb Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 9 Jun 2023 04:09:57 -0700 Subject: [PATCH 2/3] add formating --- cpp/include/raft/util/cuda_utils.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 534f481764..86f29ee02a 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -91,7 +91,8 @@ DI void myAtomicReduce(__half* address, __half val, ReduceLambda op) } while (assumed != old); #else // Fail during template instantiation if the compute capability doesn't support this operation - static_assert(sizeof(__half) != sizeof(__half), "__half is only supported on __CUDA_ARCH__ >= 530"); + static_assert(sizeof(__half) != sizeof(__half), + "__half is only supported on __CUDA_ARCH__ >= 530"); #endif } @@ -103,11 +104,13 @@ DI void myAtomicReduce(nv_bfloat16* address, nv_bfloat16 val, ReduceLambda op) unsigned short int old = *address_as_uint, assumed; do { assumed = old; - old = atomicCAS(address_as_uint, assumed, __bfloat16_as_ushort(op(val, __ushort_as_bfloat16(assumed)))); + old = atomicCAS( + address_as_uint, assumed, __bfloat16_as_ushort(op(val, __ushort_as_bfloat16(assumed)))); } while (assumed != old); #else // Fail during template instantiation if the compute capability doesn't support this operation - static_assert(sizeof(nv_bfloat16) != sizeof(nv_bfloat16), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + static_assert(sizeof(nv_bfloat16) != sizeof(nv_bfloat16), + "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); #endif } From 1cfcd9073ca0d50592b1696f7576606409da4f15 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 9 Jun 2023 06:45:04 -0700 Subject: [PATCH 3/3] fix unsupported arch issue --- cpp/include/raft/util/cuda_utils.cuh | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 86f29ee02a..e718ca3545 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -20,6 +20,11 @@ #include #include +#if defined(_RAFT_HAS_CUDA) +#include +#include +#endif + #include #include #include @@ -79,27 +84,25 @@ DI void myAtomicReduce(float* address, float val, ReduceLambda op) } while (assumed != old); } +// Needed for atomicCas on ushort +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) template DI void myAtomicReduce(__half* address, __half val, ReduceLambda op) { -#if (__CUDA_ARCH__ >= 530) unsigned short int* address_as_uint = (unsigned short int*)address; unsigned short int old = *address_as_uint, assumed; do { assumed = old; old = atomicCAS(address_as_uint, assumed, __half_as_ushort(op(val, __ushort_as_half(assumed)))); } while (assumed != old); -#else - // Fail during template instantiation if the compute capability doesn't support this operation - static_assert(sizeof(__half) != sizeof(__half), - "__half is only supported on __CUDA_ARCH__ >= 530"); -#endif } +#endif +// Needed for nv_bfloat16 support +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) template DI void myAtomicReduce(nv_bfloat16* address, nv_bfloat16 val, ReduceLambda op) { -#if (__CUDA_ARCH__ >= 800) unsigned short int* address_as_uint = (unsigned short int*)address; unsigned short int old = *address_as_uint, assumed; do { @@ -107,12 +110,8 @@ DI void myAtomicReduce(nv_bfloat16* address, nv_bfloat16 val, ReduceLambda op) old = atomicCAS( address_as_uint, assumed, __bfloat16_as_ushort(op(val, __ushort_as_bfloat16(assumed)))); } while (assumed != old); -#else - // Fail during template instantiation if the compute capability doesn't support this operation - static_assert(sizeof(nv_bfloat16) != sizeof(nv_bfloat16), - "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); -#endif } +#endif template DI void myAtomicReduce(int* address, int val, ReduceLambda op)