From 5ff4b16db0cdebd250fc637eb209927dec04b4cb Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Tue, 6 Dec 2022 12:40:31 +0000 Subject: [PATCH] SWDEV-372032 - Support cooperative groups header with HIPRTC Change-Id: Ia64652fd52b210a4eeee9d38fdfdfa6fdc7163ff --- .../amd_detail/amd_hip_cooperative_groups.h | 57 ++++++++++++------- .../hip_cooperative_groups_helper.h | 13 ++--- src/hiprtc/CMakeLists.txt | 4 +- src/hiprtc/cmake/HIPRTC.cmake | 4 ++ 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/include/hip/amd_detail/amd_hip_cooperative_groups.h b/include/hip/amd_detail/amd_hip_cooperative_groups.h index 03aa4d32..747f65a4 100644 --- a/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -33,7 +33,22 @@ THE SOFTWARE. #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H #if __cplusplus +#if !defined(__HIPCC_RTC__) #include +#endif + +#define __hip_abort() \ + { asm("trap;"); } +#if defined(NDEBUG) +#define __hip_assert(COND) +#else +#define __hip_assert(COND) \ + { \ + if (!COND) { \ + __hip_abort(); \ + } \ + } +#endif namespace cooperative_groups { @@ -183,8 +198,8 @@ class thread_block : public thread_group { __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const { const bool pow2 = ((tile_size & (tile_size - 1)) == 0); // Invalid tile size, assert - if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) { - assert(false && "invalid tile size"); + if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { + __hip_assert(false && "invalid tile size"); } thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size); @@ -230,8 +245,8 @@ class tiled_group : public thread_group { __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const { const bool pow2 = ((tile_size & (tile_size - 1)) == 0); - if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) { - assert(false && "invalid tile size"); + if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { + __hip_assert(false && "invalid tile size"); } if (size() <= tile_size) { @@ -286,7 +301,7 @@ class coalesced_group : public thread_group { if (coalesced_info.tiled_info.is_tiled) { unsigned int base_offset = (thread_rank() & (~(tile_size - 1))); unsigned int masklength = min((unsigned int)size() - base_offset, tile_size); - lane_mask member_mask = (lane_mask)(-1) >> (WAVEFRONT_SIZE - masklength); + lane_mask member_mask = (lane_mask)(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength); member_mask <<= (__lane_id() & ~(tile_size - 1)); coalesced_group coalesced_tile = coalesced_group(member_mask); @@ -299,7 +314,7 @@ class coalesced_group : public thread_group { unsigned int tile_rank = 0; int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size; - for (unsigned int i = 0; i < WAVEFRONT_SIZE; i++) { + for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) { lane_mask active = coalesced_info.member_mask & (1 << i); // Make sure the lane is active if (active) { @@ -345,11 +360,11 @@ class coalesced_group : public thread_group { srcRank = srcRank % size(); - int lane = (size() == WAVEFRONT_SIZE) ? srcRank - : (WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) + int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank + : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); - return __shfl(var, lane, WAVEFRONT_SIZE); + return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); } template @@ -360,12 +375,12 @@ class coalesced_group : public thread_group { // and WARP_SIZE as the shift value rather than lane_delta itself. // This is not described in the documentation and is not done here. - if (size() == WAVEFRONT_SIZE) { - return __shfl_down(var, lane_delta, WAVEFRONT_SIZE); + if (size() == __AMDGCN_WAVEFRONT_SIZE) { + return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE); } int lane; - if (WAVEFRONT_SIZE == 64) { + if (__AMDGCN_WAVEFRONT_SIZE == 64) { lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1); } else { @@ -376,7 +391,7 @@ class coalesced_group : public thread_group { lane = __lane_id(); } - return __shfl(var, lane, WAVEFRONT_SIZE); + return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); } template @@ -387,15 +402,15 @@ class coalesced_group : public thread_group { // and WARP_SIZE as the shift value rather than lane_delta itself. // This is not described in the documentation and is not done here. - if (size() == WAVEFRONT_SIZE) { - return __shfl_up(var, lane_delta, WAVEFRONT_SIZE); + if (size() == __AMDGCN_WAVEFRONT_SIZE) { + return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE); } int lane; - if (WAVEFRONT_SIZE == 64) { + if (__AMDGCN_WAVEFRONT_SIZE == 64) { lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); } - else if (WAVEFRONT_SIZE == 32) { + else if (__AMDGCN_WAVEFRONT_SIZE == 32) { lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); } @@ -403,7 +418,7 @@ class coalesced_group : public thread_group { lane = __lane_id(); } - return __shfl(var, lane, WAVEFRONT_SIZE); + return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); } }; @@ -437,7 +452,7 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const { return (static_cast(this)->thread_rank()); } default: { - assert(false && "invalid cooperative group type"); + __hip_assert(false && "invalid cooperative group type"); return -1; } } @@ -461,7 +476,7 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const { return (static_cast(this)->is_valid()); } default: { - assert(false && "invalid cooperative group type"); + __hip_assert(false && "invalid cooperative group type"); return false; } } @@ -490,7 +505,7 @@ __CG_QUALIFIER__ void thread_group::sync() const { break; } default: { - assert(false && "invalid cooperative group type"); + __hip_assert(false && "invalid cooperative group type"); } } } diff --git a/include/hip/amd_detail/hip_cooperative_groups_helper.h b/include/hip/amd_detail/hip_cooperative_groups_helper.h index e8610e22..a90f0a3a 100644 --- a/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -32,8 +32,9 @@ THE SOFTWARE. #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H #if __cplusplus +#if !defined(__HIPCC_RTC__) #include -#include +#endif #if !defined(__align__) #define __align__(x) __attribute__((aligned(x))) #endif @@ -50,12 +51,9 @@ THE SOFTWARE. #define _CG_STATIC_CONST_DECL_ static constexpr #endif -#if !defined(WAVEFRONT_SIZE) -#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__ -#define WAVEFRONT_SIZE 32 +#if __AMDGCN_WAVEFRONT_SIZE == 32 using lane_mask = unsigned int; #else -#define WAVEFRONT_SIZE 64 using lane_mask = unsigned long long int; #endif @@ -66,7 +64,7 @@ template using is_power_of_2 = std::integral_constant; template -using is_valid_wavefront = std::integral_constant; +using is_valid_wavefront = std::integral_constant; template using is_valid_tile_size = @@ -191,7 +189,7 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, " // have i-th bit of x set and come before the current thread. __device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { int counter=0; - #if WAVEFRONT_SIZE == 32 + #if __AMDGCN_WAVEFRONT_SIZE == 32 counter = __builtin_amdgcn_mbcnt_lo(x, add); #else counter = __builtin_amdgcn_mbcnt_lo(static_cast(x), add); @@ -210,4 +208,3 @@ __device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { #endif // __cplusplus #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H -#endif diff --git a/src/hiprtc/CMakeLists.txt b/src/hiprtc/CMakeLists.txt index 33aa1c0c..e8cc47f8 100644 --- a/src/hiprtc/CMakeLists.txt +++ b/src/hiprtc/CMakeLists.txt @@ -130,6 +130,8 @@ set(HIPRTC_GEN_MCIN "${HIPRTC_GEN_DIR}/hipRTC_header.mcin") set(HIPRTC_GEN_PREPROCESSED "${HIPRTC_GEN_DIR}/hipRTC") set(HIPRTC_GEN_OBJ "${HIPRTC_GEN_DIR}/hipRTC_header${CMAKE_CXX_OUTPUT_EXTENSION}") set(HIPRTC_WARP_FUNCS "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_warp_functions.h") +set(HIPRTC_COOP_GROUPS "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups.h") +set(HIPRTC_COOP_GRPS_HELPER "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/hip_cooperative_groups_helper.h") # Generate required HIPRTC files. FILE(MAKE_DIRECTORY ${HIPRTC_GEN_DIR}) @@ -142,7 +144,7 @@ generate_hiprtc_mcin("${HIPRTC_GEN_MCIN}" "${HIPRTC_GEN_PREPROCESSED}") add_custom_command( OUTPUT ${HIPRTC_GEN_PREPROCESSED} COMMAND $ -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=3.6 -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -x hip ${HIPRTC_GEN_HEADER} -E -o ${HIPRTC_GEN_PREPROCESSED} - COMMAND ${CMAKE_COMMAND} -DHIPRTC_ADD_MACROS=1 -DHIPRTC_WARP_HEADER_FILE=${HIPRTC_WARP_FUNCS} -DHIPRTC_PREPROCESSED_FILE=${HIPRTC_GEN_PREPROCESSED} -P ${HIPRTC_CMAKE} + COMMAND ${CMAKE_COMMAND} -DHIPRTC_ADD_MACROS=1 -DHIPRTC_WARP_HEADER_FILE=${HIPRTC_WARP_FUNCS} -DHIPRTC_COOP_HEADER_FILE=${HIPRTC_COOP_GROUPS} -DHIPRTC_COOP_HELPER_FILE=${HIPRTC_COOP_GRPS_HELPER} -DHIPRTC_PREPROCESSED_FILE=${HIPRTC_GEN_PREPROCESSED} -P ${HIPRTC_CMAKE} DEPENDS clang ${HIPRTC_GEN_HEADER}) add_custom_command( OUTPUT ${HIPRTC_GEN_OBJ} diff --git a/src/hiprtc/cmake/HIPRTC.cmake b/src/hiprtc/cmake/HIPRTC.cmake index 02a9cc14..41e02b5f 100644 --- a/src/hiprtc/cmake/HIPRTC.cmake +++ b/src/hiprtc/cmake/HIPRTC.cmake @@ -63,6 +63,10 @@ if(HIPRTC_ADD_MACROS) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_DEFINES}") FILE(READ "${HIPRTC_WARP_HEADER_FILE}" HIPRTC_WARP_HEADER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_WARP_HEADER}") + FILE(READ "${HIPRTC_COOP_HELPER_FILE}" HIPRTC_COOP_HELPER) + FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HELPER}") + FILE(READ "${HIPRTC_COOP_HEADER_FILE}" HIPRTC_COOP_HEADER) + FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HEADER}") endif() macro(generate_hiprtc_header HiprtcHeader)