Skip to content
This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Commit

Permalink
SWDEV-372032 - Support cooperative groups header with HIPRTC
Browse files Browse the repository at this point in the history
Change-Id: Ia64652fd52b210a4eeee9d38fdfdfa6fdc7163ff
  • Loading branch information
satyanveshd committed Dec 9, 2022
1 parent 574a582 commit 5ff4b16
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 30 deletions.
57 changes: 36 additions & 21 deletions include/hip/amd_detail/amd_hip_cooperative_groups.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,22 @@ THE SOFTWARE.
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
#endif

#define __hip_abort() \
{ asm("trap;"); }
#if defined(NDEBUG)
#define __hip_assert(COND)
#else
#define __hip_assert(COND) \
{ \
if (!COND) { \
__hip_abort(); \
} \
}
#endif

namespace cooperative_groups {

Expand Down Expand Up @@ -183,8 +198,8 @@ class thread_block : public thread_group {
__CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
// Invalid tile size, assert
if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) {
assert(false && "invalid tile size");
if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size");
}

thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
Expand Down Expand Up @@ -230,8 +245,8 @@ class tiled_group : public thread_group {
__CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) {
assert(false && "invalid tile size");
if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size");
}

if (size() <= tile_size) {
Expand Down Expand Up @@ -286,7 +301,7 @@ class coalesced_group : public thread_group {
if (coalesced_info.tiled_info.is_tiled) {
unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
unsigned int masklength = min((unsigned int)size() - base_offset, tile_size);
lane_mask member_mask = (lane_mask)(-1) >> (WAVEFRONT_SIZE - masklength);
lane_mask member_mask = (lane_mask)(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);

member_mask <<= (__lane_id() & ~(tile_size - 1));
coalesced_group coalesced_tile = coalesced_group(member_mask);
Expand All @@ -299,7 +314,7 @@ class coalesced_group : public thread_group {
unsigned int tile_rank = 0;
int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

for (unsigned int i = 0; i < WAVEFRONT_SIZE; i++) {
for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
lane_mask active = coalesced_info.member_mask & (1 << i);
// Make sure the lane is active
if (active) {
Expand Down Expand Up @@ -345,11 +360,11 @@ class coalesced_group : public thread_group {

srcRank = srcRank % size();

int lane = (size() == WAVEFRONT_SIZE) ? srcRank
: (WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
: (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

return __shfl(var, lane, WAVEFRONT_SIZE);
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
}

template <class T>
Expand All @@ -360,12 +375,12 @@ class coalesced_group : public thread_group {
// and WARP_SIZE as the shift value rather than lane_delta itself.
// This is not described in the documentation and is not done here.

if (size() == WAVEFRONT_SIZE) {
return __shfl_down(var, lane_delta, WAVEFRONT_SIZE);
if (size() == __AMDGCN_WAVEFRONT_SIZE) {
return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
}

int lane;
if (WAVEFRONT_SIZE == 64) {
if (__AMDGCN_WAVEFRONT_SIZE == 64) {
lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
}
else {
Expand All @@ -376,7 +391,7 @@ class coalesced_group : public thread_group {
lane = __lane_id();
}

return __shfl(var, lane, WAVEFRONT_SIZE);
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
}

template <class T>
Expand All @@ -387,23 +402,23 @@ class coalesced_group : public thread_group {
// and WARP_SIZE as the shift value rather than lane_delta itself.
// This is not described in the documentation and is not done here.

if (size() == WAVEFRONT_SIZE) {
return __shfl_up(var, lane_delta, WAVEFRONT_SIZE);
if (size() == __AMDGCN_WAVEFRONT_SIZE) {
return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
}

int lane;
if (WAVEFRONT_SIZE == 64) {
if (__AMDGCN_WAVEFRONT_SIZE == 64) {
lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
}
else if (WAVEFRONT_SIZE == 32) {
else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
}

if (lane == -1) {
lane = __lane_id();
}

return __shfl(var, lane, WAVEFRONT_SIZE);
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
}
};

Expand Down Expand Up @@ -437,7 +452,7 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
return (static_cast<const coalesced_group*>(this)->thread_rank());
}
default: {
assert(false && "invalid cooperative group type");
__hip_assert(false && "invalid cooperative group type");
return -1;
}
}
Expand All @@ -461,7 +476,7 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const {
return (static_cast<const coalesced_group*>(this)->is_valid());
}
default: {
assert(false && "invalid cooperative group type");
__hip_assert(false && "invalid cooperative group type");
return false;
}
}
Expand Down Expand Up @@ -490,7 +505,7 @@ __CG_QUALIFIER__ void thread_group::sync() const {
break;
}
default: {
assert(false && "invalid cooperative group type");
__hip_assert(false && "invalid cooperative group type");
}
}
}
Expand Down
13 changes: 5 additions & 8 deletions include/hip/amd_detail/hip_cooperative_groups_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ THE SOFTWARE.
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_device_functions.h>
#include <bitset>
#endif
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
Expand All @@ -50,12 +51,9 @@ THE SOFTWARE.
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif

#if !defined(WAVEFRONT_SIZE)
#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
#define WAVEFRONT_SIZE 32
#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
#define WAVEFRONT_SIZE 64
using lane_mask = unsigned long long int;
#endif

Expand All @@ -66,7 +64,7 @@ template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;

template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= WAVEFRONT_SIZE)>;
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;

template <unsigned int size>
using is_valid_tile_size =
Expand Down Expand Up @@ -191,7 +189,7 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "
// have i-th bit of x set and come before the current thread.
__device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
int counter=0;
#if WAVEFRONT_SIZE == 32
#if __AMDGCN_WAVEFRONT_SIZE == 32
counter = __builtin_amdgcn_mbcnt_lo(x, add);
#else
counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
Expand All @@ -210,4 +208,3 @@ __device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {

#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#endif
4 changes: 3 additions & 1 deletion src/hiprtc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ set(HIPRTC_GEN_MCIN "${HIPRTC_GEN_DIR}/hipRTC_header.mcin")
set(HIPRTC_GEN_PREPROCESSED "${HIPRTC_GEN_DIR}/hipRTC")
set(HIPRTC_GEN_OBJ "${HIPRTC_GEN_DIR}/hipRTC_header${CMAKE_CXX_OUTPUT_EXTENSION}")
set(HIPRTC_WARP_FUNCS "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_warp_functions.h")
set(HIPRTC_COOP_GROUPS "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups.h")
set(HIPRTC_COOP_GRPS_HELPER "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/hip_cooperative_groups_helper.h")

# Generate required HIPRTC files.
FILE(MAKE_DIRECTORY ${HIPRTC_GEN_DIR})
Expand All @@ -142,7 +144,7 @@ generate_hiprtc_mcin("${HIPRTC_GEN_MCIN}" "${HIPRTC_GEN_PREPROCESSED}")
add_custom_command(
OUTPUT ${HIPRTC_GEN_PREPROCESSED}
COMMAND $<TARGET_FILE:clang> -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=3.6 -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -x hip ${HIPRTC_GEN_HEADER} -E -o ${HIPRTC_GEN_PREPROCESSED}
COMMAND ${CMAKE_COMMAND} -DHIPRTC_ADD_MACROS=1 -DHIPRTC_WARP_HEADER_FILE=${HIPRTC_WARP_FUNCS} -DHIPRTC_PREPROCESSED_FILE=${HIPRTC_GEN_PREPROCESSED} -P ${HIPRTC_CMAKE}
COMMAND ${CMAKE_COMMAND} -DHIPRTC_ADD_MACROS=1 -DHIPRTC_WARP_HEADER_FILE=${HIPRTC_WARP_FUNCS} -DHIPRTC_COOP_HEADER_FILE=${HIPRTC_COOP_GROUPS} -DHIPRTC_COOP_HELPER_FILE=${HIPRTC_COOP_GRPS_HELPER} -DHIPRTC_PREPROCESSED_FILE=${HIPRTC_GEN_PREPROCESSED} -P ${HIPRTC_CMAKE}
DEPENDS clang ${HIPRTC_GEN_HEADER})
add_custom_command(
OUTPUT ${HIPRTC_GEN_OBJ}
Expand Down
4 changes: 4 additions & 0 deletions src/hiprtc/cmake/HIPRTC.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ if(HIPRTC_ADD_MACROS)
FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_DEFINES}")
FILE(READ "${HIPRTC_WARP_HEADER_FILE}" HIPRTC_WARP_HEADER)
FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_WARP_HEADER}")
FILE(READ "${HIPRTC_COOP_HELPER_FILE}" HIPRTC_COOP_HELPER)
FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HELPER}")
FILE(READ "${HIPRTC_COOP_HEADER_FILE}" HIPRTC_COOP_HEADER)
FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HEADER}")
endif()

macro(generate_hiprtc_header HiprtcHeader)
Expand Down

0 comments on commit 5ff4b16

Please sign in to comment.