diff --git a/include/hc.hpp b/include/hc.hpp index dd3e6be72eb..b98c2abded4 100644 --- a/include/hc.hpp +++ b/include/hc.hpp @@ -1898,7 +1898,15 @@ tiled_extent<3> extent::tile(int t0, int t1, int t2) const __CPU__ __HC__ { * * @return The size of a wavefront. */ -extern "C" unsigned int __wavesize() __HC__; +#define __HSA_WAVEFRONT_SIZE__ (64) +extern "C" unsigned int __wavesize() __HC__; + + +#if __hcc_backend__==HCC_BACKEND_AMDGPU +extern "C" inline unsigned int __wavesize() __HC__ { + return __HSA_WAVEFRONT_SIZE__; +} +#endif /** * Count number of 1 bits in the input @@ -2331,8 +2339,6 @@ extern "C" inline uint64_t __ballot(int predicate) __HC__ { // Wavefront Shuffle Functions // ------------------------------------------------------------------------ -#define __HSA_WAVEFRONT_SIZE__ (64) - // utility union type union __u { int i; @@ -2361,6 +2367,11 @@ union __u { extern "C" __attribute__((const)) unsigned int __hsail_get_lane_id(void) __HC__; +// Returns the caller's lane ID within its wavefront: forwards to __hsail_get_lane_id() and returns the result as an int. +inline int __lane_id(void) [[hc]] { + return __hsail_get_lane_id(); +} + #if __hcc_backend__==HCC_BACKEND_AMDGPU extern "C" int amdgcn_ds_bpermute(int index, int src) [[hc]]; @@ -2369,12 +2380,12 @@ extern "C" int amdgcn_ds_bpermute(int index, int src) [[hc]]; extern "C" unsigned int __hsail_activelanepermute_b32(unsigned int src, unsigned int lid, unsigned int ival, bool useival) __HC__; inline int __wavefront_shift_right(int var) __HC__ { - return __hsail_activelanepermute_b32(var, __hsail_get_lane_id()-1 - , var, __hsail_get_lane_id()==0); + return __hsail_activelanepermute_b32(var, __lane_id()-1 + , var, __lane_id()==0); } inline int __wavefront_shift_left(int var) __HC__ { - return __hsail_activelanepermute_b32(var, __hsail_get_lane_id()+1 - , var, __hsail_get_lane_id()==63); + return __hsail_activelanepermute_b32(var, __lane_id()+1 + , var, __lane_id()==(__HSA_WAVEFRONT_SIZE__-1)); } #endif @@ -2388,7 +2399,7 @@ inline int __wavefront_shift_left(int var) __HC__ { #if 
__hcc_backend__==HCC_BACKEND_AMDGPU inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ { - int self = __hsail_get_lane_id(); + int self = __lane_id(); int index = srcLane + (self & ~(width-1)); return amdgcn_ds_bpermute(index<<2, var); } @@ -2402,7 +2413,7 @@ inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ default: { unsigned int ulane = (unsigned int)srcLane; unsigned int uwidth = (unsigned int)width; - unsigned int laneId = __hsail_get_lane_id(); + unsigned int laneId = __lane_id(); unsigned int newSrcLane = (laneId&((unsigned int)0xFFFFFFFF-(uwidth-1))) + (ulane&(uwidth-1)); return __hsail_activelanepermute_b32(var,newSrcLane, 0, 0); } @@ -2445,7 +2456,7 @@ inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __ #if __hcc_backend__==HCC_BACKEND_AMDGPU inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ { - int self = __hsail_get_lane_id(); + int self = __lane_id(); int index = self - delta; index = (index < (self & ~(width-1)))?self:index; return amdgcn_ds_bpermute(index<<2, var); @@ -2459,7 +2470,7 @@ inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WA return __wavefront_shift_right(var); } else { - int laneId = __hsail_get_lane_id(); + int laneId = __lane_id(); int newSrcLane = laneId - delta; return __hsail_activelanepermute_b32(var, newSrcLane, var, newSrcLane < (laneId&(~(width-1)))); } @@ -2501,7 +2512,7 @@ inline float __shfl_up(float var, const unsigned int delta, const int width=__HS #if __hcc_backend__==HCC_BACKEND_AMDGPU inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ { - int self = __hsail_get_lane_id(); + int self = __lane_id(); int index = self + delta; index = ((self&(width-1))+delta) >= width?self:index; return amdgcn_ds_bpermute(index<<2, var); @@ -2515,7 +2526,7 @@ inline int __shfl_down(int var, const unsigned int delta, 
const int width=__HSA_ return __wavefront_shift_left(var); } else { - unsigned int laneId = __hsail_get_lane_id(); + unsigned int laneId = __lane_id(); unsigned int newSrcLane = laneId + delta; return __hsail_activelanepermute_b32(var, newSrcLane, var, newSrcLane >= ((laneId&(~(width-1))) + width )); } @@ -2556,7 +2567,7 @@ inline float __shfl_down(float var, const unsigned int delta, const int width=__ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ { - int self = __hsail_get_lane_id(); + int self = __lane_id(); int index = self^laneMask; index = index >= ((self+width)&~(width-1))?self:index; return amdgcn_ds_bpermute(index<<2, var); @@ -2567,10 +2578,10 @@ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) _ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ { - unsigned int laneId = __hsail_get_lane_id(); - unsigned int target = laneId ^ laneMask; - unsigned int w = width; - return __hsail_activelanepermute_b32(var, target, var, target>=((laneId+w)&~(w-1))); + int self = __lane_id(); + int index = self^laneMask; + index = index >= ((self+width)&~(width-1))?self:index; + return __hsail_activelanepermute_b32(var, index, 0, 0); } #endif diff --git a/include/hc_defines.h b/include/hc_defines.h index 31afcd4472a..cbccd10241a 100644 --- a/include/hc_defines.h +++ b/include/hc_defines.h @@ -79,7 +79,7 @@ class auto_voidp { // Valid values for__hcc_backend__ to indicate the // compiler backend -#define HCC_BACKEND_AMDGPU 1 -#define HCC_BACKEND_HSAIL 2 -#define HCC_BACKEND_CL 3 +#define HCC_BACKEND_AMDGPU (1) +#define HCC_BACKEND_HSAIL (2) +#define HCC_BACKEND_CL (3) diff --git a/tests/Unit/HSAIL/activelaneid.cpp b/tests/Unit/HSAIL/activelaneid.cpp index 6334be3ac77..9bb5eebbfd9 100644 --- a/tests/Unit/HSAIL/activelaneid.cpp +++ b/tests/Unit/HSAIL/activelaneid.cpp @@ -1,4 +1,4 @@ -// XFAIL: Linux +// XFAIL: * // RUN: %hc %s -o %t.out && %t.out #include diff --git 
a/tests/Unit/HSAIL/activelanepermute.cpp b/tests/Unit/HSAIL/activelanepermute.cpp index cc378471f8d..3f39ff53ca3 100644 --- a/tests/Unit/HSAIL/activelanepermute.cpp +++ b/tests/Unit/HSAIL/activelanepermute.cpp @@ -1,4 +1,4 @@ -// XFAIL: Linux +// XFAIL: * // RUN: %hc %s -o %t.out && %t.out #include diff --git a/tests/Unit/HSAIL/shfl.cpp b/tests/Unit/HSAIL/shfl.cpp index 708fb9f0f01..ac8e7cb66f1 100644 --- a/tests/Unit/HSAIL/shfl.cpp +++ b/tests/Unit/HSAIL/shfl.cpp @@ -26,7 +26,7 @@ bool test__shfl(int grid_size, T arg) { // broadcast of a single value across a wavefront parallel_for_each(ex, [&, arg](index<1>& idx) [[hc]] { T value = T(); - if (__activelaneid_u32() == 0) + if (__lane_id() == 0) value = arg; value = __shfl(value, 0); table(idx) = value; @@ -58,7 +58,7 @@ bool test__shfl2(int grid_size, int sub_wavefront_width, T arg) { // broadcast of a single value across a sub-wavefront parallel_for_each(ex, [&, arg, sub_wavefront_width](index<1>& idx) [[hc]] { T value = T(); - unsigned int laneId = __activelaneid_u32(); + unsigned int laneId = __lane_id(); // each subsection of a wavefront would have a different test value if (laneId % sub_wavefront_width == 0) value = (arg + laneId / sub_wavefront_width); diff --git a/tests/Unit/HSAIL/shfl_down.cpp b/tests/Unit/HSAIL/shfl_down.cpp index 54306054886..2b7f4dace0e 100644 --- a/tests/Unit/HSAIL/shfl_down.cpp +++ b/tests/Unit/HSAIL/shfl_down.cpp @@ -25,7 +25,7 @@ bool test__shfl_down(int grid_size, int offset, T init_value) { // shift values down in a wavefront parallel_for_each(ex, [&, offset, init_value](index<1>& idx) [[hc]] { - T value = init_value + __activelaneid_u32(); + T value = init_value + __lane_id(); value = __shfl_down(value, offset); table(idx) = value; }).wait(); @@ -58,7 +58,7 @@ bool test__shfl_down2(int grid_size, int sub_wavefront_width, int offset, T init // shift values down in a wavefront, divided into subsections parallel_for_each(ex, [&, offset, sub_wavefront_width, init_value](index<1>& idx) 
[[hc]] { - T value = init_value + (__activelaneid_u32() % sub_wavefront_width); + T value = init_value + (__lane_id() % sub_wavefront_width); value = __shfl_down(value, offset, sub_wavefront_width); table(idx) = value; }).wait(); diff --git a/tests/Unit/HSAIL/shfl_up.cpp b/tests/Unit/HSAIL/shfl_up.cpp index 551b80495dc..f7a3e977c18 100644 --- a/tests/Unit/HSAIL/shfl_up.cpp +++ b/tests/Unit/HSAIL/shfl_up.cpp @@ -25,7 +25,7 @@ bool test__shfl_up(int grid_size, int offset, T init_value) { // shift values up in a wavefront parallel_for_each(ex, [&, offset, init_value](index<1>& idx) [[hc]] { - T value = init_value + __activelaneid_u32(); + T value = init_value + __lane_id(); value = __shfl_up(value, offset); table(idx) = value; }).wait(); @@ -56,7 +56,7 @@ bool test__shfl_up2(int grid_size, int sub_wavefront_width, int offset, T init_v // shift values up in a wavefront, divided into subsections parallel_for_each(ex, [&, offset, sub_wavefront_width, init_value](index<1>& idx) [[hc]] { - T value = init_value + (__activelaneid_u32() % sub_wavefront_width); + T value = init_value + (__lane_id() % sub_wavefront_width); value = __shfl_up(value, offset, sub_wavefront_width); table(idx) = value; }).wait(); @@ -84,7 +84,7 @@ bool test_scan(int grid_size, int sub_wavefront_width) { array table(grid_size); parallel_for_each(ex, [&, sub_wavefront_width](index<1>& idx) [[hc]] { - int laneId = __activelaneid_u32(); + int laneId = __lane_id(); int logicalLaneId = laneId % sub_wavefront_width; int value = (WAVEFRONT_SIZE - 1) - laneId; diff --git a/tests/Unit/HSAIL/shfl_xor.cpp b/tests/Unit/HSAIL/shfl_xor.cpp index 2c90d2a6c43..f51ab6f7f67 100644 --- a/tests/Unit/HSAIL/shfl_xor.cpp +++ b/tests/Unit/HSAIL/shfl_xor.cpp @@ -22,7 +22,7 @@ bool test_reduce(int grid_size) { array table(grid_size); parallel_for_each(ex, [&](index<1>& idx) [[hc]] { - int laneId = __activelaneid_u32(); + int laneId = __lane_id(); int value = (WAVEFRONT_SIZE - 1) - laneId; // use xor mode to perform butterfly 
reduction