Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

various fixes for shfl unit tests, added __wavesize for lc #56

Merged
merged 7 commits into from
May 6, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 29 additions & 18 deletions include/hc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1898,7 +1898,15 @@ tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const __CPU__ __HC__ {
*
* @return The size of a wavefront.
*/
extern "C" unsigned int __wavesize() __HC__;
#define __HSA_WAVEFRONT_SIZE__ (64)
extern "C" unsigned int __wavesize() __HC__;


#if __hcc_backend__==HCC_BACKEND_AMDGPU
// Inline definition of __wavesize() for the AMDGPU backend (see the
// enclosing __hcc_backend__ guard): the wavefront size is the compile-time
// constant __HSA_WAVEFRONT_SIZE__.
extern "C" inline unsigned int __wavesize() __HC__ {
  const unsigned int wavefront_lanes = __HSA_WAVEFRONT_SIZE__;
  return wavefront_lanes;
}
#endif

/**
* Count number of 1 bits in the input
Expand Down Expand Up @@ -2331,8 +2339,6 @@ extern "C" inline uint64_t __ballot(int predicate) __HC__ {
// Wavefront Shuffle Functions
// ------------------------------------------------------------------------

#define __HSA_WAVEFRONT_SIZE__ (64)

// utility union type
union __u {
int i;
Expand Down Expand Up @@ -2361,6 +2367,11 @@ union __u {

extern "C" __attribute__((const)) unsigned int __hsail_get_lane_id(void) __HC__;

// Returns the lane ID within a wavefront.
// Thin wrapper over __hsail_get_lane_id(); the unsigned value it reports is
// converted to int for the caller.
inline int __lane_id(void) [[hc]] {
  const unsigned int lane = __hsail_get_lane_id();
  return static_cast<int>(lane);
}

#if __hcc_backend__==HCC_BACKEND_AMDGPU

extern "C" int amdgcn_ds_bpermute(int index, int src) [[hc]];
Expand All @@ -2369,12 +2380,12 @@ extern "C" int amdgcn_ds_bpermute(int index, int src) [[hc]];

extern "C" unsigned int __hsail_activelanepermute_b32(unsigned int src, unsigned int lid, unsigned int ival, bool useival) __HC__;
inline int __wavefront_shift_right(int var) __HC__ {
return __hsail_activelanepermute_b32(var, __hsail_get_lane_id()-1
, var, __hsail_get_lane_id()==0);
return __hsail_activelanepermute_b32(var, __lane_id()-1
, var, __lane_id()==0);
}
inline int __wavefront_shift_left(int var) __HC__ {
return __hsail_activelanepermute_b32(var, __hsail_get_lane_id()+1
, var, __hsail_get_lane_id()==63);
return __hsail_activelanepermute_b32(var, __lane_id()+1
, var, __lane_id()==63);
}
#endif

Expand All @@ -2388,7 +2399,7 @@ inline int __wavefront_shift_left(int var) __HC__ {
#if __hcc_backend__==HCC_BACKEND_AMDGPU

inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
int self = __hsail_get_lane_id();
int self = __lane_id();
int index = srcLane + (self & ~(width-1));
return amdgcn_ds_bpermute(index<<2, var);
}
Expand All @@ -2402,7 +2413,7 @@ inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__
default: {
unsigned int ulane = (unsigned int)srcLane;
unsigned int uwidth = (unsigned int)width;
unsigned int laneId = __hsail_get_lane_id();
unsigned int laneId = __lane_id();
unsigned int newSrcLane = (laneId&((unsigned int)0xFFFFFFFF-(uwidth-1))) + (ulane&(uwidth-1));
return __hsail_activelanepermute_b32(var,newSrcLane, 0, 0);
}
Expand Down Expand Up @@ -2445,7 +2456,7 @@ inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __
#if __hcc_backend__==HCC_BACKEND_AMDGPU

inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
int self = __hsail_get_lane_id();
int self = __lane_id();
int index = self - delta;
index = (index < (self & ~(width-1)))?self:index;
return amdgcn_ds_bpermute(index<<2, var);
Expand All @@ -2459,7 +2470,7 @@ inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WA
return __wavefront_shift_right(var);
}
else {
int laneId = __hsail_get_lane_id();
int laneId = __lane_id();
int newSrcLane = laneId - delta;
return __hsail_activelanepermute_b32(var, newSrcLane, var, newSrcLane < (laneId&(~(width-1))));
}
Expand Down Expand Up @@ -2501,7 +2512,7 @@ inline float __shfl_up(float var, const unsigned int delta, const int width=__HS
#if __hcc_backend__==HCC_BACKEND_AMDGPU

inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
int self = __hsail_get_lane_id();
int self = __lane_id();
int index = self + delta;
index = ((self&(width-1))+delta) >= width?self:index;
return amdgcn_ds_bpermute(index<<2, var);
Expand All @@ -2515,7 +2526,7 @@ inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_
return __wavefront_shift_left(var);
}
else {
unsigned int laneId = __hsail_get_lane_id();
unsigned int laneId = __lane_id();
unsigned int newSrcLane = laneId + delta;
return __hsail_activelanepermute_b32(var, newSrcLane, var, newSrcLane >= ((laneId&(~(width-1))) + width ));
}
Expand Down Expand Up @@ -2556,7 +2567,7 @@ inline float __shfl_down(float var, const unsigned int delta, const int width=__


inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
int self = __hsail_get_lane_id();
int self = __lane_id();
int index = self^laneMask;
index = index >= ((self+width)&~(width-1))?self:index;
return amdgcn_ds_bpermute(index<<2, var);
Expand All @@ -2567,10 +2578,10 @@ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) _


inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
unsigned int laneId = __hsail_get_lane_id();
unsigned int target = laneId ^ laneMask;
unsigned int w = width;
return __hsail_activelanepermute_b32(var, target, var, target>=((laneId+w)&~(w-1)));
int self = __lane_id();
int index = self^laneMask;
index = index >= ((self+width)&~(width-1))?self:index;
return __hsail_activelanepermute_b32(var, index, 0, 0);
}

#endif
Expand Down
6 changes: 3 additions & 3 deletions include/hc_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class auto_voidp {

// Valid values for __hcc_backend__ to indicate the
// compiler backend
#define HCC_BACKEND_AMDGPU 1
#define HCC_BACKEND_HSAIL 2
#define HCC_BACKEND_CL 3
#define HCC_BACKEND_AMDGPU (1)
#define HCC_BACKEND_HSAIL (2)
#define HCC_BACKEND_CL (3)

2 changes: 1 addition & 1 deletion tests/Unit/HSAIL/activelaneid.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// XFAIL: Linux
// XFAIL: *
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would make this test fail even on the HSAIL backend. It's better to check the __hcc_backend__ macro instead. I'll fix it. The same goes for activelanepermute right below.

// RUN: %hc %s -o %t.out && %t.out

#include <hc.hpp>
Expand Down
2 changes: 1 addition & 1 deletion tests/Unit/HSAIL/activelanepermute.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// XFAIL: Linux
// XFAIL: *
// RUN: %hc %s -o %t.out && %t.out

#include <hc.hpp>
Expand Down
4 changes: 2 additions & 2 deletions tests/Unit/HSAIL/shfl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ bool test__shfl(int grid_size, T arg) {
// broadcast of a single value across a wavefront
parallel_for_each(ex, [&, arg](index<1>& idx) [[hc]] {
T value = T();
if (__activelaneid_u32() == 0)
if (__lane_id() == 0)
value = arg;
value = __shfl(value, 0);
table(idx) = value;
Expand Down Expand Up @@ -58,7 +58,7 @@ bool test__shfl2(int grid_size, int sub_wavefront_width, T arg) {
// broadcast of a single value across a sub-wavefront
parallel_for_each(ex, [&, arg, sub_wavefront_width](index<1>& idx) [[hc]] {
T value = T();
unsigned int laneId = __activelaneid_u32();
unsigned int laneId = __lane_id();
// each subsection of a wavefront would have a different test value
if (laneId % sub_wavefront_width == 0)
value = (arg + laneId / sub_wavefront_width);
Expand Down
4 changes: 2 additions & 2 deletions tests/Unit/HSAIL/shfl_down.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ bool test__shfl_down(int grid_size, int offset, T init_value) {

// shift values down in a wavefront
parallel_for_each(ex, [&, offset, init_value](index<1>& idx) [[hc]] {
T value = init_value + __activelaneid_u32();
T value = init_value + __lane_id();
value = __shfl_down(value, offset);
table(idx) = value;
}).wait();
Expand Down Expand Up @@ -58,7 +58,7 @@ bool test__shfl_down2(int grid_size, int sub_wavefront_width, int offset, T init

// shift values down in a wavefront, divided into subsections
parallel_for_each(ex, [&, offset, sub_wavefront_width, init_value](index<1>& idx) [[hc]] {
T value = init_value + (__activelaneid_u32() % sub_wavefront_width);
T value = init_value + (__lane_id() % sub_wavefront_width);
value = __shfl_down(value, offset, sub_wavefront_width);
table(idx) = value;
}).wait();
Expand Down
6 changes: 3 additions & 3 deletions tests/Unit/HSAIL/shfl_up.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ bool test__shfl_up(int grid_size, int offset, T init_value) {

// shift values up in a wavefront
parallel_for_each(ex, [&, offset, init_value](index<1>& idx) [[hc]] {
T value = init_value + __activelaneid_u32();
T value = init_value + __lane_id();
value = __shfl_up(value, offset);
table(idx) = value;
}).wait();
Expand Down Expand Up @@ -56,7 +56,7 @@ bool test__shfl_up2(int grid_size, int sub_wavefront_width, int offset, T init_v

// shift values up in a wavefront, divided into subsections
parallel_for_each(ex, [&, offset, sub_wavefront_width, init_value](index<1>& idx) [[hc]] {
T value = init_value + (__activelaneid_u32() % sub_wavefront_width);
T value = init_value + (__lane_id() % sub_wavefront_width);
value = __shfl_up(value, offset, sub_wavefront_width);
table(idx) = value;
}).wait();
Expand Down Expand Up @@ -84,7 +84,7 @@ bool test_scan(int grid_size, int sub_wavefront_width) {
array<int, 1> table(grid_size);

parallel_for_each(ex, [&, sub_wavefront_width](index<1>& idx) [[hc]] {
int laneId = __activelaneid_u32();
int laneId = __lane_id();
int logicalLaneId = laneId % sub_wavefront_width;
int value = (WAVEFRONT_SIZE - 1) - laneId;

Expand Down
2 changes: 1 addition & 1 deletion tests/Unit/HSAIL/shfl_xor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ bool test_reduce(int grid_size) {
array<int, 1> table(grid_size);

parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
int laneId = __activelaneid_u32();
int laneId = __lane_id();
int value = (WAVEFRONT_SIZE - 1) - laneId;

// use xor mode to perform butterfly reduction
Expand Down