diff --git a/include/hc.hpp b/include/hc.hpp
index dd3e6be72eb..b98c2abded4 100644
--- a/include/hc.hpp
+++ b/include/hc.hpp
@@ -1898,7 +1898,15 @@ tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const __CPU__ __HC__ {
  *
  * @return The size of a wavefront.
  */
-extern "C" unsigned int __wavesize() __HC__;
+extern "C" unsigned int __wavesize() __HC__;
+
+// Number of lanes in a wavefront; also the default `width` of the __shfl* functions.
+#define __HSA_WAVEFRONT_SIZE__ (64)
+#if __hcc_backend__==HCC_BACKEND_AMDGPU  // AMDGPU: __wavesize() is defined inline as the constant above
+extern "C" inline unsigned int __wavesize() __HC__ {
+  return __HSA_WAVEFRONT_SIZE__;
+}
+#endif
 
 /**
  * Count number of 1 bits in the input
@@ -2331,8 +2339,6 @@ extern "C" inline uint64_t __ballot(int predicate) __HC__ {
 // Wavefront Shuffle Functions
 // ------------------------------------------------------------------------
 
-#define __HSA_WAVEFRONT_SIZE__ (64)
-
 // utility union type
 union __u {
     int i;
@@ -2361,6 +2367,11 @@ union __u {
 
 extern "C" __attribute__((const)) unsigned int __hsail_get_lane_id(void) __HC__;
 
+// Returns the lane ID within a wavefront: __hsail_get_lane_id() converted from unsigned int to int.
+inline int __lane_id(void) [[hc]] {
+  return __hsail_get_lane_id();
+}
+
 #if __hcc_backend__==HCC_BACKEND_AMDGPU
 
 extern "C" int amdgcn_ds_bpermute(int index, int src) [[hc]];
@@ -2369,12 +2380,12 @@ extern "C" int amdgcn_ds_bpermute(int index, int src) [[hc]];
 
 extern "C" unsigned int __hsail_activelanepermute_b32(unsigned int src, unsigned int lid, unsigned int ival, bool useival) __HC__;
 inline int __wavefront_shift_right(int var) __HC__ {
-    return  __hsail_activelanepermute_b32(var, __hsail_get_lane_id()-1
-                                        , var, __hsail_get_lane_id()==0);
+    return  __hsail_activelanepermute_b32(var, __lane_id()-1  // lid = the lane just below this one
+                                        , var, __lane_id()==0);  // ival = var; useival is set only on lane 0
 }
 inline int __wavefront_shift_left(int var) __HC__ {
-    return  __hsail_activelanepermute_b32(var, __hsail_get_lane_id()+1
-                                        , var, __hsail_get_lane_id()==63);
+    return  __hsail_activelanepermute_b32(var, __lane_id()+1  // lid = the lane just above this one
+                                        , var, __lane_id()==__HSA_WAVEFRONT_SIZE__-1);  // ival = var; useival is set only on the last lane
 }
 #endif
 
@@ -2388,7 +2399,7 @@ inline int __wavefront_shift_left(int var) __HC__ {
 #if __hcc_backend__==HCC_BACKEND_AMDGPU
 
 inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
-  int self = __hsail_get_lane_id();
+  int self = __lane_id();  // this lane's ID within the wavefront
   int index = srcLane + (self & ~(width-1));
   return amdgcn_ds_bpermute(index<<2, var);
 }
@@ -2402,7 +2413,7 @@ inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__
       default: {
         unsigned int ulane = (unsigned int)srcLane;
         unsigned int uwidth = (unsigned int)width;
-        unsigned int laneId = __hsail_get_lane_id();
+        unsigned int laneId = __lane_id();
         unsigned int newSrcLane = (laneId&((unsigned int)0xFFFFFFFF-(uwidth-1))) + (ulane&(uwidth-1));
         return __hsail_activelanepermute_b32(var,newSrcLane, 0, 0);
       }
@@ -2445,7 +2456,7 @@ inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __
 #if __hcc_backend__==HCC_BACKEND_AMDGPU
 
 inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
-  int self = __hsail_get_lane_id();
+  int self = __lane_id();
   int index = self - delta;
   index = (index < (self & ~(width-1)))?self:index;
   return amdgcn_ds_bpermute(index<<2, var);
@@ -2459,7 +2470,7 @@ inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WA
         return __wavefront_shift_right(var);
     }
     else {
-        int laneId = __hsail_get_lane_id();
+        int laneId = __lane_id();
         int newSrcLane = laneId - delta;
         return __hsail_activelanepermute_b32(var, newSrcLane, var, newSrcLane < (laneId&(~(width-1))));
     }
@@ -2501,7 +2512,7 @@ inline float __shfl_up(float var, const unsigned int delta, const int width=__HS
 #if __hcc_backend__==HCC_BACKEND_AMDGPU
 
 inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
-  int self = __hsail_get_lane_id();
+  int self = __lane_id();
   int index = self + delta;
   index = ((self&(width-1))+delta) >= width?self:index;
   return amdgcn_ds_bpermute(index<<2, var);
@@ -2515,7 +2526,7 @@ inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_
         return __wavefront_shift_left(var);
     }
     else {
-        unsigned int laneId = __hsail_get_lane_id();
+        unsigned int laneId = __lane_id();
         unsigned int newSrcLane = laneId + delta;
         return __hsail_activelanepermute_b32(var, newSrcLane, var, newSrcLane >= ((laneId&(~(width-1))) + width ));
     }
@@ -2556,7 +2567,7 @@ inline float __shfl_down(float var, const unsigned int delta, const int width=__
 
 
 inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
-  int self = __hsail_get_lane_id();
+  int self = __lane_id();
   int index = self^laneMask;
   index = index >= ((self+width)&~(width-1))?self:index;
   return amdgcn_ds_bpermute(index<<2, var);
@@ -2567,10 +2578,10 @@ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) _
 
 
 inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
-    unsigned int laneId = __hsail_get_lane_id();
-    unsigned int target = laneId ^ laneMask;
-    unsigned int w = width;
-    return __hsail_activelanepermute_b32(var, target, var, target>=((laneId+w)&~(w-1)));
+    unsigned int self = __lane_id();  // unsigned, as before this change, so a negative laneMask gives a large index that fails the range check
+    unsigned int index = self^laneMask;
+    index = index >= ((self+width)&~(width-1))?self:index;  // partner at or beyond the end of this width-sized segment: read own lane instead
+    return __hsail_activelanepermute_b32(var, index, 0, 0);
 }
 
 #endif
diff --git a/include/hc_defines.h b/include/hc_defines.h
index 31afcd4472a..cbccd10241a 100644
--- a/include/hc_defines.h
+++ b/include/hc_defines.h
@@ -79,7 +79,7 @@ class auto_voidp {
 
 // Valid values for__hcc_backend__ to indicate the
 // compiler backend
-#define HCC_BACKEND_AMDGPU 1
-#define HCC_BACKEND_HSAIL  2
-#define HCC_BACKEND_CL     3
+#define HCC_BACKEND_AMDGPU (1)
+#define HCC_BACKEND_HSAIL  (2)
+#define HCC_BACKEND_CL     (3)
 
diff --git a/tests/Unit/HSAIL/activelaneid.cpp b/tests/Unit/HSAIL/activelaneid.cpp
index 6334be3ac77..9bb5eebbfd9 100644
--- a/tests/Unit/HSAIL/activelaneid.cpp
+++ b/tests/Unit/HSAIL/activelaneid.cpp
@@ -1,4 +1,4 @@
-// XFAIL: Linux
+// XFAIL: *
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <hc.hpp>
diff --git a/tests/Unit/HSAIL/activelanepermute.cpp b/tests/Unit/HSAIL/activelanepermute.cpp
index cc378471f8d..3f39ff53ca3 100644
--- a/tests/Unit/HSAIL/activelanepermute.cpp
+++ b/tests/Unit/HSAIL/activelanepermute.cpp
@@ -1,4 +1,4 @@
-// XFAIL: Linux
+// XFAIL: *
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <hc.hpp>
diff --git a/tests/Unit/HSAIL/shfl.cpp b/tests/Unit/HSAIL/shfl.cpp
index 708fb9f0f01..ac8e7cb66f1 100644
--- a/tests/Unit/HSAIL/shfl.cpp
+++ b/tests/Unit/HSAIL/shfl.cpp
@@ -26,7 +26,7 @@ bool test__shfl(int grid_size, T arg) {
   // broadcast of a single value across a wavefront
   parallel_for_each(ex, [&, arg](index<1>& idx) [[hc]] {
     T value = T();
-    if (__activelaneid_u32() == 0)
+    if (__lane_id() == 0)
       value = arg;
     value = __shfl(value, 0);
     table(idx) = value;
@@ -58,7 +58,7 @@ bool test__shfl2(int grid_size, int sub_wavefront_width, T arg) {
   // broadcast of a single value across a sub-wavefront
   parallel_for_each(ex, [&, arg, sub_wavefront_width](index<1>& idx) [[hc]] {
     T value = T();
-    unsigned int laneId = __activelaneid_u32();
+    unsigned int laneId = __lane_id();
     // each subsection of a wavefront would have a different test value
     if (laneId % sub_wavefront_width == 0)
       value = (arg + laneId / sub_wavefront_width);
diff --git a/tests/Unit/HSAIL/shfl_down.cpp b/tests/Unit/HSAIL/shfl_down.cpp
index 54306054886..2b7f4dace0e 100644
--- a/tests/Unit/HSAIL/shfl_down.cpp
+++ b/tests/Unit/HSAIL/shfl_down.cpp
@@ -25,7 +25,7 @@ bool test__shfl_down(int grid_size, int offset, T init_value) {
 
   // shift values down in a wavefront
   parallel_for_each(ex, [&, offset, init_value](index<1>& idx) [[hc]] {
-    T value = init_value + __activelaneid_u32();
+    T value = init_value + __lane_id();
     value = __shfl_down(value, offset);
     table(idx) = value;
   }).wait();
@@ -58,7 +58,7 @@ bool test__shfl_down2(int grid_size, int sub_wavefront_width, int offset, T init
 
   // shift values down in a wavefront, divided into subsections
   parallel_for_each(ex, [&, offset, sub_wavefront_width, init_value](index<1>& idx) [[hc]] {
-    T value = init_value + (__activelaneid_u32() % sub_wavefront_width);
+    T value = init_value + (__lane_id() % sub_wavefront_width);
     value = __shfl_down(value, offset, sub_wavefront_width);
     table(idx) = value;
   }).wait();
diff --git a/tests/Unit/HSAIL/shfl_up.cpp b/tests/Unit/HSAIL/shfl_up.cpp
index 551b80495dc..f7a3e977c18 100644
--- a/tests/Unit/HSAIL/shfl_up.cpp
+++ b/tests/Unit/HSAIL/shfl_up.cpp
@@ -25,7 +25,7 @@ bool test__shfl_up(int grid_size, int offset, T init_value) {
 
   // shift values up in a wavefront
   parallel_for_each(ex, [&, offset, init_value](index<1>& idx) [[hc]] {
-    T value = init_value + __activelaneid_u32();
+    T value = init_value + __lane_id();
     value = __shfl_up(value, offset);
     table(idx) = value;
   }).wait();
@@ -56,7 +56,7 @@ bool test__shfl_up2(int grid_size, int sub_wavefront_width, int offset, T init_v
 
   // shift values up in a wavefront, divided into subsections
   parallel_for_each(ex, [&, offset, sub_wavefront_width, init_value](index<1>& idx) [[hc]] {
-    T value = init_value + (__activelaneid_u32() % sub_wavefront_width);
+    T value = init_value + (__lane_id() % sub_wavefront_width);
     value = __shfl_up(value, offset, sub_wavefront_width);
     table(idx) = value;
   }).wait();
@@ -84,7 +84,7 @@ bool test_scan(int grid_size, int sub_wavefront_width) {
   array<int, 1> table(grid_size);
 
   parallel_for_each(ex, [&, sub_wavefront_width](index<1>& idx) [[hc]] {
-    int laneId = __activelaneid_u32();
+    int laneId = __lane_id();
     int logicalLaneId = laneId % sub_wavefront_width;
     int value = (WAVEFRONT_SIZE - 1) - laneId;
 
diff --git a/tests/Unit/HSAIL/shfl_xor.cpp b/tests/Unit/HSAIL/shfl_xor.cpp
index 2c90d2a6c43..f51ab6f7f67 100644
--- a/tests/Unit/HSAIL/shfl_xor.cpp
+++ b/tests/Unit/HSAIL/shfl_xor.cpp
@@ -22,7 +22,7 @@ bool test_reduce(int grid_size) {
   array<int, 1> table(grid_size);
 
   parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
-    int laneId = __activelaneid_u32();
+    int laneId = __lane_id();
     int value = (WAVEFRONT_SIZE - 1) - laneId;
 
     // use xor mode to perform butterfly reduction