From 6e3f24402a4dbf9a696711c3135f611d01796a8f Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Wed, 11 Nov 2020 07:52:35 -0800
Subject: [PATCH] [SYCL][CUDA] Support GroupBroadcast with 32-bit id (#2759)

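Using the broadcast algorithm with the sub_group class clamps the
sub-group local id to a uint32_t.  libspirv was missing a
__spirv_GroupBroadcast entry point that accepts a 32-bit id, so add
one (including the extra mangled entry point required for half).

With the entry point in place, drop the "XFAIL: cuda" markers from the
sub-group broadcast tests and skip those tests only on the host device.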

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 libclc/ptx-nvidiacl/libspirv/group/collectives.cl | 8 ++++++++
 sycl/test/on-device/sub_group/broadcast.cpp       | 5 +----
 sycl/test/on-device/sub_group/broadcast_fp16.cpp  | 6 ++----
 sycl/test/on-device/sub_group/broadcast_fp64.cpp  | 5 +----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl
index 00ed7b600155..fba9ad72d8a5 100644
--- a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl
+++ b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl
@@ -385,6 +385,10 @@ long __clc__3d_to_linear_local_id(ulong3 id) {
       uint scope, TYPE x, ulong3 local_id) {                                   \
     ulong linear_local_id = __clc__3d_to_linear_local_id(local_id);            \
     return __spirv_GroupBroadcast(scope, x, linear_local_id);                  \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __spirv_GroupBroadcast(          \
+      uint scope, TYPE x, uint local_id) {                                     \
+    return __spirv_GroupBroadcast(scope, x, (ulong)local_id);                  \
   }
 __CLC_GROUP_BROADCAST(char);
 __CLC_GROUP_BROADCAST(uchar);
@@ -411,6 +415,10 @@ _CLC_DECL _CLC_CONVERGENT half
 _Z17__spirv_GroupBroadcastjDF16_Dv3_m(uint scope, half x, ulong3 local_id) {
   return __spirv_GroupBroadcast(scope, x, local_id);
 }
+_CLC_DECL _CLC_CONVERGENT half
+_Z22__spirv_GroupBroadcastjDF16_j(uint scope, half x, uint local_id) {
+  return __spirv_GroupBroadcast(scope, x, (ulong)local_id);
+}
 
 #undef __CLC_GROUP_BROADCAST
 
diff --git a/sycl/test/on-device/sub_group/broadcast.cpp b/sycl/test/on-device/sub_group/broadcast.cpp
index 1cce98a5f685..f49d35c531b6 100644
--- a/sycl/test/on-device/sub_group/broadcast.cpp
+++ b/sycl/test/on-device/sub_group/broadcast.cpp
@@ -1,6 +1,3 @@
-// XFAIL: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,7 +16,7 @@
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/sycl/test/on-device/sub_group/broadcast_fp16.cpp b/sycl/test/on-device/sub_group/broadcast_fp16.cpp
index cf2832b96b23..88c900300465 100644
--- a/sycl/test/on-device/sub_group/broadcast_fp16.cpp
+++ b/sycl/test/on-device/sub_group/broadcast_fp16.cpp
@@ -1,6 +1,3 @@
-// XFAIL: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
@@ -16,10 +13,11 @@
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
   check<cl::sycl::half>(Queue);
+  std::cout << "Test passed." << std::endl;
   return 0;
 }
diff --git a/sycl/test/on-device/sub_group/broadcast_fp64.cpp b/sycl/test/on-device/sub_group/broadcast_fp64.cpp
index 0c4f1f5974e3..e4b3eb09cc7b 100644
--- a/sycl/test/on-device/sub_group/broadcast_fp64.cpp
+++ b/sycl/test/on-device/sub_group/broadcast_fp64.cpp
@@ -1,6 +1,3 @@
-// XFAIL: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,7 +16,7 @@
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }