From 6e3f24402a4dbf9a696711c3135f611d01796a8f Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 11 Nov 2020 07:52:35 -0800 Subject: [PATCH] [SYCL][CUDA] Support GroupBroadcast with 32-bit id (#2759) Use of the broadcast algorithm with the sub_group class clamps the sub-group local id into a uint32_t. libspirv was missing an entry point for this case. Signed-off-by: John Pennycook --- libclc/ptx-nvidiacl/libspirv/group/collectives.cl | 8 ++++++++ sycl/test/on-device/sub_group/broadcast.cpp | 5 +---- sycl/test/on-device/sub_group/broadcast_fp16.cpp | 6 ++---- sycl/test/on-device/sub_group/broadcast_fp64.cpp | 5 +---- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl index 00ed7b600155..fba9ad72d8a5 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl @@ -385,6 +385,10 @@ long __clc__3d_to_linear_local_id(ulong3 id) { uint scope, TYPE x, ulong3 local_id) { \ ulong linear_local_id = __clc__3d_to_linear_local_id(local_id); \ return __spirv_GroupBroadcast(scope, x, linear_local_id); \ + } \ + _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __spirv_GroupBroadcast( \ + uint scope, TYPE x, uint local_id) { \ + return __spirv_GroupBroadcast(scope, x, (ulong)local_id); \ } __CLC_GROUP_BROADCAST(char); __CLC_GROUP_BROADCAST(uchar); @@ -411,6 +415,10 @@ _CLC_DECL _CLC_CONVERGENT half _Z17__spirv_GroupBroadcastjDF16_Dv3_m(uint scope, half x, ulong3 local_id) { return __spirv_GroupBroadcast(scope, x, local_id); } +_CLC_DECL _CLC_CONVERGENT half +_Z22__spirv_GroupBroadcastjDF16_j(uint scope, half x, uint local_id) { + return __spirv_GroupBroadcast(scope, x, (ulong)local_id); +} #undef __CLC_GROUP_BROADCAST diff --git a/sycl/test/on-device/sub_group/broadcast.cpp b/sycl/test/on-device/sub_group/broadcast.cpp index 1cce98a5f685..f49d35c531b6 100644 --- a/sycl/test/on-device/sub_group/broadcast.cpp +++ b/sycl/test/on-device/sub_group/broadcast.cpp @@ -1,6 +1,3 @@ -// XFAIL: cuda -// CUDA compilation and runtime do not yet support sub-groups. - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out @@ -19,7 +16,7 @@ int main() { queue Queue; - if (!core_sg_supported(Queue.get_device())) { + if (Queue.get_device().is_host()) { std::cout << "Skipping test\n"; return 0; } diff --git a/sycl/test/on-device/sub_group/broadcast_fp16.cpp b/sycl/test/on-device/sub_group/broadcast_fp16.cpp index cf2832b96b23..88c900300465 100644 --- a/sycl/test/on-device/sub_group/broadcast_fp16.cpp +++ b/sycl/test/on-device/sub_group/broadcast_fp16.cpp @@ -1,6 +1,3 @@ -// XFAIL: cuda -// CUDA compilation and runtime do not yet support sub-groups. - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -16,10 +13,11 @@ int main() { queue Queue; - if (!core_sg_supported(Queue.get_device())) { + if (Queue.get_device().is_host()) { std::cout << "Skipping test\n"; return 0; } check(Queue); + std::cout << "Test passed." << std::endl; return 0; } diff --git a/sycl/test/on-device/sub_group/broadcast_fp64.cpp b/sycl/test/on-device/sub_group/broadcast_fp64.cpp index 0c4f1f5974e3..e4b3eb09cc7b 100644 --- a/sycl/test/on-device/sub_group/broadcast_fp64.cpp +++ b/sycl/test/on-device/sub_group/broadcast_fp64.cpp @@ -1,6 +1,3 @@ -// XFAIL: cuda -// CUDA compilation and runtime do not yet support sub-groups. - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out @@ -19,7 +16,7 @@ int main() { queue Queue; - if (!core_sg_supported(Queue.get_device())) { + if (Queue.get_device().is_host()) { std::cout << "Skipping test\n"; return 0; }