libclc: sub-group barrier support for the NVPTX libspirv build.

Review fixes folded into this patch:
  * libclc/CMakeLists.txt: clear `flags` at the top of each device
    iteration so options set for a previous device/target are not reused
    (the removed `set( mcpu )` used to do this).
  * libclc/cmake/modules/AddLibclc.cmake: store the stripped options in
    SYCLDEVICE_OPT (the name the compile command expands), pass COMPILE_OPT
    as one quoted list, and split the former SHELL: groups into individual
    arguments.

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 170a7d8eb3170..36d8718faf231 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -261,10 +261,15 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
   foreach( d ${${t}_devices} )
     # Some targets don't have a specific GPU to target
     if( ${d} STREQUAL "none" OR ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
-      set( mcpu )
+      # Start from an empty option list so nothing leaks in from an earlier
+      # device or target that did set flags.
+      set( flags )
+      # FIXME: Ideally we would not be tied to a specific PTX ISA version
+      if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 )
+        set( flags "SHELL:-Xclang -target-feature" "SHELL:-Xclang +ptx64")
+      endif()
       set( arch_suffix "${t}" )
     else()
-      set( mcpu "-mcpu=${d}" )
+      set( flags "-mcpu=${d}" )
       set( arch_suffix "${d}-${t}" )
     endif()
     message( " DEVICE: ${d} ( ${${d}_aliases} )" )
@@ -276,14 +281,14 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
     if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 )
       add_libclc_sycl_binding(libspirv_files
         TRIPLE ${t}
-        COMPILE_OPT ${mcpu}
+        COMPILE_OPT ${flags}
         FILES generic/libspirv/sycldevice-binding.cpp)
     endif()
 
     add_libclc_builtin_set(libspirv-${arch_suffix}
       TRIPLE ${t}
       TARGET_ENV libspirv
-      COMPILE_OPT ${mcpu}
+      COMPILE_OPT ${flags}
       FILES ${libspirv_files}
       ALIASES ${${d}_aliases}
       GENERATE_TARGET "generate_convert_spirv.cl" "generate_convert_core.cl"
@@ -292,7 +297,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
     add_libclc_builtin_set(clc-${arch_suffix}
       TRIPLE ${t}
       TARGET_ENV clc
-      COMPILE_OPT ${mcpu}
+      COMPILE_OPT ${flags}
       FILES ${lib_files}
       LIB_DEP libspirv-${arch_suffix}
       ALIASES ${${d}_aliases}
diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index cd1f6593134ba..a9dc482da7ed7 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -214,6 +214,10 @@ function(add_libclc_sycl_binding OUT_LIST)
     file( TO_CMAKE_PATH ${LIBCLC_ROOT_DIR}/${file} SYCLDEVICE_BINDING )
     if( EXISTS ${SYCLDEVICE_BINDING} )
       set( SYCLDEVICE_BINDING_OUT ${CMAKE_CURRENT_BINARY_DIR}/sycldevice-binding-${ARG_TRIPLE}/sycldevice-binding.bc )
+      # COMPILE_OPT may carry "SHELL:" option groups; the command below takes
+      # plain arguments, so drop the prefix and split each group on spaces.
+      string( REGEX REPLACE "SHELL:" "" SYCLDEVICE_OPT "${ARG_COMPILE_OPT}" )
+      separate_arguments( SYCLDEVICE_OPT )
       add_custom_command( OUTPUT ${SYCLDEVICE_BINDING_OUT}
         COMMAND ${CMAKE_COMMAND} -E make_directory
           ${CMAKE_CURRENT_BINARY_DIR}/sycldevice-binding-${ARG_TRIPLE}
@@ -223,7 +227,7 @@ function(add_libclc_sycl_binding OUT_LIST)
           -fsycl-device-only
           -Dcl_khr_fp64
           -I${LIBCLC_ROOT_DIR}/generic/include
-          ${ARG_COMPILE_OPT}
+          ${SYCLDEVICE_OPT}
           ${SYCLDEVICE_BINDING}
           -o ${SYCLDEVICE_BINDING_OUT}
         MAIN_DEPENDENCY ${SYCLDEVICE_BINDING}
diff --git a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
index c63bbffebaa69..e545276c5c7f6 100644
--- a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
+++ b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include
+#include
 
 _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
                                                   unsigned int semantics) {
@@ -16,5 +17,12 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
 _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
 __spirv_ControlBarrier(unsigned int scope, unsigned int memory,
                        unsigned int semantics) {
-  __syncthreads();
+  if (scope == Subgroup) {
+    uint FULL_MASK = 0xFFFFFFFF;
+    uint max_size = __spirv_SubgroupMaxSize();
+    uint sg_size = __spirv_SubgroupSize();
+    __nvvm_bar_warp_sync(FULL_MASK >> (max_size - sg_size));
+  } else {
+    __syncthreads();
+  }
 }
diff --git a/sycl/test/sub_group/barrier.cpp b/sycl/test/sub_group/barrier.cpp
index b8aeefa9ca0e3..aca849411b12a 100644
--- a/sycl/test/sub_group/barrier.cpp
+++ b/sycl/test/sub_group/barrier.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -73,7 +70,7 @@ template void check(queue &Queue, size_t G = 240, size_t L = 60) {
 }
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }