From 98e088f88efe8c81e025ca27ce61e305a535bd71 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 6 May 2024 15:13:29 -0700 Subject: [PATCH 1/2] [SYCL] Optimize sub-group group_load via BlockRead in simple cases --- .../sycl/detail/generic_type_traits.hpp | 7 + sycl/include/sycl/detail/helpers.hpp | 1 + .../oneapi/experimental/group_load_store.hpp | 169 +++++++- sycl/test/check_device_code/group_load.cpp | 388 ++++++++++-------- 4 files changed, 386 insertions(+), 179 deletions(-) diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index 1ace8f31a2f4..3b0ce7988f57 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -262,6 +262,13 @@ template using make_unsinged_integer_t = make_type_t; +template +using cl_unsigned = std::conditional_t< + Size == 1, opencl::cl_uchar, + std::conditional_t< + Size == 2, opencl::cl_ushort, + std::conditional_t>>; + // select_apply_cl_scalar_t selects from T8/T16/T32/T64 basing on // sizeof(IN). expected to handle scalar types. template diff --git a/sycl/include/sycl/detail/helpers.hpp b/sycl/include/sycl/detail/helpers.hpp index 7e1fcb00a8ae..9c7d0eddd59e 100644 --- a/sycl/include/sycl/detail/helpers.hpp +++ b/sycl/include/sycl/detail/helpers.hpp @@ -253,6 +253,7 @@ void loop_impl(std::integer_sequence, F &&f) { template void loop(F &&f) { loop_impl(std::make_index_sequence{}, std::forward(f)); } +inline constexpr bool is_power_of_two(int x) { return (x & (x - 1)) == 0; } } // namespace detail } // namespace _V1 diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp index d158758080c8..ef718b6f10f3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -13,6 +13,8 @@ #include #include +#include + namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { @@ -106,6 +108,116 @@ int get_mem_idx(GroupTy g, int vec_or_array_idx) { return g.get_local_linear_id() + g.get_local_linear_range() * vec_or_array_idx; } + +// SPIR-V extension: +// https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_subgroups.asciidoc, +// however it doesn't describe limitations/requirements. Those seem to be +// listed in the Intel OpenCL extensions for sub-groups: +// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html +// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_char.html +// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_long.html +// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_short.html +// Reads require 4-byte alignment, writes 16-byte alignment. Supported +// sizes: +// +// +--------+-------------+ +// | uchar | 1,2,4,8,16 | +// | ushort | 1,2,4,8 | +// | uint | 1,2,4,8 | +// | ulong | 1,2,4,8 | +// +--------+-------------+ +// +// Utility type traits below are used to map user type to one of the block +// read/write types above. + +template +struct BlockInfo { + using value_type = + remove_decoration_t::value_type>; + + static constexpr int block_size = + sizeof(value_type) * (blocked ? ElementsPerWorkItem : 1); + static constexpr int num_blocks = blocked ? 1 : ElementsPerWorkItem; + static constexpr bool has_builtin = + detail::is_power_of_two(block_size) && + detail::is_power_of_two(num_blocks) && block_size <= 8 && + (num_blocks <= 8 || (num_blocks == 16 && block_size == 1)); +}; + +template struct BlockTypeInfo; + +template +struct BlockTypeInfo> { + using BlockInfoTy = BlockInfo; + static_assert(BlockInfoTy::has_builtin); + + using block_type = detail::cl_unsigned; + + using block_pointer_elem_type = std::conditional_t< + std::is_const_v::reference>>, + std::add_const_t, block_type>; + + using block_pointer_type = typename detail::DecoratedType< + block_pointer_elem_type, access::address_space::global_space>::type *; + using block_load_type = std::conditional_t< + BlockInfoTy::num_blocks == 1, block_type, + detail::ConvertToOpenCLType_t>>; +}; + +// Returns either a pointer suitable to use in a block read/write builtin or +// nullptr if some legality conditions aren't satisfied. +template +auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) { + using value_type = + remove_decoration_t::value_type>; + using iter_no_cv = std::remove_cv_t; + + constexpr bool blocked = detail::isBlocked(props); + using BlkInfo = BlockInfo; + +#if defined(__SPIR__) + // TODO: What about non-Intel SPIR-V devices? + constexpr bool is_spir = true; +#else + constexpr bool is_spir = false; +#endif + + if constexpr (!is_spir || !BlkInfo::has_builtin) { + return nullptr; + } else if constexpr (!props.template has_property()) { + return nullptr; + } else if constexpr (detail::is_multi_ptr_v) { + return get_block_op_ptr( + iter.get_decorated(), props); + } else if constexpr (!std::is_pointer_v) { + if constexpr (props.template has_property()) + return get_block_op_ptr(&*iter, + props); + else + return nullptr; + } else { + __builtin_assume(iter != nullptr); + static_assert(BlkInfo::has_builtin); + bool aligned = alignof(value_type) >= RequiredAlign || + reinterpret_cast(iter) % RequiredAlign == 0; + + constexpr auto AS = detail::deduce_AS::value; + using block_pointer_type = + typename BlockTypeInfo::block_pointer_type; + if constexpr (AS == access::address_space::global_space) { + return aligned ? reinterpret_cast(iter) : nullptr; + } else if constexpr (AS == access::address_space::generic_space) { + return aligned ? reinterpret_cast( + __SYCL_GenericCastToPtrExplicit_ToGlobal( + iter)) + : nullptr; + } else { + return nullptr; + } + } +} } // namespace detail // Load API span overload. @@ -117,17 +229,66 @@ std::enable_if_t && group_load(Group g, InputIteratorT in_ptr, span out, Properties props = {}) { constexpr bool blocked = detail::isBlocked(props); + using use_naive = + detail::merged_properties_t; if constexpr (props.template has_property()) { group_barrier(g); for (int i = 0; i < out.size(); ++i) out[i] = in_ptr[detail::get_mem_idx(g, i)]; group_barrier(g); - } else { - using use_naive = - detail::merged_properties_t; + return; + } else if constexpr (!std::is_same_v) { return group_load(g, in_ptr, out, use_naive{}); + } else { + auto ptr = + detail::get_block_op_ptr<4 /* load align */, ElementsPerWorkItem>( + in_ptr, props); + if (!ptr) + return group_load(g, in_ptr, out, use_naive{}); + + if constexpr (!std::is_same_v) { + // Do optimized load. + using value_type = remove_decoration_t< + typename std::iterator_traits::value_type>; + + auto load = __spirv_SubgroupBlockReadINTEL< + typename detail::BlockTypeInfo>::block_load_type>( + ptr); + + // TODO: accessor_iterator's value_type is weird, so we need + // `std::remove_const_t` below: + // + // static_assert( + // std::is_same_v< + // typename std::iterator_traits< + // sycl::detail::accessor_iterator>::value_type, + // const int>); + // + // yet + // + // static_assert( + // std::is_same_v< + // typename std::iterator_traits::value_type, int>); + + if constexpr (std::is_same_v, OutputT>) { + static_assert(sizeof(load) == out.size_bytes()); + std::memcpy(out.begin(), &load, out.size_bytes()); + } else { + std::remove_const_t values[ElementsPerWorkItem]; + static_assert(sizeof(load) == sizeof(values)); + std::memcpy(values, &load, sizeof(values)); + + // Note: can't `memcpy` directly into `out` because that might bypass + // an implicit conversion required by the specification. + for (int i = 0; i < ElementsPerWorkItem; ++i) + out[i] = values[i]; + } + + return; + } } } diff --git a/sycl/test/check_device_code/group_load.cpp b/sycl/test/check_device_code/group_load.cpp index 75b8130568e7..2ef19704e8f8 100644 --- a/sycl/test/check_device_code/group_load.cpp +++ b/sycl/test/check_device_code/group_load.cpp @@ -44,13 +44,13 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_RSM_T2_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] comdat !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4:[0-9]+]] // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 // CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP0]] to i64 // CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7:![0-9]+]] // CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void // Check that optimized implementation is selected. @@ -60,13 +60,10 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_RSN_T2_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META5]] !sycl_fixed_targets [[META6]] { // CHECK-NEXT: entry: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] +// CHECK-NEXT: store i32 [[CALL4_I]], ptr addrspace(4) [[OUT]], align 4 // CHECK-NEXT: ret void // Check that contiguous_memory can be auto-detected. @@ -76,13 +73,10 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESM_SK_RSL_T2_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.2") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META5]] !sycl_fixed_targets [[META6]] { // CHECK-NEXT: entry: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] +// CHECK-NEXT: store i32 [[CALL4_I]], ptr addrspace(4) [[OUT]], align 4 // CHECK-NEXT: ret void // SYCL 2020's accessor can't be statically known to be contiguous. @@ -99,14 +93,14 @@ sycl::ext::oneapi::experimental::group_load, span, opt_striped); // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META45:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META48:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP51:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: cleanup: +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) +// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA44]] +// CHECK-NEXT: store <2 x i32> [[CALL4]], ptr addrspace(4) [[TMP0]], align 4 // CHECK-NEXT: ret void // Check that contiguous_memory can be auto-detected. @@ -345,30 +360,12 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< sycl::sub_group, plain_global_ptr, span, full_group_striped); // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESM_SK_NS0_4spanISL_XT2_EEET3_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.13") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META52:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META55:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP58:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: cleanup: +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) +// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA44]] +// CHECK-NEXT: store <2 x i32> [[CALL4]], ptr addrspace(4) [[TMP0]], align 4 // CHECK-NEXT: ret void // SYCL 2020's accessor can't be statically known to be contiguous. @@ -381,15 +378,15 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.13") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13]] +// CHECK-NEXT: [[AGG_TMP3_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[AGG_TMP3_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 +// CHECK-NEXT: [[AGG_TMP3_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP3_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13]] // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META59:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META62:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META47:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META50:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP3_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP3_SROA_2_0_COPYLOAD]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: // CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] @@ -405,9 +402,9 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP65:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP53:![0-9]+]] // CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEENSC_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void // Explicit property - optimize. @@ -419,18 +416,24 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_SO_NS0_4spanISP_XT2_EEET3_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13]] +// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[AGG_TMP_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 +// CHECK-NEXT: [[AGG_TMP_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13]] +// CHECK-NEXT: [[ADD_PTR_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I]], i32 noundef 5) #[[ATTR5]] +// CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I]], null +// CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +// CHECK: if.then: // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META66:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META69:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META54:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META57:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] // CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 // CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEET3__EXIT:%.*]] // CHECK: for.body.i: @@ -438,14 +441,21 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] // CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] // CHECK-NEXT: [[CONV3_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV3_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I]], i64 [[CONV3_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA7]] // CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP72:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP60:![0-9]+]] // CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: br label [[CLEANUP:%.*]] +// CHECK: if.end: +// CHECK-NEXT: [[CALL6:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I_I_I]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA44]] +// CHECK-NEXT: store <2 x i32> [[CALL6]], ptr addrspace(4) [[TMP5]], align 4 +// CHECK-NEXT: br label [[CLEANUP]] +// CHECK: cleanup: // CHECK-NEXT: ret void // Run-time alignment check is needed if type's alignment is less than BlockRead @@ -456,29 +466,43 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.15") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META73:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META76:![0-9]+]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[IN_PTR]] to i64 +// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 3 +// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +// CHECK: if.then: +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META61:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META64:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I14:%.*]] = icmp ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I14]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] // CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 // CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I]], align 1, !tbaa [[TBAA15]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i8 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA15]] +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I]], align 1, !tbaa [[TBAA15]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] +// CHECK-NEXT: store i8 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA15]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP79:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP67:![0-9]+]] // CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: br label [[CLEANUP:%.*]] +// CHECK: if.end: +// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <2 x i8> @_Z30__spirv_SubgroupBlockReadINTELIDv2_hET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA68:![0-9]+]] +// CHECK-NEXT: store <2 x i8> [[CALL4]], ptr addrspace(4) [[TMP6]], align 1 +// CHECK-NEXT: br label [[CLEANUP]] +// CHECK: cleanup: // CHECK-NEXT: ret void // Just because there is a blocked data layout testcase, nothing inherently @@ -489,29 +513,43 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( // CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.5") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META80:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META83:![0-9]+]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[IN_PTR]] to i64 +// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 3 +// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +// CHECK: if.then: +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META70:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META73:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I14:%.*]] = icmp ult i32 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I14]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] // CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 // CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA20]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA20]] +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA20]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] +// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA20]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP86:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP76:![0-9]+]] // CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: br label [[CLEANUP:%.*]] +// CHECK: if.end: +// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv4_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA24]] +// CHECK-NEXT: store <4 x i16> [[CALL4]], ptr addrspace(4) [[TMP6]], align 2 +// CHECK-NEXT: br label [[CLEANUP]] +// CHECK: cleanup: // CHECK-NEXT: ret void // Check for non-power-of-two size. @@ -523,9 +561,9 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META87:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META90:![0-9]+]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META77:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META80:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: // CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] @@ -541,9 +579,9 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP93:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP83:![0-9]+]] // CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void // Even though power of two, still too many to map directly onto BloadRead API. @@ -555,9 +593,9 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META94:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META97:![0-9]+]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META84:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META87:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: // CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] @@ -573,9 +611,9 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP100:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP90:![0-9]+]] // CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void // Non-power of two case bigger than max natively supported power of two case. @@ -587,9 +625,9 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META101:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META104:![0-9]+]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META91:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META94:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: // CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] @@ -605,7 +643,7 @@ template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< // CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP107:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP97:![0-9]+]] // CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void From 4512a6d331820c52f2e7af835ad3d4b0ce90122f Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 9 May 2024 11:19:30 -0700 Subject: [PATCH 2/2] Address code review comments --- .../oneapi/experimental/group_load_store.hpp | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp index ef718b6f10f3..d9b661d7c82e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -120,24 +120,28 @@ int get_mem_idx(GroupTy g, int vec_or_array_idx) { // Reads require 4-byte alignment, writes 16-byte alignment. Supported // sizes: // -// +--------+-------------+ -// | uchar | 1,2,4,8,16 | -// | ushort | 1,2,4,8 | -// | uint | 1,2,4,8 | -// | ulong | 1,2,4,8 | -// +--------+-------------+ +// +------------+-------------+ +// | block type | # of blocks | +// +------------+-------------+ +// | uchar | 1,2,4,8,16 | +// | ushort | 1,2,4,8 | +// | uint | 1,2,4,8 | +// | ulong | 1,2,4,8 | +// +------------+-------------+ // // Utility type traits below are used to map user type to one of the block // read/write types above. -template +template struct BlockInfo { using value_type = remove_decoration_t::value_type>; static constexpr int block_size = - sizeof(value_type) * (blocked ? ElementsPerWorkItem : 1); - static constexpr int num_blocks = blocked ? 1 : ElementsPerWorkItem; + sizeof(value_type) * (Blocked ? ElementsPerWorkItem : 1); + static constexpr int num_blocks = Blocked ? 1 : ElementsPerWorkItem; + // There is an overload in the table above that could be used for the block + // operation: static constexpr bool has_builtin = detail::is_power_of_two(block_size) && detail::is_power_of_two(num_blocks) && block_size <= 8 && @@ -146,9 +150,9 @@ struct BlockInfo { template struct BlockTypeInfo; -template -struct BlockTypeInfo> { - using BlockInfoTy = BlockInfo; +template +struct BlockTypeInfo> { + using BlockInfoTy = BlockInfo; static_assert(BlockInfoTy::has_builtin); using block_type = detail::cl_unsigned; @@ -198,21 +202,25 @@ auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) { else return nullptr; } else { + // Load/store to/from nullptr would be an UB, this assume allows the + // compiler to optimize the IR further. __builtin_assume(iter != nullptr); - static_assert(BlkInfo::has_builtin); - bool aligned = alignof(value_type) >= RequiredAlign || - reinterpret_cast(iter) % RequiredAlign == 0; + + // No early return as that would mess up return type deduction. + bool is_aligned = alignof(value_type) >= RequiredAlign || + reinterpret_cast(iter) % RequiredAlign == 0; constexpr auto AS = detail::deduce_AS::value; using block_pointer_type = typename BlockTypeInfo::block_pointer_type; if constexpr (AS == access::address_space::global_space) { - return aligned ? reinterpret_cast(iter) : nullptr; + return is_aligned ? reinterpret_cast(iter) : nullptr; } else if constexpr (AS == access::address_space::generic_space) { - return aligned ? reinterpret_cast( - __SYCL_GenericCastToPtrExplicit_ToGlobal( - iter)) - : nullptr; + return is_aligned + ? reinterpret_cast( + __SYCL_GenericCastToPtrExplicit_ToGlobal( + iter)) + : nullptr; } else { return nullptr; }