From 377ddfd8c23120b6c471910e4fe835bb2fb1eaf6 Mon Sep 17 00:00:00 2001 From: "Cai, Justin" Date: Tue, 28 May 2024 10:14:09 -0700 Subject: [PATCH 1/4] [SYCL] Record aspect names when computing device requirements --- .../llvm/SYCLLowerIR/SYCLDeviceRequirements.h | 14 ++++++- .../SYCLLowerIR/SYCLDeviceRequirements.cpp | 24 +++++++----- llvm/tools/sycl-post-link/sycl-post-link.cpp | 39 +++---------------- 3 files changed, 32 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h index abb78b51af154..8891f7f550c5f 100644 --- a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h +++ b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h @@ -30,7 +30,19 @@ class PropertyValue; } struct SYCLDeviceRequirements { - std::set Aspects; + struct AspectNameValuePair { + llvm::SmallString<64> Name; + uint32_t Value; + AspectNameValuePair(StringRef Name, uint32_t Value) + : Name(Name), Value(Value) {} + bool operator<(const AspectNameValuePair &rhs) const { + return Value < rhs.Value; + } + bool operator==(const AspectNameValuePair &rhs) const { + return Value == rhs.Value; + } + }; + std::set Aspects; std::set FixedTarget; std::optional> ReqdWorkGroupSize; std::optional WorkGroupNumDim; diff --git a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp index 8ebec7f54013d..60424c04027fa 100644 --- a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp @@ -43,19 +43,20 @@ llvm::computeDeviceRequirements(const module_split::ModuleDesc &MD) { // Process all functions in the module for (const Function &F : MD.getModule()) { if (auto *MDN = F.getMetadata("sycl_used_aspects")) { - for (auto &MDOp : MDN->operands()) { - int64_t Val; - if (auto Pair = dyn_cast(MDOp)) { + for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) { + StringRef AspectName = ""; + int64_t AspectValue; + if (auto Pair = dyn_cast(MDN->getOperand(I))) { assert(Pair->getNumOperands() == 2); - Val = mdconst::extract(Pair->getOperand(1)) - ->getZExtValue(); + AspectName = ExtractStringFromMDNodeOperand(Pair, 0); + AspectValue = ExtractSignedIntegerFromMDNodeOperand(Pair, 1); } else { - Val = mdconst::extract(MDOp)->getZExtValue(); + AspectValue = ExtractSignedIntegerFromMDNodeOperand(MDN, I); } // Don't put internal aspects (with negative integer value) into the // requirements, they are used only for device image splitting. - if (Val >= 0) - Reqs.Aspects.insert(Val); + if (AspectValue >= 0) + Reqs.Aspects.insert({AspectName, uint32_t(AspectValue)}); } } @@ -133,8 +134,11 @@ std::map SYCLDeviceRequirements::asMap() const { // For all properties except for "aspects", we'll only add the // value to the map if the corresponding value from // SYCLDeviceRequirements has a value/is non-empty. - Requirements["aspects"] = - std::vector(Aspects.begin(), Aspects.end()); + std::vector AspectValues; + AspectValues.reserve(Aspects.size()); + for (auto Aspect : Aspects) + AspectValues.push_back(Aspect.Value); + Requirements["aspects"] = std::move(AspectValues); if (!FixedTarget.empty()) Requirements["fixed_target"] = diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 6c6db956c383a..0d060e0c9aaf9 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -1014,41 +1014,12 @@ bool isTargetCompatibleWithModule(const std::optional &Target, DeviceConfigFile::TargetTable[*Target]; const SYCLDeviceRequirements &ModuleReqs = IrMD.getOrComputeDeviceRequirements(); - // The device config file data stores the target's supported - // aspects as a vector of the strings, so we need to translate - // the values to a common format. - const NamedMDNode *Node = IrMD.getModule().getNamedMetadata("sycl_aspects"); - if (Node) { - SmallMapVector AspectNameToValue; - for (const MDNode *N : Node->operands()) { - assert(N->getNumOperands() == 2 && - "Each operand of sycl_aspects must be a pair."); - - // The aspect's name is the first operand. - const auto *AspectName = cast(N->getOperand(0)); - - // The aspect's integral value is the second operand. - const auto *AspectCAM = cast(N->getOperand(1)); - const Constant *AspectC = AspectCAM->getValue(); - - AspectNameToValue[AspectName->getString()] = - cast(AspectC)->getSExtValue(); - } - - // Make the set of aspects values the target supports. - SmallSet TargetAspectValueSet; - for (const auto &Aspect : TargetInfo.aspects) { - auto It = AspectNameToValue.find(Aspect); - assert(It != AspectNameToValue.end() && "Aspect value mapping unknown!"); - TargetAspectValueSet.insert(It->second); - } - // Now check to see if all the requirements of the input module - // are compatbile with the target. - for (const auto &Aspect : ModuleReqs.Aspects) { - if (!TargetAspectValueSet.contains(Aspect)) - return false; - } + // Check to see if all the requirements of the input module + // are compatbile with the target. + for (const auto &Aspect : ModuleReqs.Aspects) { + if (!is_contained(TargetInfo.aspects, Aspect.Name)) + return false; } // Check if module sub group size is compatible with the target. From 0485adb94f685b4aa712f77236e04215714e5247 Mon Sep 17 00:00:00 2001 From: "Cai, Justin" Date: Tue, 28 May 2024 10:17:42 -0700 Subject: [PATCH 2/4] Update sycl_used_aspects in multi-filtered-outputs.ll --- llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll index 1f014410d0a1c..3aa947f8bea55 100644 --- a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll +++ b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll @@ -184,7 +184,7 @@ attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trap !64 = !{!"clang version 19.0.0git (/ws/llvm/clang a7f3a637bdd6299831f903bbed9e8d069fea5c86)"} !65 = !{i32 233} !66 = !{i32 -1} -!67 = !{i32 6} +!67 = !{!9} !68 = !{} !69 = !{i1 false} !70 = !{!71, !71, i64 0} From 7c1c25ae403fa6fe2667f5aeb62f72ffa7cbc2c1 Mon Sep 17 00:00:00 2001 From: "Cai, Justin" Date: Tue, 28 May 2024 10:48:15 -0700 Subject: [PATCH 3/4] Cleanup multiple-filtered-outputs.ll --- .../multiple-filtered-outputs.ll | 100 ++---------------- 1 file changed, 10 insertions(+), 90 deletions(-) diff --git a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll index 3aa947f8bea55..7c2ab6e91b925 100644 --- a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll +++ b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll @@ -65,136 +65,56 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) local_unnamed_addr #0 !srcloc !65 !kernel_arg_buffer_location !66 !sycl_used_aspects !67 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 { +define spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) #0 !sycl_used_aspects !67 { entry: - %0 = load double, ptr addrspace(1) %_arg_out, align 8, !tbaa !70 + %0 = load double, ptr addrspace(1) %_arg_out, align 8 %mul.i = fmul double %0, 2.000000e-01 - store double %mul.i, ptr addrspace(1) %_arg_out, align 8, !tbaa !70 + store double %mul.i, ptr addrspace(1) %_arg_out, align 8 ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) local_unnamed_addr #0 !srcloc !74 !kernel_arg_buffer_location !66 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 { +define spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) #0 { entry: - %0 = load float, ptr addrspace(1) %_arg_out, align 4, !tbaa !75 + %0 = load float, ptr addrspace(1) %_arg_out, align 4 %mul.i = fmul float %0, 0x3FC99999A0000000 - store float %mul.i, ptr addrspace(1) %_arg_out, align 4, !tbaa !75 + store float %mul.i, ptr addrspace(1) %_arg_out, align 4 ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_8() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !78 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_8() #0 !intel_reqd_sub_group_size !78 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_16() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !79 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_16() #0 !intel_reqd_sub_group_size !79 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_32() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !80 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_32() #0 !intel_reqd_sub_group_size !80 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_64() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !81 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_64() #0 !intel_reqd_sub_group_size !81 { entry: ret void } -declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...) - attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="double.cpp" "sycl-optlevel"="3" "uniform-work-group-size"="true" } !llvm.module.flags = !{!0, !1} !opencl.spir.version = !{!2} !spirv.Source = !{!3} -!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63} !llvm.ident = !{!64} !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 7, !"frame-pointer", i32 2} !2 = !{i32 1, i32 2} !3 = !{i32 4, i32 100000} -!4 = !{!"cpu", i32 1} -!5 = !{!"gpu", i32 2} -!6 = !{!"accelerator", i32 3} -!7 = !{!"custom", i32 4} -!8 = !{!"fp16", i32 5} !9 = !{!"fp64", i32 6} -!10 = !{!"image", i32 9} -!11 = !{!"online_compiler", i32 10} -!12 = !{!"online_linker", i32 11} -!13 = !{!"queue_profiling", i32 12} -!14 = !{!"usm_device_allocations", i32 13} -!15 = !{!"usm_host_allocations", i32 14} -!16 = !{!"usm_shared_allocations", i32 15} -!17 = !{!"usm_system_allocations", i32 17} -!18 = !{!"ext_intel_pci_address", i32 18} -!19 = !{!"ext_intel_gpu_eu_count", i32 19} -!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20} -!21 = !{!"ext_intel_gpu_slices", i32 21} -!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22} -!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23} -!24 = !{!"ext_intel_max_mem_bandwidth", i32 24} -!25 = !{!"ext_intel_mem_channel", i32 25} -!26 = !{!"usm_atomic_host_allocations", i32 26} -!27 = !{!"usm_atomic_shared_allocations", i32 27} -!28 = !{!"atomic64", i32 28} -!29 = !{!"ext_intel_device_info_uuid", i32 29} -!30 = !{!"ext_oneapi_srgb", i32 30} -!31 = !{!"ext_oneapi_native_assert", i32 31} -!32 = !{!"host_debuggable", i32 32} -!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33} -!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34} -!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35} -!36 = !{!"ext_intel_free_memory", i32 36} -!37 = !{!"ext_intel_device_id", i32 37} -!38 = !{!"ext_intel_memory_clock_rate", i32 38} -!39 = !{!"ext_intel_memory_bus_width", i32 39} -!40 = !{!"emulated", i32 40} -!41 = !{!"ext_intel_legacy_image", i32 41} -!42 = !{!"ext_oneapi_bindless_images", i32 42} -!43 = !{!"ext_oneapi_bindless_images_shared_usm", i32 43} -!44 = !{!"ext_oneapi_bindless_images_1d_usm", i32 44} -!45 = !{!"ext_oneapi_bindless_images_2d_usm", i32 45} -!46 = !{!"ext_oneapi_interop_memory_import", i32 46} -!47 = !{!"ext_oneapi_interop_memory_export", i32 47} -!48 = !{!"ext_oneapi_interop_semaphore_import", i32 48} -!49 = !{!"ext_oneapi_interop_semaphore_export", i32 49} -!50 = !{!"ext_oneapi_mipmap", i32 50} -!51 = !{!"ext_oneapi_mipmap_anisotropy", i32 51} -!52 = !{!"ext_oneapi_mipmap_level_reference", i32 52} -!53 = !{!"ext_intel_esimd", i32 53} -!54 = !{!"ext_oneapi_ballot_group", i32 54} -!55 = !{!"ext_oneapi_fixed_size_group", i32 55} -!56 = !{!"ext_oneapi_opportunistic_group", i32 56} -!57 = !{!"ext_oneapi_tangle_group", i32 57} -!58 = !{!"ext_intel_matrix", i32 58} -!59 = !{!"int64_base_atomics", i32 7} -!60 = !{!"int64_extended_atomics", i32 8} -!61 = !{!"usm_system_allocator", i32 17} -!62 = !{!"usm_restricted_shared_allocations", i32 16} -!63 = !{!"host", i32 0} !64 = !{!"clang version 19.0.0git (/ws/llvm/clang a7f3a637bdd6299831f903bbed9e8d069fea5c86)"} -!65 = !{i32 233} -!66 = !{i32 -1} !67 = !{!9} -!68 = !{} -!69 = !{i1 false} -!70 = !{!71, !71, i64 0} -!71 = !{!"double", !72, i64 0} -!72 = !{!"omnipotent char", !73, i64 0} -!73 = !{!"Simple C++ TBAA"} -!74 = !{i32 364} -!75 = !{!76, !76, i64 0} -!76 = !{!"float", !72, i64 0} -!77 = !{i32 529} !78 = !{i32 8} !79 = !{i32 16} !80 = !{i32 32} From 337ef91512f8f9eb3dc56838e54a0edf0c8f222c Mon Sep 17 00:00:00 2001 From: "Cai, Justin" Date: Wed, 29 May 2024 14:57:28 -0700 Subject: [PATCH 4/4] Add E2E tests --- sycl/test-e2e/AOT/double.cpp | 26 +++++++++++ sycl/test-e2e/AOT/reqd-sg-size.cpp | 74 ++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 sycl/test-e2e/AOT/double.cpp create mode 100644 sycl/test-e2e/AOT/reqd-sg-size.cpp diff --git a/sycl/test-e2e/AOT/double.cpp b/sycl/test-e2e/AOT/double.cpp new file mode 100644 index 0000000000000..813fb194e017b --- /dev/null +++ b/sycl/test-e2e/AOT/double.cpp @@ -0,0 +1,26 @@ +// This test ensures that a program that has a kernel +// using fp64 can be compiled AOT. + +// REQUIRES: ocloc +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s + +#include + +using namespace sycl; + +int main() { + queue q; + if (q.get_device().has(aspect::fp64)) { + double d = 2.5; + { + buffer buf(&d, 1); + q.submit([&](handler &cgh) { + accessor acc{buf, cgh}; + cgh.single_task([=] { acc[0] *= 2; }); + }); + } + std::cout << d << "\n"; + } +} diff --git a/sycl/test-e2e/AOT/reqd-sg-size.cpp b/sycl/test-e2e/AOT/reqd-sg-size.cpp new file mode 100644 index 0000000000000..5272f25e83017 --- /dev/null +++ b/sycl/test-e2e/AOT/reqd-sg-size.cpp @@ -0,0 +1,74 @@ +// This test ensures that a program that has a kernel +// using various required sub-group sizes can be compiled AOT. + +// REQUIRES: ocloc +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s + +#include +#include + +#include + +using namespace sycl; + +template class kernel_name; + +template struct SubgroupDispatcher { + std::vector> fails; + SubgroupDispatcher(queue &q) : q(q) {} + + void operator()(const std::vector &v) { + for (auto i : v) + (*this)(i); + } + + void operator()(size_t n) { (dispatch(n), ...); } + +private: + queue &q; + + template void dispatch(size_t n) { + if (n == size) { + size_t res = 0; + { + buffer buf(&res, 1); + q.submit([&](handler &cgh) { + accessor acc{buf, cgh}; + cgh.parallel_for>( + nd_range<1>(1, 1), + [=](auto item) [[intel::reqd_sub_group_size(size)]] { + acc[0] = item.get_sub_group().get_max_local_range()[0]; + }); + }); + } + if (res != size) + fails.push_back({res, size}); + } + } +}; + +int main() { + queue q; + auto ctx = q.get_context(); + auto dev = q.get_device(); + auto sizes = dev.get_info(); + std::cout << " sub-group sizes supported by the device: " << sizes[0]; + for (int i = 1; i < sizes.size(); ++i) { + std::cout << ", " << sizes[i]; + } + std::cout << '\n'; + + using dispatcher_t = SubgroupDispatcher<4, 8, 16, 32, 64, 128>; + dispatcher_t dispatcher(q); + dispatcher(sizes); + if (dispatcher.fails.size() > 0) { + for (auto [actual, expected] : dispatcher.fails) { + std::cout << "actual: " << actual << "\n" + << "expected: " << expected << "\n"; + } + } else { + std::cout << "pass\n"; + } +}