Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,19 @@ class PropertyValue;
}

struct SYCLDeviceRequirements {
std::set<uint32_t> Aspects;
struct AspectNameValuePair {
llvm::SmallString<64> Name;
uint32_t Value;
AspectNameValuePair(StringRef Name, uint32_t Value)
: Name(Name), Value(Value) {}
bool operator<(const AspectNameValuePair &rhs) const {
return Value < rhs.Value;
}
bool operator==(const AspectNameValuePair &rhs) const {
return Value == rhs.Value;
}
};
std::set<AspectNameValuePair> Aspects;
std::set<uint32_t> FixedTarget;
std::optional<llvm::SmallVector<uint64_t, 3>> ReqdWorkGroupSize;
std::optional<uint32_t> WorkGroupNumDim;
Expand Down
24 changes: 14 additions & 10 deletions llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,20 @@ llvm::computeDeviceRequirements(const module_split::ModuleDesc &MD) {
// Process all functions in the module
for (const Function &F : MD.getModule()) {
if (auto *MDN = F.getMetadata("sycl_used_aspects")) {
for (auto &MDOp : MDN->operands()) {
int64_t Val;
if (auto Pair = dyn_cast<MDNode>(MDOp)) {
for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
StringRef AspectName = "";
int64_t AspectValue;
if (auto Pair = dyn_cast<MDNode>(MDN->getOperand(I))) {
assert(Pair->getNumOperands() == 2);
Val = mdconst::extract<ConstantInt>(Pair->getOperand(1))
->getZExtValue();
AspectName = ExtractStringFromMDNodeOperand(Pair, 0);
AspectValue = ExtractSignedIntegerFromMDNodeOperand(Pair, 1);
} else {
Val = mdconst::extract<ConstantInt>(MDOp)->getZExtValue();
AspectValue = ExtractSignedIntegerFromMDNodeOperand(MDN, I);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for my understanding: we could have a non-pair value in case of an internal aspect with a negative value and no name, right? Plus, it should serve as path for backwards compatibility if we link object files produced by an older toolchain, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, those are both correct.

}
// Don't put internal aspects (with negative integer value) into the
// requirements, they are used only for device image splitting.
if (Val >= 0)
Reqs.Aspects.insert(Val);
if (AspectValue >= 0)
Reqs.Aspects.insert({AspectName, uint32_t(AspectValue)});
}
}

Expand Down Expand Up @@ -133,8 +134,11 @@ std::map<StringRef, util::PropertyValue> SYCLDeviceRequirements::asMap() const {
// For all properties except for "aspects", we'll only add the
// value to the map if the corresponding value from
// SYCLDeviceRequirements has a value/is non-empty.
Requirements["aspects"] =
std::vector<uint32_t>(Aspects.begin(), Aspects.end());
std::vector<uint32_t> AspectValues;
AspectValues.reserve(Aspects.size());
for (auto Aspect : Aspects)
AspectValues.push_back(Aspect.Value);
Requirements["aspects"] = std::move(AspectValues);

if (!FixedTarget.empty())
Requirements["fixed_target"] =
Expand Down
102 changes: 11 additions & 91 deletions llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,136 +65,56 @@
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
target triple = "spir64-unknown-unknown"

; Function Attrs: mustprogress norecurse nounwind
define weak_odr dso_local spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) local_unnamed_addr #0 !srcloc !65 !kernel_arg_buffer_location !66 !sycl_used_aspects !67 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 {
define spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) #0 !sycl_used_aspects !67 {
entry:
%0 = load double, ptr addrspace(1) %_arg_out, align 8, !tbaa !70
%0 = load double, ptr addrspace(1) %_arg_out, align 8
%mul.i = fmul double %0, 2.000000e-01
store double %mul.i, ptr addrspace(1) %_arg_out, align 8, !tbaa !70
store double %mul.i, ptr addrspace(1) %_arg_out, align 8
ret void
}

; Function Attrs: mustprogress norecurse nounwind
define weak_odr dso_local spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) local_unnamed_addr #0 !srcloc !74 !kernel_arg_buffer_location !66 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 {
define spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) #0 {
entry:
%0 = load float, ptr addrspace(1) %_arg_out, align 4, !tbaa !75
%0 = load float, ptr addrspace(1) %_arg_out, align 4
%mul.i = fmul float %0, 0x3FC99999A0000000
store float %mul.i, ptr addrspace(1) %_arg_out, align 4, !tbaa !75
store float %mul.i, ptr addrspace(1) %_arg_out, align 4
ret void
}

; Function Attrs: mustprogress norecurse nounwind
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_8() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !78 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
define spir_kernel void @reqd_sub_group_size_kernel_8() #0 !intel_reqd_sub_group_size !78 {
entry:
ret void
}

; Function Attrs: mustprogress norecurse nounwind
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_16() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !79 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
define spir_kernel void @reqd_sub_group_size_kernel_16() #0 !intel_reqd_sub_group_size !79 {
entry:
ret void
}

; Function Attrs: mustprogress norecurse nounwind
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_32() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !80 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
define spir_kernel void @reqd_sub_group_size_kernel_32() #0 !intel_reqd_sub_group_size !80 {
entry:
ret void
}

; Function Attrs: mustprogress norecurse nounwind
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_64() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !81 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
define spir_kernel void @reqd_sub_group_size_kernel_64() #0 !intel_reqd_sub_group_size !81 {
entry:
ret void
}

declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...)

attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="double.cpp" "sycl-optlevel"="3" "uniform-work-group-size"="true" }

!llvm.module.flags = !{!0, !1}
!opencl.spir.version = !{!2}
!spirv.Source = !{!3}
!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63}
!llvm.ident = !{!64}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"frame-pointer", i32 2}
!2 = !{i32 1, i32 2}
!3 = !{i32 4, i32 100000}
!4 = !{!"cpu", i32 1}
!5 = !{!"gpu", i32 2}
!6 = !{!"accelerator", i32 3}
!7 = !{!"custom", i32 4}
!8 = !{!"fp16", i32 5}
!9 = !{!"fp64", i32 6}
!10 = !{!"image", i32 9}
!11 = !{!"online_compiler", i32 10}
!12 = !{!"online_linker", i32 11}
!13 = !{!"queue_profiling", i32 12}
!14 = !{!"usm_device_allocations", i32 13}
!15 = !{!"usm_host_allocations", i32 14}
!16 = !{!"usm_shared_allocations", i32 15}
!17 = !{!"usm_system_allocations", i32 17}
!18 = !{!"ext_intel_pci_address", i32 18}
!19 = !{!"ext_intel_gpu_eu_count", i32 19}
!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20}
!21 = !{!"ext_intel_gpu_slices", i32 21}
!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22}
!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23}
!24 = !{!"ext_intel_max_mem_bandwidth", i32 24}
!25 = !{!"ext_intel_mem_channel", i32 25}
!26 = !{!"usm_atomic_host_allocations", i32 26}
!27 = !{!"usm_atomic_shared_allocations", i32 27}
!28 = !{!"atomic64", i32 28}
!29 = !{!"ext_intel_device_info_uuid", i32 29}
!30 = !{!"ext_oneapi_srgb", i32 30}
!31 = !{!"ext_oneapi_native_assert", i32 31}
!32 = !{!"host_debuggable", i32 32}
!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33}
!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34}
!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35}
!36 = !{!"ext_intel_free_memory", i32 36}
!37 = !{!"ext_intel_device_id", i32 37}
!38 = !{!"ext_intel_memory_clock_rate", i32 38}
!39 = !{!"ext_intel_memory_bus_width", i32 39}
!40 = !{!"emulated", i32 40}
!41 = !{!"ext_intel_legacy_image", i32 41}
!42 = !{!"ext_oneapi_bindless_images", i32 42}
!43 = !{!"ext_oneapi_bindless_images_shared_usm", i32 43}
!44 = !{!"ext_oneapi_bindless_images_1d_usm", i32 44}
!45 = !{!"ext_oneapi_bindless_images_2d_usm", i32 45}
!46 = !{!"ext_oneapi_interop_memory_import", i32 46}
!47 = !{!"ext_oneapi_interop_memory_export", i32 47}
!48 = !{!"ext_oneapi_interop_semaphore_import", i32 48}
!49 = !{!"ext_oneapi_interop_semaphore_export", i32 49}
!50 = !{!"ext_oneapi_mipmap", i32 50}
!51 = !{!"ext_oneapi_mipmap_anisotropy", i32 51}
!52 = !{!"ext_oneapi_mipmap_level_reference", i32 52}
!53 = !{!"ext_intel_esimd", i32 53}
!54 = !{!"ext_oneapi_ballot_group", i32 54}
!55 = !{!"ext_oneapi_fixed_size_group", i32 55}
!56 = !{!"ext_oneapi_opportunistic_group", i32 56}
!57 = !{!"ext_oneapi_tangle_group", i32 57}
!58 = !{!"ext_intel_matrix", i32 58}
!59 = !{!"int64_base_atomics", i32 7}
!60 = !{!"int64_extended_atomics", i32 8}
!61 = !{!"usm_system_allocator", i32 17}
!62 = !{!"usm_restricted_shared_allocations", i32 16}
!63 = !{!"host", i32 0}
!64 = !{!"clang version 19.0.0git (/ws/llvm/clang a7f3a637bdd6299831f903bbed9e8d069fea5c86)"}
!65 = !{i32 233}
!66 = !{i32 -1}
!67 = !{i32 6}
!68 = !{}
!69 = !{i1 false}
!70 = !{!71, !71, i64 0}
!71 = !{!"double", !72, i64 0}
!72 = !{!"omnipotent char", !73, i64 0}
!73 = !{!"Simple C++ TBAA"}
!74 = !{i32 364}
!75 = !{!76, !76, i64 0}
!76 = !{!"float", !72, i64 0}
!77 = !{i32 529}
!67 = !{!9}
!78 = !{i32 8}
!79 = !{i32 16}
!80 = !{i32 32}
Expand Down
39 changes: 5 additions & 34 deletions llvm/tools/sycl-post-link/sycl-post-link.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1014,41 +1014,12 @@ bool isTargetCompatibleWithModule(const std::optional<std::string> &Target,
DeviceConfigFile::TargetTable[*Target];
const SYCLDeviceRequirements &ModuleReqs =
IrMD.getOrComputeDeviceRequirements();
// The device config file data stores the target's supported
// aspects as a vector of the strings, so we need to translate
// the values to a common format.
const NamedMDNode *Node = IrMD.getModule().getNamedMetadata("sycl_aspects");
if (Node) {
SmallMapVector<StringRef, int, 32> AspectNameToValue;
for (const MDNode *N : Node->operands()) {
assert(N->getNumOperands() == 2 &&
"Each operand of sycl_aspects must be a pair.");

// The aspect's name is the first operand.
const auto *AspectName = cast<MDString>(N->getOperand(0));

// The aspect's integral value is the second operand.
const auto *AspectCAM = cast<ConstantAsMetadata>(N->getOperand(1));
const Constant *AspectC = AspectCAM->getValue();

AspectNameToValue[AspectName->getString()] =
cast<ConstantInt>(AspectC)->getSExtValue();
}

// Make the set of aspects values the target supports.
SmallSet<int64_t, 32> TargetAspectValueSet;
for (const auto &Aspect : TargetInfo.aspects) {
auto It = AspectNameToValue.find(Aspect);
assert(It != AspectNameToValue.end() && "Aspect value mapping unknown!");
TargetAspectValueSet.insert(It->second);
}

// Now check to see if all the requirements of the input module
// are compatbile with the target.
for (const auto &Aspect : ModuleReqs.Aspects) {
if (!TargetAspectValueSet.contains(Aspect))
return false;
}
// Check to see if all the requirements of the input module
// are compatbile with the target.
for (const auto &Aspect : ModuleReqs.Aspects) {
if (!is_contained(TargetInfo.aspects, Aspect.Name))
return false;
}

// Check if module sub group size is compatible with the target.
Expand Down
26 changes: 26 additions & 0 deletions sycl/test-e2e/AOT/double.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// This test ensures that a program that has a kernel
// using fp64 can be compiled AOT.

// REQUIRES: ocloc
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s

#include <sycl/detail/core.hpp>

using namespace sycl;

int main() {
queue q;
if (q.get_device().has(aspect::fp64)) {
double d = 2.5;
{
buffer<double, 1> buf(&d, 1);
q.submit([&](handler &cgh) {
accessor acc{buf, cgh};
cgh.single_task([=] { acc[0] *= 2; });
});
}
std::cout << d << "\n";
}
}
74 changes: 74 additions & 0 deletions sycl/test-e2e/AOT/reqd-sg-size.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// This test ensures that a program that has a kernel
// using various required sub-group sizes can be compiled AOT.

// REQUIRES: ocloc
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s

#include <cstdio>
#include <iostream>

#include <sycl/detail/core.hpp>

using namespace sycl;

template <int N> class kernel_name;

template <size_t... Ns> struct SubgroupDispatcher {
std::vector<std::pair<size_t, size_t>> fails;
SubgroupDispatcher(queue &q) : q(q) {}

void operator()(const std::vector<size_t> &v) {
for (auto i : v)
(*this)(i);
}

void operator()(size_t n) { (dispatch<Ns>(n), ...); }

private:
queue &q;

template <size_t size> void dispatch(size_t n) {
if (n == size) {
size_t res = 0;
{
buffer<size_t, 1> buf(&res, 1);
q.submit([&](handler &cgh) {
accessor acc{buf, cgh};
cgh.parallel_for<kernel_name<size>>(
nd_range<1>(1, 1),
[=](auto item) [[intel::reqd_sub_group_size(size)]] {
acc[0] = item.get_sub_group().get_max_local_range()[0];
});
});
}
if (res != size)
fails.push_back({res, size});
}
}
};

int main() {
queue q;
auto ctx = q.get_context();
auto dev = q.get_device();
auto sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
std::cout << " sub-group sizes supported by the device: " << sizes[0];
for (int i = 1; i < sizes.size(); ++i) {
std::cout << ", " << sizes[i];
}
std::cout << '\n';

using dispatcher_t = SubgroupDispatcher<4, 8, 16, 32, 64, 128>;
dispatcher_t dispatcher(q);
dispatcher(sizes);
if (dispatcher.fails.size() > 0) {
for (auto [actual, expected] : dispatcher.fails) {
std::cout << "actual: " << actual << "\n"
<< "expected: " << expected << "\n";
}
} else {
std::cout << "pass\n";
}
}