From f7d02db951051a782cc67fdd24133f6d12253134 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Fri, 27 Nov 2020 16:52:52 +0300 Subject: [PATCH 1/6] [SYCL] Enable device code split by default This patch introduces new device code split mode `auto`, which is intended to automatically select the best device code split mode and apply it. At the moment, `auto` is equivalent to `per_source` for most cases and it is equivalent to `off` in case of presence of function pointers. --- clang/include/clang/Driver/Options.td | 8 ++-- clang/lib/Driver/ToolChains/Clang.cpp | 7 +++- llvm/tools/sycl-post-link/sycl-post-link.cpp | 40 ++++++++++++++++---- 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 54fa681c11e4..7ca31c6f63a8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1987,11 +1987,11 @@ def fsycl_link_targets_EQ : CommaJoined<["-"], "fsycl-link-targets=">, Flags<[No HelpText<"Specify comma-separated list of triples SYCL offloading targets to produce linked device images">; def fsycl_device_code_split_EQ : Joined<["-"], "fsycl-device-code-split=">, Flags<[CC1Option, CoreOption]>, HelpText<"Perform SYCL device code split: per_kernel (device code module is " - "created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split). " - "Default is 'off' - all kernels go into a single module`">, Values<"per_source, per_kernel, off">; + "created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split).
| auto (use heuristic to select the best way of splitting device code)" + "Default is 'auto' - automatically select how to split device code into modules">, Values<"per_source, per_kernel, off, auto">; def fsycl_device_code_split : Flag<["-"], "fsycl-device-code-split">, Alias, - AliasArgs<["per_source"]>, Flags<[CC1Option, CoreOption]>, - HelpText<"Perform SYCL device code split in the per_source mode i.e. create a device code module for each source (translation unit)">; + AliasArgs<["auto"]>, Flags<[CC1Option, CoreOption]>, + HelpText<"Perform SYCL device code split in the 'auto' mode i.e. use heuristic to distribute device code across modules">; def fsycl_id_queries_fit_in_int : Flag<["-"], "fsycl-id-queries-fit-in-int">, Flags<[CC1Option, CoreOption]>, HelpText<"Assume that SYCL ID queries fit " "within MAX_INT.">; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7abf2f04ad6c..b18e191c7af4 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8142,12 +8142,17 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA, addArgs(CmdArgs, TCArgs, {"-split=kernel"}); else if (StringRef(A->getValue()) == "per_source") addArgs(CmdArgs, TCArgs, {"-split=source"}); + else if (StringRef(A->getValue()) == "auto") + addArgs(CmdArgs, TCArgs, {"-split=auto"}); else // split must be off assert(StringRef(A->getValue()) == "off"); + } else { + // auto is the default split mode + addArgs(CmdArgs, TCArgs, {"-split=auto"}); } // OPT_fsycl_device_code_split is not checked as it is an alias to - // -fsycl-device-code-split=per_source + // -fsycl-device-code-split=auto // Turn on Dead Parameter Elimination Optimization with early optimizations if (!getToolChain().getTriple().isNVPTX() && diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 0f09f6d5c932..72c09f596054 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp 
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -90,16 +90,18 @@ static cl::opt OutputAssembly{"S", enum IRSplitMode { SPLIT_PER_TU, // one module per translation unit - SPLIT_PER_KERNEL // one module per kernel + SPLIT_PER_KERNEL, // one module per kernel + SPLIT_AUTO // automatically select split mode }; static cl::opt SplitMode( "split", cl::desc("split input module"), cl::Optional, cl::init(SPLIT_PER_TU), - cl::values(clEnumValN(SPLIT_PER_TU, "source", - "1 output module per source (translation unit)"), - clEnumValN(SPLIT_PER_KERNEL, "kernel", - "1 output module per kernel")), + cl::values( + clEnumValN(SPLIT_PER_TU, "source", + "1 output module per source (translation unit)"), + clEnumValN(SPLIT_PER_KERNEL, "kernel", "1 output module per kernel"), + clEnumValN(SPLIT_AUTO, "auto", "Choose split mode automatically")), cl::cat(PostLinkCat)); static cl::opt DoSymGen{"symbols", @@ -289,6 +291,25 @@ enum KernelMapEntryScope { Scope_Global // single entry in the map for all kernels }; +static KernelMapEntryScope selectDeviceCodeSplitModeAutomatically(Module &M) { + // Here we can employ various heuristics to decide which way to split kernels + // is the best in each particular situation. + // At the moment, we assume that per-kernel split is the best way of splitting + // device code and it can be always selected unless there are functions marked + // with [[intel::device_indirectly_callable]] attribute, because it instructs + // us to make this function available to the whole program as it was compiled + // as a single module. + bool HasDeviceIndirectlyCallable = false; + for (auto &F : M.functions()) { + if (F.hasFnAttribute("referenced-indirectly")) + HasDeviceIndirectlyCallable = true; + } + + if (HasDeviceIndirectlyCallable) + return Scope_Global; + return Scope_PerModule; +} + // This function decides how kernels of the input module M will be distributed // ("split") into multiple modules based on the command options and IR // attributes. 
The decision is recorded in the output map parameter @@ -656,8 +677,13 @@ int main(int argc, char **argv) { if (DoSplit || DoSymGen) { KernelMapEntryScope Scope = Scope_Global; - if (DoSplit) - Scope = SplitMode == SPLIT_PER_KERNEL ? Scope_PerKernel : Scope_PerModule; + if (DoSplit) { + if (SplitMode == SPLIT_AUTO) + Scope = selectDeviceCodeSplitModeAutomatically(*MPtr); + else + Scope = + SplitMode == SPLIT_PER_KERNEL ? Scope_PerKernel : Scope_PerModule; + } collectKernelModuleMap(*MPtr, GlobalsSet, Scope); } From d29970f65335474fa9e0ddbe49ea6a8f6d3d626f Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Tue, 1 Dec 2020 18:39:42 +0300 Subject: [PATCH 2/6] Allow -split=auto together with -ir-output-only --- llvm/tools/sycl-post-link/sycl-post-link.cpp | 22 ++++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 72c09f596054..96e047dd21da 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -291,7 +291,7 @@ enum KernelMapEntryScope { Scope_Global // single entry in the map for all kernels }; -static KernelMapEntryScope selectDeviceCodeSplitModeAutomatically(Module &M) { +static KernelMapEntryScope selectDeviceCodeSplitScopeAutomatically(Module &M) { // Here we can employ various heuristics to decide which way to split kernels // is the best in each particular situation. // At the moment, we assume that per-kernel split is the best way of splitting @@ -299,14 +299,18 @@ static KernelMapEntryScope selectDeviceCodeSplitModeAutomatically(Module &M) { // with [[intel::device_indirectly_callable]] attribute, because it instructs // us to make this function available to the whole program as it was compiled // as a single module. 
- bool HasDeviceIndirectlyCallable = false; + if (IROutputOnly) { + // We allow enabling auto split mode even in presence of -ir-output-only + // flag, but in this case we are limited by it so we can't do any split at + // all. + return Scope_Global; + } + for (auto &F : M.functions()) { if (F.hasFnAttribute("referenced-indirectly")) - HasDeviceIndirectlyCallable = true; + return Scope_Global; } - if (HasDeviceIndirectlyCallable) - return Scope_Global; return Scope_PerModule; } @@ -645,9 +649,9 @@ int main(int argc, char **argv) { errs() << "no actions specified; try --help for usage info\n"; return 1; } - if (IROutputOnly && DoSplit) { - errs() << "error: -" << SplitMode.ArgStr << " can't be used with -" - << IROutputOnly.ArgStr << "\n"; + if (IROutputOnly && (DoSplit && SplitMode != SPLIT_AUTO)) { + errs() << "error: -" << SplitMode.ArgStr << "=" << SplitMode.ValueStr + << " can't be used with -" << IROutputOnly.ArgStr << "\n"; return 1; } if (IROutputOnly && DoSymGen) { @@ -679,7 +683,7 @@ int main(int argc, char **argv) { KernelMapEntryScope Scope = Scope_Global; if (DoSplit) { if (SplitMode == SPLIT_AUTO) - Scope = selectDeviceCodeSplitModeAutomatically(*MPtr); + Scope = selectDeviceCodeSplitScopeAutomatically(*MPtr); else Scope = SplitMode == SPLIT_PER_KERNEL ? Scope_PerKernel : Scope_PerModule; From 73faebb4b013d04a5393153ce68b197743008d23 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Tue, 22 Dec 2020 14:50:25 +0300 Subject: [PATCH 3/6] Improve implementation and tests Added LIT tests. Update heuristic to disable device code split when indirect calls are present in the input module. 
--- clang/test/Driver/sycl-offload-with-split.c | 24 +++- .../sycl-post-link/auto-module-split-1.ll | 121 ++++++++++++++++++ .../sycl-post-link/auto-module-split-2.ll | 81 ++++++++++++ .../sycl-post-link/auto-module-split-3.ll | 81 ++++++++++++ llvm/test/tools/sycl-post-link/help.test | 1 + llvm/tools/sycl-post-link/sycl-post-link.cpp | 23 +++- 6 files changed, 324 insertions(+), 7 deletions(-) create mode 100644 llvm/test/tools/sycl-post-link/auto-module-split-1.ll create mode 100644 llvm/test/tools/sycl-post-link/auto-module-split-2.ll create mode 100644 llvm/test/tools/sycl-post-link/auto-module-split-3.ll diff --git a/clang/test/Driver/sycl-offload-with-split.c b/clang/test/Driver/sycl-offload-with-split.c index a21a3027bf4e..c4c61109ddbd 100644 --- a/clang/test/Driver/sycl-offload-with-split.c +++ b/clang/test/Driver/sycl-offload-with-split.c @@ -206,7 +206,7 @@ // RUN: | FileCheck %s -check-prefixes=CHK-TOOLS-AOT,CHK-TOOLS-CPU // CHK-TOOLS-AOT: clang{{.*}} "-fsycl-is-device" {{.*}} "-o" "[[OUTPUT1:.+\.bc]]" // CHK-TOOLS-AOT: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2:.+\.bc]]" -// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]" +// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto" {{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]" // CHK-TOOLS-AOT: file-table-tform{{.*}} "-o" "[[OUTPUT4:.+\.txt]]" "[[OUTPUT3]]" // CHK-TOOLS-AOT: llvm-foreach{{.*}} "--in-file-list=[[OUTPUT4]]" "--in-replace=[[OUTPUT4]]" "--out-ext=spv" "--out-file-list=[[OUTPUT5:.+\.txt]]" "--out-replace=[[OUTPUT5]]" "--" "{{.*}}llvm-spirv{{.*}}" "-o" "[[OUTPUT5]]" {{.*}} "[[OUTPUT4]]" // CHK-TOOLS-FPGA: llvm-foreach{{.*}} "--out-file-list=[[OUTPUT6:.+\.txt]]{{.*}} "--" "{{.*}}aoc{{.*}} "-o" "[[OUTPUT6]]" "[[OUTPUT5]]" @@ -271,13 +271,33 @@ // CHK-PHASE-MULTI-TARG: 36: clang-offload-wrapper, {35}, object, (device-sycl) // CHK-PHASE-MULTI-TARG: 37: offload, "host-sycl (x86_64-unknown-linux-gnu)" {9}, "device-sycl 
(spir64-unknown-unknown-sycldevice)" {18}, "device-sycl (spir64_fpga-unknown-unknown-sycldevice)" {28}, "device-sycl (spir64_gen-unknown-unknown-sycldevice)" {36}, image -// Check -fsycl-one-kernel-per-module option passing. +// Check -fsycl-device-code-split=per_kernel option passing. // RUN: %clang -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \ // RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL // RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \ // RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL // CHK-ONE-KERNEL: sycl-post-link{{.*}} "-split=kernel"{{.*}} "-o"{{.*}} +// Check -fsycl-device-code-split=per_source option passing. +// RUN: %clang -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE +// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE +// CHK-PER-SOURCE: sycl-post-link{{.*}} "-split=source"{{.*}} "-o"{{.*}} + +// Check -fsycl-device-code-split option passing. +// RUN: %clang -### -fsycl -fsycl-device-code-split %s 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHK-AUTO +// RUN: %clang_cl -### -fsycl -fsycl-device-code-split %s 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHK-AUTO +// RUN: %clang -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHK-AUTO +// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHK-AUTO +// RUN: %clang -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO +// RUN: %clang_cl -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO +// CHK-AUTO: sycl-post-link{{.*}} "-split=auto"{{.*}} "-o"{{.*}} + // Check no device code split mode. 
// RUN: %clang -### -fsycl -fsycl-device-code-split -fsycl-device-code-split=off %s 2>&1 \ // RUN: | FileCheck %s -check-prefixes=CHK-NO-SPLIT diff --git a/llvm/test/tools/sycl-post-link/auto-module-split-1.ll b/llvm/test/tools/sycl-post-link/auto-module-split-1.ll new file mode 100644 index 000000000000..c46b901fb474 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/auto-module-split-1.ll @@ -0,0 +1,121 @@ +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; By default auto mode is equal to source mode +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux-sycldevice" + +$_Z3barIiET_S0_ = comdat any + +; CHECK-TU0-NOT: @{{.*}}GV{{.*}} +; CHECK-TU1: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-TU0: define dso_local spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU0: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 
%add, i32* %a, align 4 + ret void +} + +; CHECK-TU0: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU1-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-TU0: define dso_local spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU1: define dso_local spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU1: %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @{{.*}}GV{{.*}} to [1 x i32] 
addrspace(4)*), i64 0, i64 0), align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/sycl-post-link/auto-module-split-2.ll b/llvm/test/tools/sycl-post-link/auto-module-split-2.ll new file mode 100644 index 000000000000..24cbfe46a3c6 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/auto-module-split-2.ll @@ -0,0 +1,81 @@ +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; In presence of indirectly callable function auto mode is equal to no split, +; which means that separate LLVM IR file for device is not generated and we only +; need to check generated symbol table +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux-sycldevice" + +$_Z3barIiET_S0_ = comdat any + +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK: {{.*}}TU0_kernel0{{.*}} + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +define dso_local spir_func void @_Z3foov() #2 { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr
dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +; CHECK: {{.*}}TU0_kernel1{{.*}} + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} +; CHECK: {{.*}}TU1_kernel{{.*}} + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } +attributes #2 = { "referenced-indirectly" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/sycl-post-link/auto-module-split-3.ll b/llvm/test/tools/sycl-post-link/auto-module-split-3.ll new file mode 100644 index 000000000000..2cc43c10b4a8 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/auto-module-split-3.ll @@ -0,0 +1,81 @@ +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; In presence of indirect calls auto mode is equal to no split, +; which means that separate LLVM IR file for device is not generated and we only +; need to check generated symbol table +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK + +target datalayout =
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux-sycldevice" + +$_Z3barIiET_S0_ = comdat any + +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK: {{.*}}TU0_kernel0{{.*}} + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %ptr = bitcast i32* %a to i32 (i32)* + %call = call spir_func i32 %ptr(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +; CHECK: {{.*}}TU0_kernel1{{.*}} + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} +; CHECK: {{.*}}TU1_kernel{{.*}} + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git 
a/llvm/test/tools/sycl-post-link/help.test b/llvm/test/tools/sycl-post-link/help.test index 66a47752e886..aa64d449dac1 100644 --- a/llvm/test/tools/sycl-post-link/help.test +++ b/llvm/test/tools/sycl-post-link/help.test @@ -52,4 +52,5 @@ CHECK: =default - set spec constants to C++ defaults CHECK: --split= - split input module CHECK: =source - 1 output module per source (translation unit) CHECK: =kernel - 1 output module per kernel +CHECK: =auto - Choose split mode automatically CHECK: --symbols - generate exported symbol files diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 96e047dd21da..14bc8a6a73e9 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -295,10 +295,13 @@ static KernelMapEntryScope selectDeviceCodeSplitScopeAutomatically(Module &M) { // Here we can employ various heuristics to decide which way to split kernels // is the best in each particular situation. // At the moment, we assume that per-kernel split is the best way of splitting - // device code and it can be always selected unless there are functions marked - // with [[intel::device_indirectly_callable]] attribute, because it instructs - // us to make this function available to the whole program as it was compiled - // as a single module. + // device code and it can be always selected unless: + // - there are functions marked with [[intel::device_indirectly_callable]] + // attribute, because it instructs us to make this function available to the + // whole program as it was compiled as a single module. + // - there are indirect calls in the module, which means that we don't know + // how to group functions so both caller and callee of indirect call are in + // the same module. 
if (IROutputOnly) { // We allow enabling auto split mode even in presence of -ir-output-only // flag, but in this case we are limited by it so we can't do any split at @@ -306,9 +309,19 @@ static KernelMapEntryScope selectDeviceCodeSplitScopeAutomatically(Module &M) { return Scope_Global; } - for (auto &F : M.functions()) { + for (const auto &F : M.functions()) { if (F.hasFnAttribute("referenced-indirectly")) return Scope_Global; + if (F.isDeclaration()) + continue; + for (const auto &BB: F) { + for (const auto &I : BB) { + if (auto *CI = dyn_cast(&I)) { + if (!CI->getCalledFunction()) + return Scope_Global; + } + } + } } return Scope_PerModule; From edfa44cc9a08c732d48f3bfe2b8caf3f8841d4ac Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 24 Dec 2020 12:30:10 +0300 Subject: [PATCH 4/6] Apply comments --- llvm/tools/sycl-post-link/sycl-post-link.cpp | 32 +++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 14bc8a6a73e9..8503b272f2bc 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -89,14 +89,14 @@ static cl::opt OutputAssembly{"S", cl::Hidden, cl::cat(PostLinkCat)}; enum IRSplitMode { - SPLIT_PER_TU, // one module per translation unit + SPLIT_PER_TU, // one module per translation unit SPLIT_PER_KERNEL, // one module per kernel - SPLIT_AUTO // automatically select split mode + SPLIT_AUTO // automatically select split mode }; static cl::opt SplitMode( "split", cl::desc("split input module"), cl::Optional, - cl::init(SPLIT_PER_TU), + cl::init(SPLIT_AUTO), cl::values( clEnumValN(SPLIT_PER_TU, "source", "1 output module per source (translation unit)"), @@ -292,16 +292,6 @@ enum KernelMapEntryScope { }; static KernelMapEntryScope selectDeviceCodeSplitScopeAutomatically(Module &M) { - // Here we can employ various heuristics to decide which way to split kernels - // is the 
best in each particular situation. - // At the moment, we assume that per-kernel split is the best way of splitting - // device code and it can be always selected unless: - // - there are functions marked with [[intel::device_indirectly_callable]] - // attribute, because it instructs us to make this function available to the - // whole program as it was compiled as a single module. - // - there are indirect calls in the module, which means that we don't know - // how to group functions so both caller and callee of indirect call are in - // the same module. if (IROutputOnly) { // We allow enabling auto split mode even in presence of -ir-output-only // flag, but in this case we are limited by it so we can't do any split at @@ -310,11 +300,17 @@ static KernelMapEntryScope selectDeviceCodeSplitScopeAutomatically(Module &M) { } for (const auto &F : M.functions()) { + // There are functions marked with [[intel::device_indirectly_callable]] + // attribute, because it instructs us to make this function available to the + // whole program as it was compiled as a single module. if (F.hasFnAttribute("referenced-indirectly")) return Scope_Global; if (F.isDeclaration()) continue; - for (const auto &BB: F) { + // There are indirect calls in the module, which means that we don't know + // how to group functions so both caller and callee of indirect call are in + // the same module. + for (const auto &BB : F) { for (const auto &I : BB) { if (auto *CI = dyn_cast(&I)) { if (!CI->getCalledFunction()) @@ -324,6 +320,8 @@ static KernelMapEntryScope selectDeviceCodeSplitScopeAutomatically(Module &M) { } } + // At the moment, we assume that per-source split is the best way of splitting + // device code and can always be used except for cases handled above. return Scope_PerModule; } @@ -633,6 +631,8 @@ int main(int argc, char **argv) { " kernels with the same values of the 'sycl-module-id' attribute will\n" " be put into the same module.
If -split=kernel option is specified,\n" " one module per kernel will be emitted.\n" + " '-split=auto' mode automatically selects the best way of splitting\n" + " kernels into modules based on some heuristic.\n" "- If -symbols options is also specified, then for each produced module\n" " a text file containing names of all spir kernels in it is generated.\n" "- Specialization constant intrinsic transformer. Replaces symbolic\n" @@ -652,7 +652,9 @@ int main(int argc, char **argv) { " $ sycl-post-link --ir-output-only --spec-const=default \\\n" " -o example_p.bc example.bc\n" "will produce single output file example_p.bc suitable for SPIRV\n" - "translation.\n"); + "translation.\n" + "--ir-output-only option is not not compatible with split modes other\n" + "than 'auto'.\n"); bool DoSplit = SplitMode.getNumOccurrences() > 0; bool DoSpecConst = SpecConstLower.getNumOccurrences() > 0; From d374ffe5760ae551594c356dbe79377c0cf1fd6c Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 24 Dec 2020 12:33:55 +0300 Subject: [PATCH 5/6] Fix clang-format --- llvm/tools/sycl-post-link/sycl-post-link.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 8503b272f2bc..257ff7a8598d 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -95,8 +95,7 @@ enum IRSplitMode { }; static cl::opt SplitMode( - "split", cl::desc("split input module"), cl::Optional, - cl::init(SPLIT_AUTO), + "split", cl::desc("split input module"), cl::Optional, cl::init(SPLIT_AUTO), cl::values( clEnumValN(SPLIT_PER_TU, "source", "1 output module per source (translation unit)"), From 64412674ee933a0538b9381011d580ceced86843 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 24 Dec 2020 19:34:45 +0300 Subject: [PATCH 6/6] Apply suggestions from code review Co-authored-by: Artem Gindinson --- clang/include/clang/Driver/Options.td | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7ca31c6f63a8..0ee3f0e0d58f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1988,10 +1988,10 @@ def fsycl_link_targets_EQ : CommaJoined<["-"], "fsycl-link-targets=">, Flags<[No def fsycl_device_code_split_EQ : Joined<["-"], "fsycl-device-code-split=">, Flags<[CC1Option, CoreOption]>, HelpText<"Perform SYCL device code split: per_kernel (device code module is " "created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split). | auto (use heuristic to select the best way of splitting device code)" - "Default is 'auto' - automatically select how to split device code into modules">, Values<"per_source, per_kernel, off, auto">; + "Default is 'auto' - use heuristic to distribute device code across modules">, Values<"per_source, per_kernel, off, auto">; def fsycl_device_code_split : Flag<["-"], "fsycl-device-code-split">, Alias, AliasArgs<["auto"]>, Flags<[CC1Option, CoreOption]>, - HelpText<"Perform SYCL device code split in the 'auto' mode i.e. use heuristic to distribute device code across modules">; + HelpText<"Perform SYCL device code split in the 'auto' mode, i.e. use heuristic to distribute device code across modules">; def fsycl_id_queries_fit_in_int : Flag<["-"], "fsycl-id-queries-fit-in-int">, Flags<[CC1Option, CoreOption]>, HelpText<"Assume that SYCL ID queries fit " "within MAX_INT.">;