Skip to content

[SYCL] Add new auto device code split mode #2827

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1987,11 +1987,11 @@ def fsycl_link_targets_EQ : CommaJoined<["-"], "fsycl-link-targets=">, Flags<[No
HelpText<"Specify comma-separated list of triples SYCL offloading targets to produce linked device images">;
def fsycl_device_code_split_EQ : Joined<["-"], "fsycl-device-code-split=">,
Flags<[CC1Option, CoreOption]>, HelpText<"Perform SYCL device code split: per_kernel (device code module is "
"created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split). "
"Default is 'off' - all kernels go into a single module`">, Values<"per_source, per_kernel, off">;
"created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split) | auto (use heuristic to select the best way of splitting device code). "
"Default is 'auto' - use heuristic to distribute device code across modules">, Values<"per_source, per_kernel, off, auto">;
def fsycl_device_code_split : Flag<["-"], "fsycl-device-code-split">, Alias<fsycl_device_code_split_EQ>,
AliasArgs<["per_source"]>, Flags<[CC1Option, CoreOption]>,
HelpText<"Perform SYCL device code split in the per_source mode i.e. create a device code module for each source (translation unit)">;
AliasArgs<["auto"]>, Flags<[CC1Option, CoreOption]>,
HelpText<"Perform SYCL device code split in the 'auto' mode, i.e. use heuristic to distribute device code across modules">;
def fsycl_id_queries_fit_in_int : Flag<["-"], "fsycl-id-queries-fit-in-int">,
Flags<[CC1Option, CoreOption]>, HelpText<"Assume that SYCL ID queries fit "
"within MAX_INT.">;
Expand Down
7 changes: 6 additions & 1 deletion clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8142,12 +8142,17 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA,
addArgs(CmdArgs, TCArgs, {"-split=kernel"});
else if (StringRef(A->getValue()) == "per_source")
addArgs(CmdArgs, TCArgs, {"-split=source"});
else if (StringRef(A->getValue()) == "auto")
addArgs(CmdArgs, TCArgs, {"-split=auto"});
else
// split must be off
assert(StringRef(A->getValue()) == "off");
} else {
// auto is the default split mode
addArgs(CmdArgs, TCArgs, {"-split=auto"});
}
// OPT_fsycl_device_code_split is not checked as it is an alias to
// -fsycl-device-code-split=per_source
// -fsycl-device-code-split=auto

// Turn on Dead Parameter Elimination Optimization with early optimizations
if (!getToolChain().getTriple().isNVPTX() &&
Expand Down
24 changes: 22 additions & 2 deletions clang/test/Driver/sycl-offload-with-split.c
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@
// RUN: | FileCheck %s -check-prefixes=CHK-TOOLS-AOT,CHK-TOOLS-CPU
// CHK-TOOLS-AOT: clang{{.*}} "-fsycl-is-device" {{.*}} "-o" "[[OUTPUT1:.+\.bc]]"
// CHK-TOOLS-AOT: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2:.+\.bc]]"
// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto" {{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
// CHK-TOOLS-AOT: file-table-tform{{.*}} "-o" "[[OUTPUT4:.+\.txt]]" "[[OUTPUT3]]"
// CHK-TOOLS-AOT: llvm-foreach{{.*}} "--in-file-list=[[OUTPUT4]]" "--in-replace=[[OUTPUT4]]" "--out-ext=spv" "--out-file-list=[[OUTPUT5:.+\.txt]]" "--out-replace=[[OUTPUT5]]" "--" "{{.*}}llvm-spirv{{.*}}" "-o" "[[OUTPUT5]]" {{.*}} "[[OUTPUT4]]"
// CHK-TOOLS-FPGA: llvm-foreach{{.*}} "--out-file-list=[[OUTPUT6:.+\.txt]]{{.*}} "--" "{{.*}}aoc{{.*}} "-o" "[[OUTPUT6]]" "[[OUTPUT5]]"
Expand Down Expand Up @@ -271,13 +271,33 @@
// CHK-PHASE-MULTI-TARG: 36: clang-offload-wrapper, {35}, object, (device-sycl)
// CHK-PHASE-MULTI-TARG: 37: offload, "host-sycl (x86_64-unknown-linux-gnu)" {9}, "device-sycl (spir64-unknown-unknown-sycldevice)" {18}, "device-sycl (spir64_fpga-unknown-unknown-sycldevice)" {28}, "device-sycl (spir64_gen-unknown-unknown-sycldevice)" {36}, image

// Check -fsycl-one-kernel-per-module option passing.
// Check -fsycl-device-code-split=per_kernel option passing.
// RUN: %clang -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL
// CHK-ONE-KERNEL: sycl-post-link{{.*}} "-split=kernel"{{.*}} "-o"{{.*}}

// Check -fsycl-device-code-split=per_source option passing.
// RUN: %clang -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE
// CHK-PER-SOURCE: sycl-post-link{{.*}} "-split=source"{{.*}} "-o"{{.*}}

// Check -fsycl-device-code-split option passing.
// RUN: %clang -### -fsycl -fsycl-device-code-split %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
// RUN: %clang -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
// RUN: %clang -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO
// RUN: %clang_cl -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO
// CHK-AUTO: sycl-post-link{{.*}} "-split=auto"{{.*}} "-o"{{.*}}

// Check no device code split mode.
// RUN: %clang -### -fsycl -fsycl-device-code-split -fsycl-device-code-split=off %s 2>&1 \
// RUN: | FileCheck %s -check-prefixes=CHK-NO-SPLIT
Expand Down
121 changes: 121 additions & 0 deletions llvm/test/tools/sycl-post-link/auto-module-split-1.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
; By default auto mode is equal to source mode
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT

target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-linux-sycldevice"

$_Z3barIiET_S0_ = comdat any

; CHECK-TU0-NOT: @{{.*}}GV{{.*}}
; CHECK-TU1: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4

; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
; CHECK-TU0-TXT: {{.*}}TU0_kernel0{{.*}}
; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel0{{.*}}

; CHECK-TU0: call spir_func void @{{.*}}foo{{.*}}()

; Kernel in attribute group #0 (sycl-module-id "TU1.cpp"); its body is a single
; direct call to @_Z3foov.
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
entry:
call spir_func void @_Z3foov()
ret void
}

; CHECK-TU0: define dso_local spir_func void @{{.*}}foo{{.*}}()
; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo{{.*}}()

; CHECK-TU0: call spir_func i32 @{{.*}}bar{{.*}}(i32 1)

; Helper called from the TU0_kernel0 kernel: calls @_Z3barIiET_S0_ with the
; constant 1, adds 2 to the result and stores the sum into a local stack slot.
define dso_local spir_func void @_Z3foov() {
entry:
%a = alloca i32, align 4
%call = call spir_func i32 @_Z3barIiET_S0_(i32 1)
%add = add nsw i32 2, %call
store i32 %add, i32* %a, align 4
ret void
}

; CHECK-TU0: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)
; CHECK-TU1-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)

; Function Attrs: nounwind
; linkonce_odr comdat helper: spills %arg to a stack slot, reloads it and
; returns the value unchanged. It carries no sycl-module-id attribute itself.
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
entry:
%arg.addr = alloca i32, align 4
store i32 %arg, i32* %arg.addr, align 4
%0 = load i32, i32* %arg.addr, align 4
ret i32 %0
}

; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
; CHECK-TU0-TXT: {{.*}}TU0_kernel1{{.*}}
; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel1{{.*}}

; CHECK-TU0: call spir_func void @{{.*}}foo1{{.*}}()

; Second kernel in attribute group #0 (sycl-module-id "TU1.cpp"); its body is a
; single direct call to @_Z4foo1v.
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
entry:
call spir_func void @_Z4foo1v()
ret void
}

; CHECK-TU0: define dso_local spir_func void @{{.*}}foo1{{.*}}()
; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo1{{.*}}()

; Function Attrs: nounwind
; Leaf helper called from the TU0_kernel1 kernel: stores the constant 2 into a
; local stack slot and returns; it calls nothing and touches no globals.
define dso_local spir_func void @_Z4foo1v() {
entry:
%a = alloca i32, align 4
store i32 2, i32* %a, align 4
ret void
}

; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
; CHECK-TU0-TXT-NOT: {{.*}}TU1_kernel{{.*}}
; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
; CHECK-TU1-TXT: {{.*}}TU1_kernel{{.*}}

; CHECK-TU1: call spir_func void @{{.*}}foo2{{.*}}()

; Kernel in attribute group #1 (sycl-module-id "TU2.cpp"); its body is a single
; direct call to @_Z4foo2v.
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
entry:
call spir_func void @_Z4foo2v()
ret void
}

; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo2{{.*}}()
; CHECK-TU1: define dso_local spir_func void @{{.*}}foo2{{.*}}()

; Function Attrs: nounwind
; Helper called from the TU1_kernel kernel: loads element 0 of the internal
; global @_ZL2GV through an addrspace(1) -> addrspace(4) cast, adds 4 and
; stores the sum into a local stack slot. This is the only use of @_ZL2GV in
; this file.
define dso_local spir_func void @_Z4foo2v() {
entry:
%a = alloca i32, align 4
; CHECK-TU1: %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @{{.*}}GV{{.*}} to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
%add = add nsw i32 4, %0
store i32 %add, i32* %a, align 4
ret void
}

attributes #0 = { "sycl-module-id"="TU1.cpp" }
attributes #1 = { "sycl-module-id"="TU2.cpp" }

; Metadata is saved in both modules.
; CHECK: !opencl.spir.version = !{!0, !0}
; CHECK: !spirv.Source = !{!1, !1}

!opencl.spir.version = !{!0, !0}
!spirv.Source = !{!1, !1}

; CHECK: !0 = !{i32 1, i32 2}
; CHECK: !1 = !{i32 4, i32 100000}

!0 = !{i32 1, i32 2}
!1 = !{i32 4, i32 100000}
81 changes: 81 additions & 0 deletions llvm/test/tools/sycl-post-link/auto-module-split-2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
; In the presence of an indirectly callable function, auto mode is equal to no split,
; which means that separate LLVM IR file for device is not generated and we only
; need to check generated symbol table
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK

target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-linux-sycldevice"

$_Z3barIiET_S0_ = comdat any

@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4

; CHECK: {{.*}}TU0_kernel0{{.*}}

; Kernel in attribute group #0 (sycl-module-id "TU1.cpp"); its body is a single
; direct call to @_Z3foov.
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
entry:
call spir_func void @_Z3foov()
ret void
}

; The only function in this file carrying attribute group #2
; ("referenced-indirectly"), i.e. the indirectly callable function this test is
; about. Body: calls @_Z3barIiET_S0_ with 1, adds 2 and stores the sum locally.
define dso_local spir_func void @_Z3foov() #2 {
entry:
%a = alloca i32, align 4
%call = call spir_func i32 @_Z3barIiET_S0_(i32 1)
%add = add nsw i32 2, %call
store i32 %add, i32* %a, align 4
ret void
}

; Function Attrs: nounwind
; linkonce_odr comdat helper: spills %arg to a stack slot, reloads it and
; returns the value unchanged.
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
entry:
%arg.addr = alloca i32, align 4
store i32 %arg, i32* %arg.addr, align 4
%0 = load i32, i32* %arg.addr, align 4
ret i32 %0
}

; CHECK: {{.*}}TU0_kernel1{{.*}}

; Second kernel in attribute group #0 (sycl-module-id "TU1.cpp"); its body is a
; single direct call to @_Z4foo1v.
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
entry:
call spir_func void @_Z4foo1v()
ret void
}

; Function Attrs: nounwind
; Leaf helper called from the TU0_kernel1 kernel: stores the constant 2 into a
; local stack slot and returns.
define dso_local spir_func void @_Z4foo1v() {
entry:
%a = alloca i32, align 4
store i32 2, i32* %a, align 4
ret void
}
; CHECK: {{.*}}TU1_kernel{{.*}}

; Kernel in attribute group #1 (sycl-module-id "TU2.cpp"); its body is a single
; direct call to @_Z4foo2v.
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
entry:
call spir_func void @_Z4foo2v()
ret void
}

; Function Attrs: nounwind
; Helper called from the TU1_kernel kernel: loads element 0 of the internal
; global @_ZL2GV through an addrspace(1) -> addrspace(4) cast, adds 4 and
; stores the sum into a local stack slot.
define dso_local spir_func void @_Z4foo2v() {
entry:
%a = alloca i32, align 4
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
%add = add nsw i32 4, %0
store i32 %add, i32* %a, align 4
ret void
}

attributes #0 = { "sycl-module-id"="TU1.cpp" }
attributes #1 = { "sycl-module-id"="TU2.cpp" }
attributes #2 = { "referenced-indirectly" }

!opencl.spir.version = !{!0, !0}
!spirv.Source = !{!1, !1}

!0 = !{i32 1, i32 2}
!1 = !{i32 4, i32 100000}
81 changes: 81 additions & 0 deletions llvm/test/tools/sycl-post-link/auto-module-split-3.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
; In the presence of indirect calls, auto mode is equal to no split,
; which means that separate LLVM IR file for device is not generated and we only
; need to check generated symbol table
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK

target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-linux-sycldevice"

$_Z3barIiET_S0_ = comdat any

@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4

; CHECK: {{.*}}TU0_kernel0{{.*}}

; Kernel in attribute group #0 (sycl-module-id "TU1.cpp"); its body is a single
; direct call to @_Z3foov.
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
entry:
call spir_func void @_Z3foov()
ret void
}

; Contains the only indirect call in this file: the local i32 slot is bitcast
; to an i32 (i32)* function pointer and called through %ptr, rather than
; calling @_Z3barIiET_S0_ directly. The pointer value is never meaningful; the
; function exists only so the IR contains a call with no static callee.
define dso_local spir_func void @_Z3foov() {
entry:
%a = alloca i32, align 4
%ptr = bitcast i32* %a to i32 (i32)*
%call = call spir_func i32 %ptr(i32 1)
%add = add nsw i32 2, %call
store i32 %add, i32* %a, align 4
ret void
}

; Function Attrs: nounwind
; linkonce_odr comdat helper: spills %arg to a stack slot, reloads it and
; returns the value unchanged. No function in this file calls it directly.
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
entry:
%arg.addr = alloca i32, align 4
store i32 %arg, i32* %arg.addr, align 4
%0 = load i32, i32* %arg.addr, align 4
ret i32 %0
}

; CHECK: {{.*}}TU0_kernel1{{.*}}

; Second kernel in attribute group #0 (sycl-module-id "TU1.cpp"); its body is a
; single direct call to @_Z4foo1v.
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
entry:
call spir_func void @_Z4foo1v()
ret void
}

; Function Attrs: nounwind
; Leaf helper called from the TU0_kernel1 kernel: stores the constant 2 into a
; local stack slot and returns.
define dso_local spir_func void @_Z4foo1v() {
entry:
%a = alloca i32, align 4
store i32 2, i32* %a, align 4
ret void
}
; CHECK: {{.*}}TU1_kernel{{.*}}

; Kernel in attribute group #1 (sycl-module-id "TU2.cpp"); its body is a single
; direct call to @_Z4foo2v.
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
entry:
call spir_func void @_Z4foo2v()
ret void
}

; Function Attrs: nounwind
; Helper called from the TU1_kernel kernel: loads element 0 of the internal
; global @_ZL2GV through an addrspace(1) -> addrspace(4) cast, adds 4 and
; stores the sum into a local stack slot.
define dso_local spir_func void @_Z4foo2v() {
entry:
%a = alloca i32, align 4
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
%add = add nsw i32 4, %0
store i32 %add, i32* %a, align 4
ret void
}

attributes #0 = { "sycl-module-id"="TU1.cpp" }
attributes #1 = { "sycl-module-id"="TU2.cpp" }

!opencl.spir.version = !{!0, !0}
!spirv.Source = !{!1, !1}

!0 = !{i32 1, i32 2}
!1 = !{i32 4, i32 100000}
1 change: 1 addition & 0 deletions llvm/test/tools/sycl-post-link/help.test
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ CHECK: =default - set spec constants to C++ defaults
CHECK: --split=<value> - split input module
CHECK: =source - 1 output module per source (translation unit)
CHECK: =kernel - 1 output module per kernel
CHECK: =auto - Choose split mode automatically
CHECK: --symbols - generate exported symbol files
Loading