From 1ad6bfd8aaa4f11e7e2307ac5a48a3de16552139 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 5 Mar 2020 18:53:30 -0800 Subject: [PATCH 1/3] [SYCL] Move LowerWGScope pass to LLVM project LowerWGScope pass is an llvm pass that performs SYCL specific transformations in LLVM IR right after frontend. LLVM passes are supposed to be in llvm project and not in clang project. Signed-off-by: Artur Gainullin --- clang/lib/CodeGen/BackendUtil.cpp | 1 - clang/lib/CodeGen/CMakeLists.txt | 4 +- clang/lib/CodeGen/CodeGenAction.cpp | 4 +- clang/lib/CodeGen/SYCLLowerIR/CMakeLists.txt | 18 -------- clang/test/CodeGenSYCL/hier_par.cpp | 43 ------------------- llvm/include/llvm/InitializePasses.h | 1 + llvm/include/llvm/LinkAllPasses.h | 2 + .../include/llvm}/SYCLLowerIR/LowerWGScope.h | 1 - llvm/lib/CMakeLists.txt | 1 + llvm/lib/LLVMBuild.txt | 1 + llvm/lib/SYCLLowerIR/CMakeLists.txt | 9 ++++ llvm/lib/SYCLLowerIR/LLVMBuild.txt | 20 +++++++++ .../lib}/SYCLLowerIR/LowerWGScope.cpp | 6 +-- .../lib}/SYCLLowerIR/README.txt | 0 llvm/tools/bugpoint/CMakeLists.txt | 1 + llvm/tools/opt/CMakeLists.txt | 1 + llvm/tools/opt/opt.cpp | 1 + sycl/test/hier_par/hier_par_wgscope.cpp | 2 +- 18 files changed, 43 insertions(+), 73 deletions(-) delete mode 100644 clang/lib/CodeGen/SYCLLowerIR/CMakeLists.txt delete mode 100644 clang/test/CodeGenSYCL/hier_par.cpp rename {clang/lib/CodeGen => llvm/include/llvm}/SYCLLowerIR/LowerWGScope.h (94%) create mode 100644 llvm/lib/SYCLLowerIR/CMakeLists.txt create mode 100644 llvm/lib/SYCLLowerIR/LLVMBuild.txt rename {clang/lib/CodeGen => llvm/lib}/SYCLLowerIR/LowerWGScope.cpp (99%) rename {clang/lib/CodeGen => llvm/lib}/SYCLLowerIR/README.txt (100%) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 0401aa26dcbc6..707bea4092d05 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "clang/CodeGen/BackendUtil.h" -#include "SYCLLowerIR/LowerWGScope.h" #include "clang/Basic/CodeGenOptions.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LangOptions.h" diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index a06fef5195bc3..5af0ac7f57d40 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -1,5 +1,3 @@ -add_subdirectory(SYCLLowerIR) - set(LLVM_LINK_COMPONENTS Analysis BitReader @@ -23,6 +21,7 @@ set(LLVM_LINK_COMPONENTS Remarks ScalarOpts Support + SYCLLowerIR Target TransformUtils ) @@ -112,5 +111,4 @@ add_clang_library(clangCodeGen clangFrontend clangLex clangSerialization - clangSYCLLowerIR ) diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index de9a9385f2f9a..4e1fe1308119f 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -10,7 +10,6 @@ #include "CodeGenModule.h" #include "CoverageMappingGen.h" #include "MacroPPCallbacks.h" -#include "SYCLLowerIR/LowerWGScope.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" @@ -39,6 +38,7 @@ #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" #include "llvm/Pass.h" +#include "llvm/SYCLLowerIR/LowerWGScope.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TimeProfiler.h" @@ -335,7 +335,7 @@ namespace clang { if (LangOpts.SYCLIsDevice) { PrettyStackTraceString CrashInfo("Pre-linking SYCL passes"); legacy::PassManager PreLinkingSyclPasses; - PreLinkingSyclPasses.add(createSYCLLowerWGScopePass()); + PreLinkingSyclPasses.add(llvm::createSYCLLowerWGScopePass()); PreLinkingSyclPasses.run(*getModule()); } diff --git a/clang/lib/CodeGen/SYCLLowerIR/CMakeLists.txt b/clang/lib/CodeGen/SYCLLowerIR/CMakeLists.txt deleted file mode 100644 index f9d744d74e676..0000000000000 --- a/clang/lib/CodeGen/SYCLLowerIR/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Core - Support - ) - -if(NOT CLANG_BUILT_STANDALONE) - set(tablegen_deps intrinsics_gen) -endif() - -add_clang_library(clangSYCLLowerIR - LowerWGScope.cpp - - DEPENDS - ${tablegen_deps} - - LINK_LIBS - clangBasic - ) diff --git a/clang/test/CodeGenSYCL/hier_par.cpp b/clang/test/CodeGenSYCL/hier_par.cpp deleted file mode 100644 index 6967600c2761b..0000000000000 --- a/clang/test/CodeGenSYCL/hier_par.cpp +++ /dev/null @@ -1,43 +0,0 @@ -//==- hier_par.cpp --- hierarchical parallelism regression tests -----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// RUN: %clangxx -O2 -I %S/Inputs -fsycl -fsycl-device-only -c -Xclang -emit-llvm -o %t.ll %s -// RUN: cat %t.ll | FileCheck %s - -// This test checks for bug fix regressions related to hierarchical parallelism. -// - bug1: private var's (cl::sycl::group argument) address shared locally -// the test checks that a "shadow" local variable is generated for the group -// argument -// -// This is compile-only test for now. -// -// XFAIL:* -#include "sycl.hpp" - -using namespace cl::sycl; - -void foo() { - int *ptr = nullptr; - - queue myQueue; - buffer buf(ptr, range<1>(1)); - - myQueue.submit([&](handler &cgh) { - auto dev_ptr = buf.get_access(cgh); - - cgh.parallel_for_work_group( - range<1>(1), range<1>(1), [=](group<1> g) { -// CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = internal unnamed_addr addrspace(3) global %[[GROUP_CLASS:"[^"]+"]] undef, align [[ALIGN:[0-9]+]] -// CHECK: define {{.*}} spir_func void @{{"[^"]+"}}({{[^,]+}}, %[[GROUP_CLASS]]* byval(%[[GROUP_CLASS]]) align {{[0-9]+}} %[[GROUP_OBJ:[A-Za-z_0-9]+]]) {{.*}}!work_group_scope{{.*}} { -// CHECK-NOT: {{^[ \t]*define}} -// CHECK: %[[TMP:[A-Za-z_0-9]+]] = bitcast %[[GROUP_CLASS]] addrspace(3)* @[[SHADOW]] to i8 addrspace(3)* -// CHECK: %[[OBJ:[A-Za-z_0-9]+]] = bitcast %[[GROUP_CLASS]]* %[[GROUP_OBJ]] to i8* -// CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align [[ALIGN]] %[[TMP]], {{[^,]+}} %[[OBJ]], {{[^)]+}}) - }); - }); -} diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 0499422e1b4b7..6de57384fa3e4 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -404,6 +404,7 @@ void initializeStripNonDebugSymbolsPass(PassRegistry&); void initializeStripNonLineTableDebugInfoPass(PassRegistry&); void initializeStripSymbolsPass(PassRegistry&); void initializeStructurizeCFGPass(PassRegistry&); +void initializeSYCLLowerWGScopeLegacyPassPass(PassRegistry &); void initializeTailCallElimPass(PassRegistry&); void initializeTailDuplicatePass(PassRegistry&); void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 51d89c4b16019..616dd7825f3bb 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -37,6 +37,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRPrintingPasses.h" +#include "llvm/SYCLLowerIR/LowerWGScope.h" #include "llvm/Support/Valgrind.h" #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" #include "llvm/Transforms/IPO.h" @@ -199,6 +200,7 @@ namespace { (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsLegacyPass(); (void) llvm::createExpandMemCmpPass(); + (void)llvm::createSYCLLowerWGScopePass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/clang/lib/CodeGen/SYCLLowerIR/LowerWGScope.h b/llvm/include/llvm/SYCLLowerIR/LowerWGScope.h similarity index 94% rename from clang/lib/CodeGen/SYCLLowerIR/LowerWGScope.h rename to llvm/include/llvm/SYCLLowerIR/LowerWGScope.h index bd705c0d88af6..c3b537ebb923c 100644 --- a/clang/lib/CodeGen/SYCLLowerIR/LowerWGScope.h +++ b/llvm/include/llvm/SYCLLowerIR/LowerWGScope.h @@ -25,7 +25,6 @@ class SYCLLowerWGScopePass : public PassInfoMixin { }; FunctionPass *createSYCLLowerWGScopePass(); -void initializeSYCLLowerWGScopeLegacyPassPass(PassRegistry &); } // namespace llvm diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt index 8f8d417124c87..4865f595429f7 100644 --- a/llvm/lib/CMakeLists.txt +++ b/llvm/lib/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(AsmParser) add_subdirectory(LineEditor) add_subdirectory(ProfileData) add_subdirectory(Passes) +add_subdirectory(SYCLLowerIR) add_subdirectory(TextAPI) add_subdirectory(ToolDrivers) add_subdirectory(XRay) diff --git a/llvm/lib/LLVMBuild.txt b/llvm/lib/LLVMBuild.txt index 1ae59791cd6c1..961ad89e40c56 100644 --- a/llvm/lib/LLVMBuild.txt +++ b/llvm/lib/LLVMBuild.txt @@ -42,6 +42,7 @@ subdirectories = Passes ProfileData Support + SYCLLowerIR TableGen TextAPI Target diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt new file mode 100644 index 0000000000000..7a327d7657b69 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -0,0 +1,9 @@ +add_llvm_component_library(LLVMSYCLLowerIR + LowerWGScope.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR + + DEPENDS + intrinsics_gen + ) diff --git a/llvm/lib/SYCLLowerIR/LLVMBuild.txt b/llvm/lib/SYCLLowerIR/LLVMBuild.txt new file mode 100644 index 0000000000000..19fd5a3f5d667 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/LLVMBuild.txt @@ -0,0 +1,20 @@ +;===- ./lib/SYCLLowerIR/LLVMBuild.txt -----------------------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Group +name = SYCLLowerIR +parent = Libraries diff --git a/clang/lib/CodeGen/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp similarity index 99% rename from clang/lib/CodeGen/SYCLLowerIR/LowerWGScope.cpp rename to llvm/lib/SYCLLowerIR/LowerWGScope.cpp index 4e13eb2df9ca5..f8bb128a435e8 100644 --- a/clang/lib/CodeGen/SYCLLowerIR/LowerWGScope.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp @@ -73,10 +73,7 @@ // et. al. //===----------------------------------------------------------------------===// -#include "LowerWGScope.h" - -#include "clang/Basic/AddressSpaces.h" - +#include "llvm/SYCLLowerIR/LowerWGScope.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -85,6 +82,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" diff --git a/clang/lib/CodeGen/SYCLLowerIR/README.txt b/llvm/lib/SYCLLowerIR/README.txt similarity index 100% rename from clang/lib/CodeGen/SYCLLowerIR/README.txt rename to llvm/lib/SYCLLowerIR/README.txt diff --git a/llvm/tools/bugpoint/CMakeLists.txt b/llvm/tools/bugpoint/CMakeLists.txt index 0b5998e181ebb..421889cfedb7f 100644 --- a/llvm/tools/bugpoint/CMakeLists.txt +++ b/llvm/tools/bugpoint/CMakeLists.txt @@ -16,6 +16,7 @@ set(LLVM_LINK_COMPONENTS ObjCARCOpts ScalarOpts Support + SYCLLowerIR Target TransformUtils Vectorize diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt index 79613c836c533..ad9e20bd0b439 100644 --- a/llvm/tools/opt/CMakeLists.txt +++ b/llvm/tools/opt/CMakeLists.txt @@ -18,6 +18,7 @@ set(LLVM_LINK_COMPONENTS Remarks ScalarOpts Support + SYCLLowerIR Target TransformUtils Vectorize diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index fe2500ad4ac32..46bdfd6fe0cc5 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -563,6 +563,7 @@ int main(int argc, char **argv) { initializeWriteBitcodePassPass(Registry); initializeHardwareLoopsPass(Registry); initializeTypePromotionPass(Registry); + initializeSYCLLowerWGScopeLegacyPassPass(Registry); #ifdef BUILD_EXAMPLES initializeExampleIRTransforms(Registry); diff --git a/sycl/test/hier_par/hier_par_wgscope.cpp b/sycl/test/hier_par/hier_par_wgscope.cpp index ae346a1789547..d624e6ba82d5f 100644 --- a/sycl/test/hier_par/hier_par_wgscope.cpp +++ b/sycl/test/hier_par/hier_par_wgscope.cpp @@ -12,7 +12,7 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -// RUN: %clangxx -O0 -fsycl %s -o %t.out +// RUN: %clangxx -O0 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out From 4436fa269f1e83fb3bcb0a362a44c2ac1fdfb474 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 9 Mar 2020 11:22:25 -0700 Subject: [PATCH 2/3] [SYCL] Add lit tests for SYCL specific CodeGen and LowerWGScope pass Signed-off-by: Artur Gainullin --- clang/test/CodeGenSYCL/wg_scope_var.cpp | 21 +++++++ llvm/test/SYCLLowerIR/byval_arg.ll | 27 +++++++++ llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 79 +++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 clang/test/CodeGenSYCL/wg_scope_var.cpp create mode 100644 llvm/test/SYCLLowerIR/byval_arg.ll create mode 100644 llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll diff --git a/clang/test/CodeGenSYCL/wg_scope_var.cpp b/clang/test/CodeGenSYCL/wg_scope_var.cpp new file mode 100644 index 0000000000000..27994ed19aeb2 --- /dev/null +++ b/clang/test/CodeGenSYCL/wg_scope_var.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple spir64-unknown-unknown-sycldevice -fsycl-is-device -disable-llvm-passes -I %S/Inputs -emit-llvm %s -o - | FileCheck %s + +// Checked that local variables declared by the user in PWFG scope are turned into globals in the local address space. +// CHECK: @{{.*myLocal.*}} = internal addrspace(3) global i32 0 + +#include "sycl.hpp" + +using namespace cl::sycl; + +int main() { + queue myQueue; + + myQueue.submit([&](handler &cgh) { + cgh.parallel_for_work_group( + range<3>(2, 2, 2), range<3>(2, 2, 2), [=](group<3> myGroup) { + int myLocal; + }); + }); + + return 0; +} diff --git a/llvm/test/SYCLLowerIR/byval_arg.ll b/llvm/test/SYCLLowerIR/byval_arg.ll new file mode 100644 index 0000000000000..5d65bdd982f8d --- /dev/null +++ b/llvm/test/SYCLLowerIR/byval_arg.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -LowerWGScope -S | FileCheck %s + +; Check that argument of the function marked with !work_group_scope +; attribute passed as byval is shared by leader work item via local +; memory to all work items + +%struct.baz = type { i64 } + +; CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = internal unnamed_addr addrspace(3) global %struct.baz undef + +define internal spir_func void @wibble(%struct.baz* byval(%struct.baz) %arg1) !work_group_scope !0 { +; CHECK-LABEL: @wibble( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex +; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[CMPZ]], label [[LEADER:%.*]], label [[MERGE:%.*]] +; CHECK: leader: +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.baz* [[ARG1:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 bitcast (%struct.baz addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i8* [[TMP2]], i64 8, i1 false) +; CHECK-NEXT: br label [[MERGE]] +; CHECK: merge: +; CHECK-NEXT: call void @__spirv_ControlBarrier(i32 2, i32 2, i32 272) +; CHECK-NEXT: ret void +; + ret void +} + +!0 = !{} diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll new file mode 100644 index 0000000000000..3eac5462bd296 --- /dev/null +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -LowerWGScope -S | FileCheck %s + +; Check that allocas which correspond to PFWI lambda object and a local copy of the PFWG lambda object +; are properly handled by LowerWGScope pass. Check that WG-shared local "shadow" variables are created +; and before each PFWI invocation leader WI stores its private copy of the variable into the shadow, +; then all WIs load the shadow value into their private copies ("materialize" the private copy). + +%struct.bar = type { i8 } +%struct.zot = type { %struct.widget, %struct.widget, %struct.widget, %struct.foo } +%struct.widget = type { %struct.barney } +%struct.barney = type { [3 x i64] } +%struct.foo = type { %struct.barney } +%struct.foo.0 = type { i8 } + +; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)* +; CHECK: @[[PFWI_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.foo.0 +; CHECK: @[[GROUP_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.zot + +define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.zot* byval(%struct.zot) align 8 %arg1) align 2 !work_group_scope !0 { +; CHECK-LABEL: @wibble( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex +; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]] +; CHECK: leader: +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.zot* [[ARG1:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 96, i1 false) +; CHECK-NEXT: br label [[MERGE]] +; CHECK: merge: +; CHECK-NEXT: call void @__spirv_ControlBarrier(i32 2, i32 2, i32 272) +; CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1 +; CHECK-NEXT: [[ID:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex +; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[ID]], 0 +; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]] +; CHECK: wg_leader: +; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [[STRUCT_ZOT:%.*]] addrspace(3)* @[[GROUP_SHADOW]] to [[STRUCT_ZOT]] addrspace(4)* +; CHECK-NEXT: store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4 +; CHECK-NEXT: br label [[WG_CF]] +; CHECK: wg_cf: +; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex +; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]] +; CHECK: TestMat: +; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP4]], i64 1, i1 false) +; CHECK-NEXT: [[MAT_LD:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]] +; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]] +; CHECK-NEXT: br label [[LEADERMAT]] +; CHECK: LeaderMat: +; CHECK-NEXT: call void @__spirv_ControlBarrier(i32 2, i32 2, i32 272) +; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]] +; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD1]], [[STRUCT_BAR]] addrspace(4)** [[TMP]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP5]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false) +; CHECK-NEXT: call void @__spirv_ControlBarrier(i32 2, i32 2, i32 272) +; CHECK-NEXT: [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4 +; CHECK-NEXT: call spir_func void @bar(%struct.zot addrspace(4)* [[WG_VAL_TMP4]], %struct.foo.0* byval(%struct.foo.0) align 1 [[TMP2]]) +; CHECK-NEXT: ret void +; +bb: + %tmp = alloca %struct.bar addrspace(4)*, align 8 + %tmp2 = alloca %struct.foo.0, align 1 + store %struct.bar addrspace(4)* %arg, %struct.bar addrspace(4)** %tmp, align 8 + %tmp3 = load %struct.bar addrspace(4)*, %struct.bar addrspace(4)** %tmp, align 8 + %tmp4 = addrspacecast %struct.zot* %arg1 to %struct.zot addrspace(4)* + call spir_func void @bar(%struct.zot addrspace(4)* %tmp4, %struct.foo.0* byval(%struct.foo.0) align 1 %tmp2) + ret void +} + +define internal spir_func void @bar(%struct.zot addrspace(4)* %arg, %struct.foo.0* byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { +bb: + ret void +} + +!0 = !{} From 838163b84a3344865522fe9292e5984738610607 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 9 Mar 2020 17:39:28 -0700 Subject: [PATCH 3/3] [SYCL] WG-shared global variables must have external linkage Currently hierarchical parallelism semantics is handled by SYCL specific code generation and LowerWGScope pass. WG-shared global variables are created for automatic variables in PFWG scope by CG and WG-shared shadow variables are created by LowerWGScope pass to broadcast private value from leader work item to other work items. Currently these global variables are created with internal linkage which is not correct. As a result wrong transformations are happening in the LLVM middle end. For example, ... if (Leader work item) store %PrivateValue to @SharedGlobal -> leader shares the value memory_barrier() load %PrivateValue from @SharedGlobal -> all WIs load the shared value ... Generated load/store operations are not supposed to be moved across memory barrier but barrier intrinsics like @llvm.nvvm.barrier0() are considered as regular functions in the LLVM middle end. As soon as global has an interanl linkage it is considered as non-escaping and alias analysis thinks that @llvm.nvvm.barrier0() cannot modify global variable and only reads it. As a result the following transformation is performed by GVN: ... crit_edge: load %PrivateValue from @SharedGlobal -> all WIs load the shared value if (Leader work item) store %PrivateValue to @SharedGlobal -> leader shares the value memory_barrier() ... That is why all WG-shared variables should have external linkage. Signed-off-by: Artur Gainullin --- clang/lib/CodeGen/CGSYCLRuntime.cpp | 2 +- clang/test/CodeGenSYCL/wg_scope_var.cpp | 2 +- llvm/lib/SYCLLowerIR/LowerWGScope.cpp | 2 +- llvm/test/SYCLLowerIR/byval_arg.ll | 2 +- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 6 +++--- sycl/test/hier_par/hier_par_basic.cpp | 3 --- sycl/test/hier_par/hier_par_wgscope.cpp | 3 --- 7 files changed, 7 insertions(+), 13 deletions(-) diff --git a/clang/lib/CodeGen/CGSYCLRuntime.cpp b/clang/lib/CodeGen/CGSYCLRuntime.cpp index fb3adf4e508cd..8ce1b884d5c5b 100644 --- a/clang/lib/CodeGen/CGSYCLRuntime.cpp +++ b/clang/lib/CodeGen/CGSYCLRuntime.cpp @@ -82,7 +82,7 @@ void CGSYCLRuntime::emitWorkGroupLocalVarDecl(CodeGenFunction &CGF, #endif // NDEBUG // generate global variable in the address space selected by the clang CodeGen // (should be local) - CGF.EmitStaticVarDecl(D, llvm::GlobalValue::InternalLinkage); + CGF.EmitStaticVarDecl(D, llvm::GlobalValue::ExternalLinkage); } bool CGSYCLRuntime::actOnAutoVarEmit(CodeGenFunction &CGF, const VarDecl &D, diff --git a/clang/test/CodeGenSYCL/wg_scope_var.cpp b/clang/test/CodeGenSYCL/wg_scope_var.cpp index 27994ed19aeb2..9ed9719ba260a 100644 --- a/clang/test/CodeGenSYCL/wg_scope_var.cpp +++ b/clang/test/CodeGenSYCL/wg_scope_var.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -triple spir64-unknown-unknown-sycldevice -fsycl-is-device -disable-llvm-passes -I %S/Inputs -emit-llvm %s -o - | FileCheck %s // Checked that local variables declared by the user in PWFG scope are turned into globals in the local address space. -// CHECK: @{{.*myLocal.*}} = internal addrspace(3) global i32 0 +// CHECK: @{{.*myLocal.*}} = addrspace(3) global i32 0 #include "sycl.hpp" diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp index f8bb128a435e8..12ba1c6a714d2 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp @@ -838,7 +838,7 @@ GlobalVariable *spirv::createWGLocalVariable(Module &M, Type *T, new GlobalVariable(M, // module T, // type false, // isConstant - GlobalValue::InternalLinkage, // Linkage + GlobalValue::ExternalLinkage, // Linkage UndefValue::get(T), // Initializer Name, // Name nullptr, // InsertBefore diff --git a/llvm/test/SYCLLowerIR/byval_arg.ll b/llvm/test/SYCLLowerIR/byval_arg.ll index 5d65bdd982f8d..15808dadc85d6 100644 --- a/llvm/test/SYCLLowerIR/byval_arg.ll +++ b/llvm/test/SYCLLowerIR/byval_arg.ll @@ -6,7 +6,7 @@ %struct.baz = type { i64 } -; CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = internal unnamed_addr addrspace(3) global %struct.baz undef +; CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = unnamed_addr addrspace(3) global %struct.baz define internal spir_func void @wibble(%struct.baz* byval(%struct.baz) %arg1) !work_group_scope !0 { ; CHECK-LABEL: @wibble( diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index 3eac5462bd296..3dadc922e877e 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -13,9 +13,9 @@ %struct.foo = type { %struct.barney } %struct.foo.0 = type { i8 } -; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)* -; CHECK: @[[PFWI_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.foo.0 -; CHECK: @[[GROUP_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.zot +; CHECK: @[[PFWG_SHADOW:.*]] = unnamed_addr addrspace(3) global %struct.bar addrspace(4)* +; CHECK: @[[PFWI_SHADOW:.*]] = unnamed_addr addrspace(3) global %struct.foo.0 +; CHECK: @[[GROUP_SHADOW:.*]] = unnamed_addr addrspace(3) global %struct.zot define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.zot* byval(%struct.zot) align 8 %arg1) align 2 !work_group_scope !0 { ; CHECK-LABEL: @wibble( diff --git a/sycl/test/hier_par/hier_par_basic.cpp b/sycl/test/hier_par/hier_par_basic.cpp index 6caf3169f555f..d1a94ea1a7112 100644 --- a/sycl/test/hier_par/hier_par_basic.cpp +++ b/sycl/test/hier_par/hier_par_basic.cpp @@ -12,9 +12,6 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -// TODO: ptxas fatal : Unresolved extern function '__spirv_ControlBarrier' -// XFAIL: cuda - // This test checks hierarchical parallelism invocation APIs, but without any // data or code with side-effects between the work group and work item scopes. diff --git a/sycl/test/hier_par/hier_par_wgscope.cpp b/sycl/test/hier_par/hier_par_wgscope.cpp index d624e6ba82d5f..91bacfa57f44a 100644 --- a/sycl/test/hier_par/hier_par_wgscope.cpp +++ b/sycl/test/hier_par/hier_par_wgscope.cpp @@ -18,9 +18,6 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -// TODO: ptxas fatal : Unresolved extern function '__spirv_ControlBarrier' -// UNSUPPORTED: cuda - // This test checks correctness of hierarchical kernel execution when there is // code and data in the work group scope.