From c3bb9e574a0b591cc3cd24421616adbde38f67db Mon Sep 17 00:00:00 2001
From: Denis Bakhvalov
Date: Mon, 1 Feb 2021 19:12:12 -0800
Subject: [PATCH 1/3] [ESIMD] Process every function in LowerESIMD

This patch is a preparation for moving ESIMD-specific passes into
sycl-post-link. Right now, only functions marked with the
!sycl_explicit_simd metadata are lowered by the LowerESIMD pass. In
sycl-post-link, once ESIMD kernels have been split from SYCL kernels,
there is no code shared between the two kernel types, and the
LowerESIMD pass can safely process a module containing ESIMD kernels
without affecting the other kernel type (SYCL).

This change is also safe to make now because, as of today, mixing SYCL
and ESIMD kernels in one source file and in one program is not allowed.
Thus, LowerESIMD can operate in "exclusive" mode without worrying about
SYCL kernels.

The ESIMD CFG markup made in `Sema::MarkSyclSimd` will be removed in
subsequent patches.
---
 llvm/lib/SYCLLowerIR/LowerESIMD.cpp          | 4 ----
 llvm/test/SYCLLowerIR/esimd_lower_intrins.ll | 4 +++-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index 2163891c15655..2617b80bf52f0 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -1234,10 +1234,6 @@ void SYCLLowerESIMDLegacyPass::collectGenXVolatileType(Module &M) {
 PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
                                           FunctionAnalysisManager &FAM,
                                           SmallPtrSet &GVTS) {
-  // Only consider functions marked with !sycl_explicit_simd
-  if (F.getMetadata("sycl_explicit_simd") == nullptr)
-    return PreservedAnalyses::all();
-
   SmallVector ESIMDIntrCalls;
   SmallVector ESIMDToErases;

diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
index bfa05bf69b182..0093717364ab0 100644
--- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
+++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -14,7 +14,9 @@ target triple = "spir64-unknown-unknown-sycldevice"
 @vg = dso_local global %"cm::gen::simd" zeroinitializer, align 64 #0
 @vc = dso_local addrspace(1) global <32 x i32> zeroinitializer

-define dso_local spir_func <32 x i32> @FUNC_1() !sycl_explicit_simd !1 {
+; LowerESIMD pass should process every function;
+; !sycl_explicit_simd metadata is not necessary.
+define dso_local spir_func <32 x i32> @FUNC_1() {
   %a_1 = alloca <32 x i64>
   %1 = load <32 x i64>, <32 x i64>* %a_1
   %a_2 = alloca <32 x i16>

From f298287a0ad498e5527b243f664d76dbc33f0c36 Mon Sep 17 00:00:00 2001
From: Denis Bakhvalov
Date: Tue, 2 Feb 2021 10:24:52 -0800
Subject: [PATCH 2/3] Fixed esimd/slm_load4.cpp test

---
 sycl/test/esimd/slm_load4.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/sycl/test/esimd/slm_load4.cpp b/sycl/test/esimd/slm_load4.cpp
index a544a5c781f61..d173e8b505097 100644
--- a/sycl/test/esimd/slm_load4.cpp
+++ b/sycl/test/esimd/slm_load4.cpp
@@ -9,15 +9,22 @@
 using namespace sycl::INTEL::gpu;
 using namespace cl::sycl;

-void kernel() __attribute__((sycl_device)) {
-  simd offsets(0, 1);
-  simd v1(0, 1);
+template
+__attribute__((sycl_kernel)) void kernel_call(Func kernelFunc) {
+  kernelFunc();
+}
+
+void caller() {
+  kernel_call([=]() SYCL_ESIMD_KERNEL {
+    simd offsets(0, 1);
+    simd v1(0, 1);

-  slm_init(1024);
+    slm_init(1024);

-  auto v0 = slm_load4(offsets);
+    auto v0 = slm_load4(offsets);

-  v0 = v0 + v1;
+    v0 = v0 + v1;

-  slm_store4(v0, offsets);
+    slm_store4(v0, offsets);
+  });
 }

From c6f5f23da4d5506f628b3a739cb0a8291ff590f3 Mon Sep 17 00:00:00 2001
From: Denis Bakhvalov
Date: Tue, 2 Feb 2021 22:37:13 -0800
Subject: [PATCH 3/3] Removed sycl_explicit_simd metadata

---
 llvm/test/SYCLLowerIR/esimd_global.ll        |  2 +-
 llvm/test/SYCLLowerIR/esimd_lower_intrins.ll | 64 ++++++++++----------
 llvm/test/SYCLLowerIR/scalar_fptoui.ll       |  2 +-
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/llvm/test/SYCLLowerIR/esimd_global.ll b/llvm/test/SYCLLowerIR/esimd_global.ll
index c6a30cfb055be..9613b0dea271a 100644
--- a/llvm/test/SYCLLowerIR/esimd_global.ll
+++ b/llvm/test/SYCLLowerIR/esimd_global.ll
@@ -79,7 +79,7 @@ declare void @llvm.lifetime.start.p0i8(i64 immarg %0, i8* nocapture %1) #2
 declare void @llvm.lifetime.end.p0i8(i64 immarg %0, i8* nocapture %1) #2

 ; Function Attrs: noinline norecurse nounwind
-define dso_local spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v) local_unnamed_addr #3 !sycl_explicit_simd !12 {
+define dso_local spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v) local_unnamed_addr #3 {
 entry:
   %agg.tmp = alloca %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", align 64
   %0 = addrspacecast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)*
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
index 0093717364ab0..aedd42865c8ad 100644
--- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
+++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -26,7 +26,7 @@ define dso_local spir_func <32 x i32> @FUNC_1() {
   ret <32 x i32> %ret_val
 }

-define dso_local spir_func <32 x i32> @FUNC_2() !sycl_explicit_simd !1 {
+define dso_local spir_func <32 x i32> @FUNC_2() {
   %a_1 = alloca <32 x i64>
   %1 = load <32 x i64>, <32 x i64>* %a_1
   %a_2 = alloca <32 x i32>
@@ -38,7 +38,7 @@ define dso_local spir_func <32 x i32> @FUNC_2() !sycl_explicit_simd !1 {
   ret <32 x i32> %ret_val
 }

-define dso_local spir_func <32 x i32> @FUNC_3() !sycl_explicit_simd !1 {
+define dso_local spir_func <32 x i32> @FUNC_3() {
   %a_1 = alloca <32 x i64>
   %1 = load <32 x i64>, <32 x i64>* %a_1
   %a_2 = alloca <32 x i32>
@@ -52,13 +52,13 @@ define dso_local spir_func <32 x i32> @FUNC_3() !sycl_explicit_simd !1 {
   ret <32 x i32> %ret_val
 }

-define dso_local spir_func <32 x i32> @FUNC_4() !sycl_explicit_simd !1 {
+define dso_local spir_func <32 x i32> @FUNC_4() {
   %ret_val = call spir_func <32 x i32> @_Z33__esimd_flat_block_read_unalignedIjLi32ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XT0_EE4typeEy(i64 0)
   ; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32.i64(i64 0)
   ret <32 x i32> %ret_val
 }

-define dso_local spir_func void @FUNC_5() !sycl_explicit_simd !1 {
+define dso_local spir_func void @FUNC_5() {
   %a_1 = alloca <32 x i32>
   %1 = load <32 x i32>, <32 x i32>* %a_1
   call spir_func void @_Z24__esimd_flat_block_writeIjLi32ELN2cm3gen9CacheHintE0ELS2_0EEvyNS1_13__vector_typeIT_XT0_EE4typeE(i64 0, <32 x i32> %1)
@@ -66,7 +66,7 @@ define dso_local spir_func void @FUNC_5() !sycl_explicit_simd !1 {
   ret void
 }

-define dso_local spir_func <32 x i32> @FUNC_6() !sycl_explicit_simd !1 {
+define dso_local spir_func <32 x i32> @FUNC_6() {
   %a_1 = alloca <32 x i64>
   %1 = load <32 x i64>, <32 x i64>* %a_1
   %a_2 = alloca <32 x i16>
@@ -76,7 +76,7 @@ define dso_local spir_func <32 x i32> @FUNC_6() !sycl_explicit_simd !1 {
   ret <32 x i32> %ret_val
 }

-define dso_local spir_func void @FUNC_7() !sycl_explicit_simd !1 {
+define dso_local spir_func void @FUNC_7() {
   %a_1 = alloca <32 x i64>
   %1 = load <32 x i64>, <32 x i64>* %a_1
   %a_2 = alloca <32 x i32>
@@ -88,7 +88,7 @@ define dso_local spir_func void @FUNC_7() !sycl_explicit_simd !1 {
   ret void
 }

-define dso_local spir_func <16 x i16> @FUNC_8() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i16> @FUNC_8() {
   %a_1 = alloca <16 x i16>
   %1 = load <16 x i16>, <16 x i16>* %a_1
   %a_2 = alloca <16 x i16>
@@ -98,7 +98,7 @@ define dso_local spir_func <16 x i16> @FUNC_8() !sycl_explicit_simd !1 {
   ret <16 x i16> %ret_val
 }

-define dso_local spir_func <1 x float> @FUNC_9() !sycl_explicit_simd !1 {
+define dso_local spir_func <1 x float> @FUNC_9() {
   %a_1 = alloca <1 x float>
   %1 = load <1 x float>, <1 x float>* %a_1
   %a_2 = alloca <1 x float>
@@ -108,7 +108,7 @@ define dso_local spir_func <1 x float> @FUNC_9() !sycl_explicit_simd !1 {
   ret <1 x float> %ret_val
 }

-define dso_local spir_func <8 x float> @FUNC_10() !sycl_explicit_simd !1 {
+define dso_local spir_func <8 x float> @FUNC_10() {
   %a_1 = alloca <16 x float>
   %1 = load <16 x float>, <16 x float>* %a_1
   %ret_val = call spir_func <8 x float> @_Z16__esimd_rdregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT1_EE4typeENS2_IS3_XT0_EE4typeEt(<16 x float> %1, i16 zeroext 0)
@@ -116,7 +116,7 @@ define dso_local spir_func <8 x float> @FUNC_10() !sycl_explicit_simd !1 {
   ret <8 x float> %ret_val
 }

-define dso_local spir_func <16 x float> @FUNC_11() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x float> @FUNC_11() {
   %a_1 = alloca <16 x float>
   %1 = load <16 x float>, <16 x float>* %a_1
   %a_2 = alloca <8 x float>
@@ -126,13 +126,13 @@ define dso_local spir_func <16 x float> @FUNC_11() !sycl_explicit_simd !1 {
   ret <16 x float> %ret_val
 }

-define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 {
+define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) {
   %ret_val = call spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14ocl_image2d_roEN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeEjT2_jjjj(i32 0, %opencl.image2d_ro_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2)
   ; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}})
   ret <32 x i32> %ret_val
 }

-define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 {
+define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) {
   %a_3 = alloca <32 x i32>
   %4 = load <32 x i32>, <32 x i32>* %a_3
   call spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 0, %opencl.image2d_wo_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2, <32 x i32> %4)
@@ -140,7 +140,7 @@ define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0,
   ret void
 }

-define dso_local spir_func <16 x i32> @FUNC_23() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_23() {
   %ret_val = call spir_func <16 x i32> @_Z13__esimd_vloadIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*))
   ; CHECK: %ret_val1 = load <16 x i32>, <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*), align 64
   ; TODO: testcase to generate this:
@@ -148,14 +148,14 @@ define dso_local spir_func <16 x i32> @FUNC_23() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func void @FUNC_28(<32 x i32> %0) !sycl_explicit_simd !1 {
+define dso_local spir_func void @FUNC_28(<32 x i32> %0) {
   call spir_func void @_Z14__esimd_vstoreIiLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), <32 x i32> %0)
   ; CHECK: store <32 x i32> %0, <32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), align 128
   ret void
 }

-define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 {
+define dso_local spir_func void @FUNC_29() {
   %a_1 = alloca <32 x i32>
   %1 = addrspacecast <32 x i32>* %a_1 to <32 x i32> addrspace(4)*
   %a_2 = alloca <32 x i32>
@@ -165,15 +165,15 @@ define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 {
   ret void
 }

-define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 {
-; CHECK: define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1
+define dso_local spir_kernel void @FUNC_30() {
+; CHECK: define dso_local spir_kernel void @FUNC_30()
   call spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32 1023)
   ret void
 ; CHECK-NEXT: ret void
 }

-define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
-; CHECK: define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1
+define dso_local spir_kernel void @FUNC_31() {
+; CHECK: define dso_local spir_kernel void @FUNC_31()
   %call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
 ; CHECK-NEXT: %call.esimd = call <3 x i32> @llvm.genx.local.id.v3i32()
 ; CHECK-NEXT: %local_id.x = extractelement <3 x i32> %call.esimd, i32 0
@@ -182,7 +182,7 @@ define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
 ; CHECK-NEXT: ret void
 }

-define dso_local spir_func <16 x i32> @FUNC_32() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_32() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -194,7 +194,7 @@ define dso_local spir_func <16 x i32> @FUNC_32() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_33() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_33() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -206,7 +206,7 @@ define dso_local spir_func <16 x i32> @FUNC_33() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_34() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_34() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -218,7 +218,7 @@ define dso_local spir_func <16 x i32> @FUNC_34() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_35() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_35() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -230,7 +230,7 @@ define dso_local spir_func <16 x i32> @FUNC_35() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_36() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_36() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -242,7 +242,7 @@ define dso_local spir_func <16 x i32> @FUNC_36() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_37() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_37() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -254,7 +254,7 @@ define dso_local spir_func <16 x i32> @FUNC_37() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_38() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_38() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -266,7 +266,7 @@ define dso_local spir_func <16 x i32> @FUNC_38() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_39() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_39() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <16 x i32>
@@ -278,25 +278,25 @@ define dso_local spir_func <16 x i32> @FUNC_39() !sycl_explicit_simd !1 {
   ret <16 x i32> %ret_val
 }

-define dso_local spir_func <8 x i32> @FUNC_40() !sycl_explicit_simd !1 {
+define dso_local spir_func <8 x i32> @FUNC_40() {
   %ret_val = call spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 0)
   ; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.oword.ld.v8i32(i32 0, i32 254, i32 0)
   ret <8 x i32> %ret_val
 }

-define dso_local spir_func void @FUNC_41() !sycl_explicit_simd !1 {
+define dso_local spir_func void @FUNC_41() {
   call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 1)
   ; CHECK: call void @llvm.genx.sbarrier(i8 1)
   ret void
 }

-define dso_local spir_func void @FUNC_42() !sycl_explicit_simd !1 {
+define dso_local spir_func void @FUNC_42() {
   call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 0)
   ; CHECK: call void @llvm.genx.sbarrier(i8 0)
   ret void
 }

-define dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 {
+define dso_local spir_func <8 x i32> @FUNC_43() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <8 x i16>
@@ -306,7 +306,7 @@ define dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 {
   ret <8 x i32> %ret_val
 }

-define dso_local spir_func <16 x i32> @FUNC_44() !sycl_explicit_simd !1 {
+define dso_local spir_func <16 x i32> @FUNC_44() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
   %a_2 = alloca <8 x i32>
diff --git a/llvm/test/SYCLLowerIR/scalar_fptoui.ll b/llvm/test/SYCLLowerIR/scalar_fptoui.ll
index a17270c711206..496c493eff82c 100644
--- a/llvm/test/SYCLLowerIR/scalar_fptoui.ll
+++ b/llvm/test/SYCLLowerIR/scalar_fptoui.ll
@@ -7,7 +7,7 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 target triple = "spir64-unknown-unknown-sycldevice"

 ; Function Attrs: convergent norecurse
-define dso_local spir_func i32 @foo(float %x) !sycl_explicit_simd !1 {
+define dso_local spir_func i32 @foo(float %x) {
   %y = fptoui float %x to i32
   ; check that the scalar float to unsigned int conversion is left intact
   ; CHECK: %y = fptoui float %x to i32
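
Note: patch 2 moves the ESIMD code from a free function marked __attribute__((sycl_device)) into the usual SYCL single-task invocation pattern, with the kernel body in a lambda marked SYCL_ESIMD_KERNEL. A minimal sketch of that pattern follows; the kernel-name template parameter and the comments are assumptions for illustration (the angle-bracket template arguments were lost from the diff above), not the exact contents of slm_load4.cpp.

// Illustrative sketch only; template parameters are assumed, not taken
// from the actual test source.
template <typename Name, typename Func>
__attribute__((sycl_kernel)) void kernel_call(Func kernelFunc) {
  kernelFunc(); // the lambda body becomes the device kernel
}

void caller() {
  kernel_call<class SlmLoad4Kernel>([=]() SYCL_ESIMD_KERNEL {
    // ESIMD code goes here; after this series, LowerESIMD lowers it
    // without requiring !sycl_explicit_simd metadata on the function.
  });
}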