diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp index 2163891c15655..2617b80bf52f0 100644 --- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp @@ -1234,10 +1234,6 @@ void SYCLLowerESIMDLegacyPass::collectGenXVolatileType(Module &M) { PreservedAnalyses SYCLLowerESIMDPass::run(Function &F, FunctionAnalysisManager &FAM, SmallPtrSet &GVTS) { - // Only consider functions marked with !sycl_explicit_simd - if (F.getMetadata("sycl_explicit_simd") == nullptr) - return PreservedAnalyses::all(); - SmallVector ESIMDIntrCalls; SmallVector ESIMDToErases; diff --git a/llvm/test/SYCLLowerIR/esimd_global.ll b/llvm/test/SYCLLowerIR/esimd_global.ll index c6a30cfb055be..9613b0dea271a 100644 --- a/llvm/test/SYCLLowerIR/esimd_global.ll +++ b/llvm/test/SYCLLowerIR/esimd_global.ll @@ -79,7 +79,7 @@ declare void @llvm.lifetime.start.p0i8(i64 immarg %0, i8* nocapture %1) #2 declare void @llvm.lifetime.end.p0i8(i64 immarg %0, i8* nocapture %1) #2 ; Function Attrs: noinline norecurse nounwind -define dso_local spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v) local_unnamed_addr #3 !sycl_explicit_simd !12 { +define dso_local spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v) local_unnamed_addr #3 { entry: %agg.tmp = alloca %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", align 64 %0 = addrspacecast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll index bfa05bf69b182..aedd42865c8ad 100644 --- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll +++ 
b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll @@ -14,7 +14,9 @@ target triple = "spir64-unknown-unknown-sycldevice" @vg = dso_local global %"cm::gen::simd" zeroinitializer, align 64 #0 @vc = dso_local addrspace(1) global <32 x i32> zeroinitializer -define dso_local spir_func <32 x i32> @FUNC_1() !sycl_explicit_simd !1 { +; The LowerESIMD pass should process every function; +; the !sycl_explicit_simd metadata is not necessary. +define dso_local spir_func <32 x i32> @FUNC_1() { %a_1 = alloca <32 x i64> %1 = load <32 x i64>, <32 x i64>* %a_1 %a_2 = alloca <32 x i16> @@ -24,7 +26,7 @@ define dso_local spir_func <32 x i32> @FUNC_1() !sycl_explicit_simd !1 { ret <32 x i32> %ret_val } -define dso_local spir_func <32 x i32> @FUNC_2() !sycl_explicit_simd !1 { +define dso_local spir_func <32 x i32> @FUNC_2() { %a_1 = alloca <32 x i64> %1 = load <32 x i64>, <32 x i64>* %a_1 %a_2 = alloca <32 x i32> @@ -36,7 +38,7 @@ define dso_local spir_func <32 x i32> @FUNC_2() !sycl_explicit_simd !1 { ret <32 x i32> %ret_val } -define dso_local spir_func <32 x i32> @FUNC_3() !sycl_explicit_simd !1 { +define dso_local spir_func <32 x i32> @FUNC_3() { %a_1 = alloca <32 x i64> %1 = load <32 x i64>, <32 x i64>* %a_1 %a_2 = alloca <32 x i32> @@ -50,13 +52,13 @@ define dso_local spir_func <32 x i32> @FUNC_3() !sycl_explicit_simd !1 { ret <32 x i32> %ret_val } -define dso_local spir_func <32 x i32> @FUNC_4() !sycl_explicit_simd !1 { +define dso_local spir_func <32 x i32> @FUNC_4() { %ret_val = call spir_func <32 x i32> @_Z33__esimd_flat_block_read_unalignedIjLi32ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XT0_EE4typeEy(i64 0) ; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32.i64(i64 0) ret <32 x i32> %ret_val } -define dso_local spir_func void @FUNC_5() !sycl_explicit_simd !1 { +define dso_local spir_func void @FUNC_5() { %a_1 = alloca <32 x i32> %1 = load <32 x i32>, <32 x i32>* %a_1 call spir_func void 
@_Z24__esimd_flat_block_writeIjLi32ELN2cm3gen9CacheHintE0ELS2_0EEvyNS1_13__vector_typeIT_XT0_EE4typeE(i64 0, <32 x i32> %1) @@ -64,7 +66,7 @@ define dso_local spir_func void @FUNC_5() !sycl_explicit_simd !1 { ret void } -define dso_local spir_func <32 x i32> @FUNC_6() !sycl_explicit_simd !1 { +define dso_local spir_func <32 x i32> @FUNC_6() { %a_1 = alloca <32 x i64> %1 = load <32 x i64>, <32 x i64>* %a_1 %a_2 = alloca <32 x i16> @@ -74,7 +76,7 @@ define dso_local spir_func <32 x i32> @FUNC_6() !sycl_explicit_simd !1 { ret <32 x i32> %ret_val } -define dso_local spir_func void @FUNC_7() !sycl_explicit_simd !1 { +define dso_local spir_func void @FUNC_7() { %a_1 = alloca <32 x i64> %1 = load <32 x i64>, <32 x i64>* %a_1 %a_2 = alloca <32 x i32> @@ -86,7 +88,7 @@ define dso_local spir_func void @FUNC_7() !sycl_explicit_simd !1 { ret void } -define dso_local spir_func <16 x i16> @FUNC_8() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i16> @FUNC_8() { %a_1 = alloca <16 x i16> %1 = load <16 x i16>, <16 x i16>* %a_1 %a_2 = alloca <16 x i16> @@ -96,7 +98,7 @@ define dso_local spir_func <16 x i16> @FUNC_8() !sycl_explicit_simd !1 { ret <16 x i16> %ret_val } -define dso_local spir_func <1 x float> @FUNC_9() !sycl_explicit_simd !1 { +define dso_local spir_func <1 x float> @FUNC_9() { %a_1 = alloca <1 x float> %1 = load <1 x float>, <1 x float>* %a_1 %a_2 = alloca <1 x float> @@ -106,7 +108,7 @@ define dso_local spir_func <1 x float> @FUNC_9() !sycl_explicit_simd !1 { ret <1 x float> %ret_val } -define dso_local spir_func <8 x float> @FUNC_10() !sycl_explicit_simd !1 { +define dso_local spir_func <8 x float> @FUNC_10() { %a_1 = alloca <16 x float> %1 = load <16 x float>, <16 x float>* %a_1 %ret_val = call spir_func <8 x float> @_Z16__esimd_rdregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT1_EE4typeENS2_IS3_XT0_EE4typeEt(<16 x float> %1, i16 zeroext 0) @@ -114,7 +116,7 @@ define dso_local spir_func <8 x float> @FUNC_10() !sycl_explicit_simd !1 { ret 
<8 x float> %ret_val } -define dso_local spir_func <16 x float> @FUNC_11() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x float> @FUNC_11() { %a_1 = alloca <16 x float> %1 = load <16 x float>, <16 x float>* %a_1 %a_2 = alloca <8 x float> @@ -124,13 +126,13 @@ define dso_local spir_func <16 x float> @FUNC_11() !sycl_explicit_simd !1 { ret <16 x float> %ret_val } -define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 { +define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) { %ret_val = call spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14ocl_image2d_roEN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeEjT2_jjjj(i32 0, %opencl.image2d_ro_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2) ; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}) ret <32 x i32> %ret_val } -define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 { +define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) { %a_3 = alloca <32 x i32> %4 = load <32 x i32>, <32 x i32>* %a_3 call spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 0, %opencl.image2d_wo_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2, <32 x i32> %4) @@ -138,7 +140,7 @@ define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, ret void } -define dso_local spir_func <16 x i32> @FUNC_23() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_23() { %ret_val = call spir_func <16 x i32> @_Z13__esimd_vloadIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, 
i32 0, i32 0) to <16 x i32> addrspace(4)*)) ; CHECK: %ret_val1 = load <16 x i32>, <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*), align 64 ; TODO: testcase to generate this: @@ -146,14 +148,14 @@ define dso_local spir_func <16 x i32> @FUNC_23() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func void @FUNC_28(<32 x i32> %0) !sycl_explicit_simd !1 { +define dso_local spir_func void @FUNC_28(<32 x i32> %0) { call spir_func void @_Z14__esimd_vstoreIiLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), <32 x i32> %0) ; CHECK: store <32 x i32> %0, <32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), align 128 ret void } -define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 { +define dso_local spir_func void @FUNC_29() { %a_1 = alloca <32 x i32> %1 = addrspacecast <32 x i32>* %a_1 to <32 x i32> addrspace(4)* %a_2 = alloca <32 x i32> @@ -163,15 +165,15 @@ define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 { ret void } -define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 { -; CHECK: define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 +define dso_local spir_kernel void @FUNC_30() { +; CHECK: define dso_local spir_kernel void @FUNC_30() call spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32 1023) ret void ; CHECK-NEXT: ret void } -define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 { -; CHECK: define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 +define dso_local spir_kernel void @FUNC_31() { +; CHECK: define dso_local spir_kernel void @FUNC_31() %call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv() ; CHECK-NEXT: %call.esimd = call <3 x i32> @llvm.genx.local.id.v3i32() ; CHECK-NEXT: %local_id.x = 
extractelement <3 x i32> %call.esimd, i32 0 @@ -180,7 +182,7 @@ define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 { ; CHECK-NEXT: ret void } -define dso_local spir_func <16 x i32> @FUNC_32() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_32() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -192,7 +194,7 @@ define dso_local spir_func <16 x i32> @FUNC_32() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_33() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_33() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -204,7 +206,7 @@ define dso_local spir_func <16 x i32> @FUNC_33() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_34() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_34() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -216,7 +218,7 @@ define dso_local spir_func <16 x i32> @FUNC_34() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_35() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_35() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -228,7 +230,7 @@ define dso_local spir_func <16 x i32> @FUNC_35() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_36() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_36() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -240,7 +242,7 @@ define dso_local spir_func <16 x i32> @FUNC_36() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_37() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_37() { %a_1 = alloca <16 x i32> %1 = 
load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -252,7 +254,7 @@ define dso_local spir_func <16 x i32> @FUNC_37() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_38() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_38() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -264,7 +266,7 @@ define dso_local spir_func <16 x i32> @FUNC_38() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_39() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_39() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <16 x i32> @@ -276,25 +278,25 @@ define dso_local spir_func <16 x i32> @FUNC_39() !sycl_explicit_simd !1 { ret <16 x i32> %ret_val } -define dso_local spir_func <8 x i32> @FUNC_40() !sycl_explicit_simd !1 { +define dso_local spir_func <8 x i32> @FUNC_40() { %ret_val = call spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 0) ; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.oword.ld.v8i32(i32 0, i32 254, i32 0) ret <8 x i32> %ret_val } -define dso_local spir_func void @FUNC_41() !sycl_explicit_simd !1 { +define dso_local spir_func void @FUNC_41() { call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 1) ; CHECK: call void @llvm.genx.sbarrier(i8 1) ret void } -define dso_local spir_func void @FUNC_42() !sycl_explicit_simd !1 { +define dso_local spir_func void @FUNC_42() { call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 0) ; CHECK: call void @llvm.genx.sbarrier(i8 0) ret void } -define dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 { +define dso_local spir_func <8 x i32> @FUNC_43() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <8 x i16> @@ -304,7 +306,7 @@ define 
dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 { ret <8 x i32> %ret_val } -define dso_local spir_func <16 x i32> @FUNC_44() !sycl_explicit_simd !1 { +define dso_local spir_func <16 x i32> @FUNC_44() { %a_1 = alloca <16 x i32> %1 = load <16 x i32>, <16 x i32>* %a_1 %a_2 = alloca <8 x i32> diff --git a/llvm/test/SYCLLowerIR/scalar_fptoui.ll b/llvm/test/SYCLLowerIR/scalar_fptoui.ll index a17270c711206..496c493eff82c 100644 --- a/llvm/test/SYCLLowerIR/scalar_fptoui.ll +++ b/llvm/test/SYCLLowerIR/scalar_fptoui.ll @@ -7,7 +7,7 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256: target triple = "spir64-unknown-unknown-sycldevice" ; Function Attrs: convergent norecurse -define dso_local spir_func i32 @foo(float %x) !sycl_explicit_simd !1 { +define dso_local spir_func i32 @foo(float %x) { %y = fptoui float %x to i32 ; check that the scalar float to unsigned int conversion is left intact ; CHECK: %y = fptoui float %x to i32 diff --git a/sycl/test/esimd/slm_load4.cpp b/sycl/test/esimd/slm_load4.cpp index a544a5c781f61..d173e8b505097 100644 --- a/sycl/test/esimd/slm_load4.cpp +++ b/sycl/test/esimd/slm_load4.cpp @@ -9,15 +9,22 @@ using namespace sycl::INTEL::gpu; using namespace cl::sycl; -void kernel() __attribute__((sycl_device)) { - simd offsets(0, 1); - simd v1(0, 1); +template +__attribute__((sycl_kernel)) void kernel_call(Func kernelFunc) { + kernelFunc(); +} + +void caller() { + kernel_call([=]() SYCL_ESIMD_KERNEL { + simd offsets(0, 1); + simd v1(0, 1); - slm_init(1024); + slm_init(1024); - auto v0 = slm_load4(offsets); + auto v0 = slm_load4(offsets); - v0 = v0 + v1; + v0 = v0 + v1; - slm_store4(v0, offsets); + slm_store4(v0, offsets); + }); }