[NFC][MLIR][OpenMP] Add test for lowering omp target parallel #70795
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-llvm
Author: Dominik Adamski (DominikAdamski)
Changes: Added MLIR test which checks if MLIR sample code with omp target parallel construct is correctly lowered to LLVM IR for the device. This PR depends on: #67000
Full diff: https://github.com/llvm/llvm-project/pull/70795.diff
1 file affected:
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
new file mode 100644
index 000000000000000..8d321dab33ccdf6
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// The aim of the test is to check the LLVM IR codegen for the device
+// for omp target parallel construct
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<3>, dense<32> : vector<4xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<1>, dense<64> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<2>, dense<32> : vector<4xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<8>, dense<128> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<6>, dense<32> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<7>, dense<[160, 256, 256, 32]> : vector<4xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<4>, dense<64> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<5>, dense<32> : vector<4xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 32 : i32>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<debug_kind = 1, openmp_device_version = 11>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target = #omp.target<target_cpu = "gfx90a", target_features = "">, omp.version = #omp.version<version = 11>} {
+ llvm.func @_QQmain_omp_outline_1(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, omp.outline_parent_name = "_QQmain"} {
+ %0 = omp.map_info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = "d"}
+ omp.target map_entries(%0 : !llvm.ptr) {
+ omp.parallel {
+ %1 = llvm.mlir.constant(1 : i32) : i32
+ llvm.store %1, %arg0 : i32, !llvm.ptr
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// CHECK: define weak_odr protected amdgpu_kernel void [[FUNC0:@.*]](
+// CHECK-SAME: ptr [[TMP0:%.*]]) {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP1:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// CHECK-NEXT: [[STRUCTARG:%.*]] = alloca { ptr }, align 8, addrspace(5)
+// CHECK-NEXT: [[STRUCTARG_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STRUCTARG]] to ptr
+// CHECK-NEXT: [[TMP3:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[TMP3]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) [[KERNEL_ENV:@.*]] to ptr))
+// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP5]], -1
+// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK: user_code.entry:
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK-NEXT: br label [[OMP_TARGET:%.*]]
+// CHECK: omp.target:
+// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
+// CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+// CHECK: omp_parallel:
+// CHECK-NEXT: [[GEP_:%.*]] = getelementptr { ptr }, ptr addrspace(5) [[STRUCTARG]], i32 0, i32 0
+// CHECK-NEXT: store ptr [[TMP6]], ptr addrspace(5) [[GEP_]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[STRUCTARG_ASCAST]], ptr [[TMP7]], align 8
+// CHECK-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr [[FUNC1:@.*]], ptr null, ptr [[TMP2]], i64 1)
+// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+// CHECK: omp.par.outlined.exit:
+// CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+// CHECK: omp.par.exit.split:
+// CHECK-NEXT: br label [[OMP_REGION_CONT:%.*]]
+// CHECK: omp.region.cont:
+// CHECK-NEXT: call void @__kmpc_target_deinit()
+// CHECK-NEXT: ret void
+// CHECK: worker.exit:
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[FUNC1]](
+// CHECK-SAME: ptr noalias noundef [[TID_ADDR_ASCAST:%.*]], ptr noalias noundef [[ZERO_ADDR_ASCAST:%.*]], ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: omp.par.entry:
+// CHECK-NEXT: [[GEP_:%.*]] = getelementptr { ptr }, ptr [[TMP0]], i32 0, i32 0
+// CHECK-NEXT: [[LOADGEP_:%.*]] = load ptr, ptr [[GEP_]], align 8
+// CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TID_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TID_ADDR_LOCAL]], align 4
+// CHECK-NEXT: [[TID:%.*]] = load i32, ptr addrspace(5) [[TID_ADDR_LOCAL]], align 4
+// CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+// CHECK: omp.par.region:
+// CHECK-NEXT: br label [[OMP_PAR_REGION2:%.*]]
+// CHECK: omp.par.region2:
+// CHECK-NEXT: store i32 1, ptr [[LOADGEP_]], align 4
+// CHECK-NEXT: br label [[OMP_REGION_CONT1:%.*]]
+// CHECK: omp.region.cont1:
+// CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+// CHECK: omp.par.pre_finalize:
+// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+// CHECK: omp.par.outlined.exit.exitStub:
+// CHECK-NEXT: ret void
+
I did not see any changes in the OpenMPTranslation code in MLIR in #67000. Is the presence of …
Yes, I wanted to add the test which generates similar code for …
Also, test minimally here. Only test the call to @__kmpc_parallel_51 with the correct outlined function, and the surrounding target_init and target_exit.
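For illustration, a minimal set of checks along those lines might look roughly like this (a sketch only, reusing the FileCheck capture style of the full test above; the exact operands of the __kmpc_parallel_51 call would have to be taken from the actual generated IR):

// Sketch of minimal checks: only the target init/deinit calls, the runtime
// call that launches the parallel region, and the outlined function it names.
// CHECK: call i32 @__kmpc_target_init(
// CHECK: call void @__kmpc_parallel_51({{.*}}, ptr [[FUNC1:@.*]], ptr null, ptr {{.*}}, i64 1)
// CHECK: call void @__kmpc_target_deinit()
// CHECK: define internal void [[FUNC1]](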
I removed the unnecessary checks. The remaining checks validate target initialization and the passing of arguments to kmpc_parallel_51.
Thanks. LG with minimizing attributes (if possible).
Please minimize the attributes also if possible.
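For illustration, a reduced module header might look roughly like the following (a sketch only, keeping just the GPU-related attributes from the original; whether the remaining dlti entries and fir attributes can really be dropped would need to be verified by re-running mlir-translate on the test):

// Minimized module attributes, keeping only the entries the device lowering
// plausibly depends on (AMDGPU alloca address space, target triple, OpenMP device flags).
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>,
                   llvm.target_triple = "amdgcn-amd-amdhsa",
                   omp.is_gpu = true, omp.is_target_device = true} {
  // ... llvm.func @_QQmain_omp_outline_1 as in the test above ...
}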
Force-pushed from 1a7410d to 8832133: Added MLIR test which checks if MLIR sample code with omp target parallel construct is correctly lowered to LLVM IR for the device.