-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir][flang][openmp] Rework wsloop reduction operations #80019
Conversation
Note: I have marked this as a draft as I have not yet changed the SCF to OpenMP lowering, meaning that those tests fail. |
@llvm/pr-subscribers-flang-fir-hlfir @llvm/pr-subscribers-mlir Author: David Truby (DavidTruby) ChangesThis patch reworks the way that wsloop reduction operations function to better match the expected semantics from the OpenMP specification, following the rework of parallel reductions. The new semantics create a private reduction variable as a block argument which should be used normally for all operations on that variable in the region; this private variable is then combined with the others into the shared variable. This way no special omp.reduction operations are needed inside the region. These block arguments follow the loop control block arguments. Patch is 361.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80019.diff 37 Files Affected:
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index fcf10b26c135b..74cd6c27b3440 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -2274,6 +2274,12 @@ static void createBodyOfOp(
return undef.getDefiningOp();
};
+ llvm::SmallVector<mlir::Type> blockArgTypes;
+ llvm::SmallVector<mlir::Location> blockArgLocs;
+ blockArgTypes.reserve(loopArgs.size() + reductionArgs.size());
+ blockArgLocs.reserve(blockArgTypes.size());
+ mlir::Block *entryBlock;
+
// If an argument for the region is provided then create the block with that
// argument. Also update the symbol's address with the mlir argument value.
// e.g. For loops the argument is the induction variable. And all further
@@ -2283,11 +2289,21 @@ static void createBodyOfOp(
for (const Fortran::semantics::Symbol *arg : loopArgs)
loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
- llvm::SmallVector<mlir::Type> tiv(loopArgs.size(), loopVarType);
- llvm::SmallVector<mlir::Location> locs(loopArgs.size(), loc);
- firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs);
- // The argument is not currently in memory, so make a temporary for the
- // argument, and store it there, then bind that location to the argument.
+ std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(),
+ loopVarType);
+ std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc);
+ }
+ if (reductionArgs.size()) {
+ llvm::copy(reductionTypes, std::back_inserter(blockArgTypes));
+ std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc);
+ }
+
+ entryBlock = firOpBuilder.createBlock(&op.getRegion(), {}, blockArgTypes,
+ blockArgLocs);
+
+ // The argument is not currently in memory, so make a temporary for the
+ // argument, and store it there, then bind that location to the argument.
+ if (loopArgs.size()) {
mlir::Operation *storeOp = nullptr;
for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) {
mlir::Value indexVal =
@@ -2296,16 +2312,12 @@ static void createBodyOfOp(
createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
}
firOpBuilder.setInsertionPointAfter(storeOp);
- } else if (reductionArgs.size()) {
- llvm::SmallVector<mlir::Location> locs(reductionArgs.size(), loc);
- auto block =
- firOpBuilder.createBlock(&op.getRegion(), {}, reductionTypes, locs);
- for (auto [arg, prv] :
- llvm::zip_equal(reductionArgs, block->getArguments())) {
- converter.bindSymbol(*arg, prv);
- }
- } else {
- firOpBuilder.createBlock(&op.getRegion());
+ }
+ // Bind the reduction arguments to their block arguments
+ for (auto [arg, prv] : llvm::zip_equal(
+ reductionArgs,
+ llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) {
+ converter.bindSymbol(*arg, prv);
}
// Mark the earliest insertion point.
@@ -3293,6 +3305,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> linearVars, linearStepVars;
llvm::SmallVector<const Fortran::semantics::Symbol *> iv;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
+ llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
mlir::omp::ClauseOrderKindAttr orderClauseOperand;
mlir::omp::ClauseScheduleKindAttr scheduleValClauseOperand;
mlir::UnitAttr nowaitClauseOperand, scheduleSimdClauseOperand;
@@ -3304,7 +3317,8 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv,
loopVarTypeSize);
cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
- cp.processReduction(loc, reductionVars, reductionDeclSymbols);
+ cp.processReduction(loc, reductionVars, reductionDeclSymbols,
+ &reductionSymbols);
cp.processTODO<Fortran::parser::OmpClause::Linear,
Fortran::parser::OmpClause::Order>(loc, ompDirective);
@@ -3347,9 +3361,14 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
auto *nestedEval = getCollapsedLoopEval(
eval, Fortran::lower::getCollapseValue(beginClauseList));
+ llvm::SmallVector<mlir::Type> reductionTypes;
+ reductionTypes.reserve(reductionVars.size());
+ llvm::transform(reductionVars, std::back_inserter(reductionTypes),
+ [](mlir::Value v) { return v.getType(); });
createBodyOfOp<mlir::omp::WsLoopOp>(wsLoopOp, converter, loc, *nestedEval,
/*genNested=*/true, &beginClauseList, iv,
- /*outer=*/false, &dsp);
+ /*outer=*/false, &dsp, reductionSymbols,
+ reductionTypes);
}
static void createSimdWsLoop(
@@ -3450,12 +3469,11 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
// 2.9.3.1 SIMD construct
createSimdLoop(converter, eval, ompDirective, loopOpClauseList,
currentLocation);
+ genOpenMPReduction(converter, loopOpClauseList);
} else {
createWsLoop(converter, eval, ompDirective, loopOpClauseList, endClauseList,
currentLocation);
}
-
- genOpenMPReduction(converter, loopOpClauseList);
}
static void
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index 6efa4d0a09586..d6efdcc98b273 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -701,10 +701,17 @@ func.func @_QPsb() {
// CHECK-SAME: %[[ARRAY_REF:.*]]: !llvm.ptr
// CHECK: %[[RED_ACCUMULATOR:.*]] = llvm.alloca %2 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
// CHECK: omp.parallel {
-// CHECK: omp.wsloop reduction(@[[EQV_REDUCTION]] -> %[[RED_ACCUMULATOR]] : !llvm.ptr) for
+// CHECK: omp.wsloop reduction(@[[EQV_REDUCTION]] %[[RED_ACCUMULATOR]] -> %[[PRV:.+]] : !llvm.ptr) for
// CHECK: %[[ARRAY_ELEM_REF:.*]] = llvm.getelementptr %[[ARRAY_REF]][0, %{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr
// CHECK: %[[ARRAY_ELEM:.*]] = llvm.load %[[ARRAY_ELEM_REF]] : !llvm.ptr -> i32
-// CHECK: omp.reduction %[[ARRAY_ELEM]], %[[RED_ACCUMULATOR]] : i32, !llvm.ptr
+// CHECK: %[[LPRV:.+]] = llvm.load %[[PRV]] : !llvm.ptr -> i32
+// CHECK: %[[ZERO_1:.*]] = llvm.mlir.constant(0 : i64) : i32
+// CHECK: %[[ARGVAL_1:.*]] = llvm.icmp "ne" %[[LPRV]], %[[ZERO_1]] : i32
+// CHECK: %[[ZERO_2:.*]] = llvm.mlir.constant(0 : i64) : i32
+// CHECK: %[[ARGVAL_2:.*]] = llvm.icmp "ne" %[[ARRAY_ELEM]], %[[ZERO_2]] : i32
+// CHECK: %[[RES:.*]] = llvm.icmp "eq" %[[ARGVAL_2]], %[[ARGVAL_1]] : i1
+// CHECK: %[[RES_EXT:.*]] = llvm.zext %[[RES]] : i1 to i32
+// CHECK: llvm.store %[[RES_EXT]], %[[PRV]] : i32, !llvm.ptr
// CHECK: omp.yield
// CHECK: omp.terminator
// CHECK: llvm.return
@@ -733,7 +740,7 @@ func.func @_QPsimple_reduction(%arg0: !fir.ref<!fir.array<100x!fir.logical<4>>>
%c1_i32 = arith.constant 1 : i32
%c100_i32 = arith.constant 100 : i32
%c1_i32_0 = arith.constant 1 : i32
- omp.wsloop reduction(@eqv_reduction -> %1 : !fir.ref<!fir.logical<4>>) for (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32_0) {
+ omp.wsloop reduction(@eqv_reduction %1 -> %prv : !fir.ref<!fir.logical<4>>) for (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32_0) {
fir.store %arg1 to %3 : !fir.ref<i32>
%4 = fir.load %3 : !fir.ref<i32>
%5 = fir.convert %4 : (i32) -> i64
@@ -741,7 +748,12 @@ func.func @_QPsimple_reduction(%arg0: !fir.ref<!fir.array<100x!fir.logical<4>>>
%6 = arith.subi %5, %c1_i64 : i64
%7 = fir.coordinate_of %arg0, %6 : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
%8 = fir.load %7 : !fir.ref<!fir.logical<4>>
- omp.reduction %8, %1 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+ %lprv = fir.load %prv : !fir.ref<!fir.logical<4>>
+ %lprv1 = fir.convert %lprv : (!fir.logical<4>) -> i1
+ %9 = fir.convert %8 : (!fir.logical<4>) -> i1
+ %10 = arith.cmpi eq, %9, %lprv1 : i1
+ %11 = fir.convert %10 : (i1) -> !fir.logical<4>
+ fir.store %11 to %prv : !fir.ref<!fir.logical<4>>
omp.yield
}
omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
index 62d9af31588e9..5664529416fe8 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
@@ -1,66 +1,79 @@
! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_F64_NAME:.*]] : f64 init {
-!CHECK: ^bb0(%{{.*}}: f64):
-!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
-!CHECK: omp.yield(%[[C0_1]] : f64)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f64, %[[ARG1:.*]]: f64):
-!CHECK: %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f64
-!CHECK: omp.yield(%[[RES]] : f64)
-!CHECK: }
+! The script is designed to make adding checks to
+! a test case fast, it is *not* designed to be authoritative
+! about what constitutes a good test! The CHECK should be
+! minimized and named to reflect the test intent.
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_I64_NAME:.*]] : i64 init {
-!CHECK: ^bb0(%{{.*}}: i64):
-!CHECK: %[[C0_1:.*]] = arith.constant 0 : i64
-!CHECK: omp.yield(%[[C0_1]] : i64)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64):
-!CHECK: %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i64
-!CHECK: omp.yield(%[[RES]] : i64)
-!CHECK: }
+! CHECK-LABEL: omp.reduction.declare @add_reduction_f_64 : f64 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f64):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK: omp.yield(%[[VAL_1]] : f64)
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
-!CHECK: ^bb0(%{{.*}}: f32):
-!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: omp.yield(%[[C0_1]] : f32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
-!CHECK: %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f32
-!CHECK: omp.yield(%[[RES]] : f32)
-!CHECK: }
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f64, %[[VAL_1:.*]]: f64):
+! CHECK: %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f64
+! CHECK: omp.yield(%[[VAL_2]] : f64)
+! CHECK: }
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
-!CHECK: ^bb0(%{{.*}}: i32):
-!CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
-!CHECK: omp.yield(%[[C0_1]] : i32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
-!CHECK: %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
-!CHECK: omp.yield(%[[RES]] : i32)
-!CHECK: }
+! CHECK-LABEL: omp.reduction.declare @add_reduction_i_64 : i64 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i64):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64
+! CHECK: omp.yield(%[[VAL_1]] : i64)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i64, %[[VAL_1:.*]]: i64):
+! CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
+! CHECK: omp.yield(%[[VAL_2]] : i64)
+! CHECK: }
+
+! CHECK-LABEL: omp.reduction.declare @add_reduction_f_32 : f32 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f32):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: omp.yield(%[[VAL_1]] : f32)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32):
+! CHECK: %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f32
+! CHECK: omp.yield(%[[VAL_2]] : f32)
+! CHECK: }
+
+! CHECK-LABEL: omp.reduction.declare @add_reduction_i_32 : i32 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i32):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
+! CHECK: omp.yield(%[[VAL_1]] : i32)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
+! CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32
+! CHECK: omp.yield(%[[VAL_2]] : i32)
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QPsimple_int_reduction() {
+! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop reduction(@add_reduction_i_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) for (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
-!CHECK-LABEL: func.func @_QPsimple_int_reduction
-!CHECK: %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
-!CHECK: %[[C0_2:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
-!CHECK: omp.parallel
-!CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK: %[[C100:.*]] = arith.constant 100 : i32
-!CHECK: %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK: fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: return
subroutine simple_int_reduction
integer :: x
x = 0
@@ -73,23 +86,31 @@ subroutine simple_int_reduction
!$omp end parallel
end subroutine
-!CHECK-LABEL: func.func @_QPsimple_real_reduction
-!CHECK: %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
-!CHECK: %[[C0_2:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<f32>
-!CHECK: omp.parallel
-!CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK: %[[C100:.*]] = arith.constant 100 : i32
-!CHECK: %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>) for (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK: fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
-!CHECK: omp.reduction %[[I_PVT_VAL_f32]], %[[XREF]] : f32, !fir.ref<f32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: return
+
+! CHECK-LABEL: func.func @_QPsimple_real_reduction() {
+! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop reduction(@add_reduction_f_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) for (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
+! CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
+! CHECK: fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
+
subroutine simple_real_reduction
real :: x
x = 0.0
@@ -102,22 +123,29 @@ subroutine simple_real_reduction
!$omp end parallel
end subroutine
-!CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order
-!CHECK: %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
-!CHECK: %[[C0_2:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
-!CHECK: omp.parallel
-!CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK: %[[C100:.*]] = arith.constant 100 : i32
-!CHECK: %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK: fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: return
+! CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop reduction(@add_reduction_i_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) for (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK: fir.store %[[VAL_11]] to ...
[truncated]
|
@llvm/pr-subscribers-mlir-llvm Author: David Truby (DavidTruby) ChangesThis patch reworks the way that wsloop reduction operations function to better match the expected semantics from the OpenMP specification, following the rework of parallel reductions. The new semantics create a private reduction variable as a block argument which should be used normally for all operations on that variable in the region; this private variable is then combined with the others into the shared variable. This way no special omp.reduction operations are needed inside the region. These block arguments follow the loop control block arguments. Patch is 361.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80019.diff 37 Files Affected:
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index fcf10b26c135b..74cd6c27b3440 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -2274,6 +2274,12 @@ static void createBodyOfOp(
return undef.getDefiningOp();
};
+ llvm::SmallVector<mlir::Type> blockArgTypes;
+ llvm::SmallVector<mlir::Location> blockArgLocs;
+ blockArgTypes.reserve(loopArgs.size() + reductionArgs.size());
+ blockArgLocs.reserve(blockArgTypes.size());
+ mlir::Block *entryBlock;
+
// If an argument for the region is provided then create the block with that
// argument. Also update the symbol's address with the mlir argument value.
// e.g. For loops the argument is the induction variable. And all further
@@ -2283,11 +2289,21 @@ static void createBodyOfOp(
for (const Fortran::semantics::Symbol *arg : loopArgs)
loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
- llvm::SmallVector<mlir::Type> tiv(loopArgs.size(), loopVarType);
- llvm::SmallVector<mlir::Location> locs(loopArgs.size(), loc);
- firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs);
- // The argument is not currently in memory, so make a temporary for the
- // argument, and store it there, then bind that location to the argument.
+ std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(),
+ loopVarType);
+ std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc);
+ }
+ if (reductionArgs.size()) {
+ llvm::copy(reductionTypes, std::back_inserter(blockArgTypes));
+ std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc);
+ }
+
+ entryBlock = firOpBuilder.createBlock(&op.getRegion(), {}, blockArgTypes,
+ blockArgLocs);
+
+ // The argument is not currently in memory, so make a temporary for the
+ // argument, and store it there, then bind that location to the argument.
+ if (loopArgs.size()) {
mlir::Operation *storeOp = nullptr;
for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) {
mlir::Value indexVal =
@@ -2296,16 +2312,12 @@ static void createBodyOfOp(
createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
}
firOpBuilder.setInsertionPointAfter(storeOp);
- } else if (reductionArgs.size()) {
- llvm::SmallVector<mlir::Location> locs(reductionArgs.size(), loc);
- auto block =
- firOpBuilder.createBlock(&op.getRegion(), {}, reductionTypes, locs);
- for (auto [arg, prv] :
- llvm::zip_equal(reductionArgs, block->getArguments())) {
- converter.bindSymbol(*arg, prv);
- }
- } else {
- firOpBuilder.createBlock(&op.getRegion());
+ }
+ // Bind the reduction arguments to their block arguments
+ for (auto [arg, prv] : llvm::zip_equal(
+ reductionArgs,
+ llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) {
+ converter.bindSymbol(*arg, prv);
}
// Mark the earliest insertion point.
@@ -3293,6 +3305,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> linearVars, linearStepVars;
llvm::SmallVector<const Fortran::semantics::Symbol *> iv;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
+ llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
mlir::omp::ClauseOrderKindAttr orderClauseOperand;
mlir::omp::ClauseScheduleKindAttr scheduleValClauseOperand;
mlir::UnitAttr nowaitClauseOperand, scheduleSimdClauseOperand;
@@ -3304,7 +3317,8 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv,
loopVarTypeSize);
cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
- cp.processReduction(loc, reductionVars, reductionDeclSymbols);
+ cp.processReduction(loc, reductionVars, reductionDeclSymbols,
+ &reductionSymbols);
cp.processTODO<Fortran::parser::OmpClause::Linear,
Fortran::parser::OmpClause::Order>(loc, ompDirective);
@@ -3347,9 +3361,14 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
auto *nestedEval = getCollapsedLoopEval(
eval, Fortran::lower::getCollapseValue(beginClauseList));
+ llvm::SmallVector<mlir::Type> reductionTypes;
+ reductionTypes.reserve(reductionVars.size());
+ llvm::transform(reductionVars, std::back_inserter(reductionTypes),
+ [](mlir::Value v) { return v.getType(); });
createBodyOfOp<mlir::omp::WsLoopOp>(wsLoopOp, converter, loc, *nestedEval,
/*genNested=*/true, &beginClauseList, iv,
- /*outer=*/false, &dsp);
+ /*outer=*/false, &dsp, reductionSymbols,
+ reductionTypes);
}
static void createSimdWsLoop(
@@ -3450,12 +3469,11 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
// 2.9.3.1 SIMD construct
createSimdLoop(converter, eval, ompDirective, loopOpClauseList,
currentLocation);
+ genOpenMPReduction(converter, loopOpClauseList);
} else {
createWsLoop(converter, eval, ompDirective, loopOpClauseList, endClauseList,
currentLocation);
}
-
- genOpenMPReduction(converter, loopOpClauseList);
}
static void
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index 6efa4d0a09586..d6efdcc98b273 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -701,10 +701,17 @@ func.func @_QPsb() {
// CHECK-SAME: %[[ARRAY_REF:.*]]: !llvm.ptr
// CHECK: %[[RED_ACCUMULATOR:.*]] = llvm.alloca %2 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
// CHECK: omp.parallel {
-// CHECK: omp.wsloop reduction(@[[EQV_REDUCTION]] -> %[[RED_ACCUMULATOR]] : !llvm.ptr) for
+// CHECK: omp.wsloop reduction(@[[EQV_REDUCTION]] %[[RED_ACCUMULATOR]] -> %[[PRV:.+]] : !llvm.ptr) for
// CHECK: %[[ARRAY_ELEM_REF:.*]] = llvm.getelementptr %[[ARRAY_REF]][0, %{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr
// CHECK: %[[ARRAY_ELEM:.*]] = llvm.load %[[ARRAY_ELEM_REF]] : !llvm.ptr -> i32
-// CHECK: omp.reduction %[[ARRAY_ELEM]], %[[RED_ACCUMULATOR]] : i32, !llvm.ptr
+// CHECK: %[[LPRV:.+]] = llvm.load %[[PRV]] : !llvm.ptr -> i32
+// CHECK: %[[ZERO_1:.*]] = llvm.mlir.constant(0 : i64) : i32
+// CHECK: %[[ARGVAL_1:.*]] = llvm.icmp "ne" %[[LPRV]], %[[ZERO_1]] : i32
+// CHECK: %[[ZERO_2:.*]] = llvm.mlir.constant(0 : i64) : i32
+// CHECK: %[[ARGVAL_2:.*]] = llvm.icmp "ne" %[[ARRAY_ELEM]], %[[ZERO_2]] : i32
+// CHECK: %[[RES:.*]] = llvm.icmp "eq" %[[ARGVAL_2]], %[[ARGVAL_1]] : i1
+// CHECK: %[[RES_EXT:.*]] = llvm.zext %[[RES]] : i1 to i32
+// CHECK: llvm.store %[[RES_EXT]], %[[PRV]] : i32, !llvm.ptr
// CHECK: omp.yield
// CHECK: omp.terminator
// CHECK: llvm.return
@@ -733,7 +740,7 @@ func.func @_QPsimple_reduction(%arg0: !fir.ref<!fir.array<100x!fir.logical<4>>>
%c1_i32 = arith.constant 1 : i32
%c100_i32 = arith.constant 100 : i32
%c1_i32_0 = arith.constant 1 : i32
- omp.wsloop reduction(@eqv_reduction -> %1 : !fir.ref<!fir.logical<4>>) for (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32_0) {
+ omp.wsloop reduction(@eqv_reduction %1 -> %prv : !fir.ref<!fir.logical<4>>) for (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32_0) {
fir.store %arg1 to %3 : !fir.ref<i32>
%4 = fir.load %3 : !fir.ref<i32>
%5 = fir.convert %4 : (i32) -> i64
@@ -741,7 +748,12 @@ func.func @_QPsimple_reduction(%arg0: !fir.ref<!fir.array<100x!fir.logical<4>>>
%6 = arith.subi %5, %c1_i64 : i64
%7 = fir.coordinate_of %arg0, %6 : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
%8 = fir.load %7 : !fir.ref<!fir.logical<4>>
- omp.reduction %8, %1 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+ %lprv = fir.load %prv : !fir.ref<!fir.logical<4>>
+ %lprv1 = fir.convert %lprv : (!fir.logical<4>) -> i1
+ %9 = fir.convert %8 : (!fir.logical<4>) -> i1
+ %10 = arith.cmpi eq, %9, %lprv1 : i1
+ %11 = fir.convert %10 : (i1) -> !fir.logical<4>
+ fir.store %11 to %prv : !fir.ref<!fir.logical<4>>
omp.yield
}
omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
index 62d9af31588e9..5664529416fe8 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
@@ -1,66 +1,79 @@
! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_F64_NAME:.*]] : f64 init {
-!CHECK: ^bb0(%{{.*}}: f64):
-!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
-!CHECK: omp.yield(%[[C0_1]] : f64)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f64, %[[ARG1:.*]]: f64):
-!CHECK: %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f64
-!CHECK: omp.yield(%[[RES]] : f64)
-!CHECK: }
+! The script is designed to make adding checks to
+! a test case fast, it is *not* designed to be authoritative
+! about what constitutes a good test! The CHECK should be
+! minimized and named to reflect the test intent.
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_I64_NAME:.*]] : i64 init {
-!CHECK: ^bb0(%{{.*}}: i64):
-!CHECK: %[[C0_1:.*]] = arith.constant 0 : i64
-!CHECK: omp.yield(%[[C0_1]] : i64)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64):
-!CHECK: %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i64
-!CHECK: omp.yield(%[[RES]] : i64)
-!CHECK: }
+! CHECK-LABEL: omp.reduction.declare @add_reduction_f_64 : f64 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f64):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK: omp.yield(%[[VAL_1]] : f64)
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
-!CHECK: ^bb0(%{{.*}}: f32):
-!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: omp.yield(%[[C0_1]] : f32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
-!CHECK: %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f32
-!CHECK: omp.yield(%[[RES]] : f32)
-!CHECK: }
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f64, %[[VAL_1:.*]]: f64):
+! CHECK: %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f64
+! CHECK: omp.yield(%[[VAL_2]] : f64)
+! CHECK: }
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
-!CHECK: ^bb0(%{{.*}}: i32):
-!CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
-!CHECK: omp.yield(%[[C0_1]] : i32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
-!CHECK: %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
-!CHECK: omp.yield(%[[RES]] : i32)
-!CHECK: }
+! CHECK-LABEL: omp.reduction.declare @add_reduction_i_64 : i64 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i64):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64
+! CHECK: omp.yield(%[[VAL_1]] : i64)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i64, %[[VAL_1:.*]]: i64):
+! CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
+! CHECK: omp.yield(%[[VAL_2]] : i64)
+! CHECK: }
+
+! CHECK-LABEL: omp.reduction.declare @add_reduction_f_32 : f32 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f32):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: omp.yield(%[[VAL_1]] : f32)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32):
+! CHECK: %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f32
+! CHECK: omp.yield(%[[VAL_2]] : f32)
+! CHECK: }
+
+! CHECK-LABEL: omp.reduction.declare @add_reduction_i_32 : i32 init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i32):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
+! CHECK: omp.yield(%[[VAL_1]] : i32)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
+! CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32
+! CHECK: omp.yield(%[[VAL_2]] : i32)
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QPsimple_int_reduction() {
+! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop reduction(@add_reduction_i_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) for (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
-!CHECK-LABEL: func.func @_QPsimple_int_reduction
-!CHECK: %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
-!CHECK: %[[C0_2:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
-!CHECK: omp.parallel
-!CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK: %[[C100:.*]] = arith.constant 100 : i32
-!CHECK: %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK: fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: return
subroutine simple_int_reduction
integer :: x
x = 0
@@ -73,23 +86,31 @@ subroutine simple_int_reduction
!$omp end parallel
end subroutine
-!CHECK-LABEL: func.func @_QPsimple_real_reduction
-!CHECK: %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
-!CHECK: %[[C0_2:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<f32>
-!CHECK: omp.parallel
-!CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK: %[[C100:.*]] = arith.constant 100 : i32
-!CHECK: %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>) for (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK: fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
-!CHECK: omp.reduction %[[I_PVT_VAL_f32]], %[[XREF]] : f32, !fir.ref<f32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: return
+
+! CHECK-LABEL: func.func @_QPsimple_real_reduction() {
+! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop reduction(@add_reduction_f_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) for (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
+! CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
+! CHECK: fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
+
subroutine simple_real_reduction
real :: x
x = 0.0
@@ -102,22 +123,29 @@ subroutine simple_real_reduction
!$omp end parallel
end subroutine
-!CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order
-!CHECK: %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
-!CHECK: %[[C0_2:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
-!CHECK: omp.parallel
-!CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK: %[[C100:.*]] = arith.constant 100 : i32
-!CHECK: %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK: fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK: omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: return
+! CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop reduction(@add_reduction_i_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) for (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK: fir.store %[[VAL_11]] to ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the update!
1ed4bc3
to
c84f525
Compare
This patch reworks the way that wsloop reduction operations function to better match the expected semantics from the OpenMP specification, following the rework of parallel reductions. The new semantics create a private reduction variable as a block argument which should be used normally for all operations on that variable in the region; this private variable is then combined with the others into the shared variable. This way no special omp.reduction operations are needed inside the region. These block arguments follow the loop control block arguments.
6cbd47b
to
2d2f65c
Compare
You can test this locally with the following command:git-clang-format --diff 89c1bf1230e011f2f0e43554c278205fa1819de5 d7267919b7350cad5e4fdf3cb03aae366aefcc46 -- flang/lib/Lower/OpenMP.cpp mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp View the diff from clang-format here.diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index f7f80ca9c6..24f91765cb 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -3353,11 +3353,12 @@ genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
}
static llvm::SmallVector<const Fortran::semantics::Symbol *>
-genLoopAndReductionVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
- mlir::Location &loc,
- const llvm::SmallVector<const Fortran::semantics::Symbol *> &loopArgs,
- const llvm::SmallVector<const Fortran::semantics::Symbol *> &reductionArgs,
- llvm::SmallVector<mlir::Type> &reductionTypes) {
+genLoopAndReductionVars(
+ mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
+ mlir::Location &loc,
+ const llvm::SmallVector<const Fortran::semantics::Symbol *> &loopArgs,
+ const llvm::SmallVector<const Fortran::semantics::Symbol *> &reductionArgs,
+ llvm::SmallVector<mlir::Type> &reductionTypes) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
llvm::SmallVector<mlir::Type> blockArgTypes;
@@ -3372,7 +3373,7 @@ genLoopAndReductionVars(mlir::Operation *op, Fortran::lower::AbstractConverter &
loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(),
- loopVarType);
+ loopVarType);
std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc);
}
if (reductionArgs.size()) {
@@ -3386,12 +3387,12 @@ genLoopAndReductionVars(mlir::Operation *op, Fortran::lower::AbstractConverter &
if (loopArgs.size()) {
mlir::Operation *storeOp = nullptr;
for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) {
- mlir::Value indexVal =
+ mlir::Value indexVal =
fir::getBase(op->getRegion(0).front().getArgument(argIndex));
storeOp =
createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
}
- firOpBuilder.setInsertionPointAfter(storeOp);
+ firOpBuilder.setInsertionPointAfter(storeOp);
}
// Bind the reduction arguments to their block arguments
for (auto [arg, prv] : llvm::zip_equal(
@@ -3543,14 +3544,15 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
[](mlir::Value v) { return v.getType(); });
auto ivCallback = [&](mlir::Operation *op) {
- return genLoopAndReductionVars(op, converter, loc, iv, reductionSymbols, reductionTypes);
+ return genLoopAndReductionVars(op, converter, loc, iv, reductionSymbols,
+ reductionTypes);
};
createBodyOfOp<mlir::omp::WsLoopOp>(
wsLoopOp, OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval)
.setClauses(&beginClauseList)
.setDataSharingProcessor(&dsp)
- .setReductions(&reductionSymbols, &reductionTypes)
+ .setReductions(&reductionSymbols, &reductionTypes)
.setGenRegionEntryCb(ivCallback));
}
|
2d2f65c
to
d726791
Compare
Misssed the code formatting change here. Will update in a separate patch. |
Fixed in 2772692 |
The old code was replaced by #80019.
This operation did not model the behaviour of reductions in the openmp standard. It has since been replaced by block arguments on the outer operation. See llvm#79308 and llvm#80019
This operation did not model the behaviour of reductions in the openmp standard. It has since been replaced by block arguments on the outer operation. See llvm#79308 and llvm#80019
This patch reworks the way that wsloop reduction operations function to better match the expected semantics from the OpenMP specification, following the rework of parallel reductions.
The new semantics create a private reduction variable as a block argument which should be used normally for all operations on that variable in the region; this private variable is then combined with the others into the shared variable. This way no special omp.reduction operations are needed inside the region. These block arguments follow the loop control block arguments.