Skip to content

Commit 0f882d2

Browse files
committed
[LoopUnroll] Introduce parallel accumulators when unrolling reductions with vector instructions.
1 parent bb14b83 commit 0f882d2

File tree

4 files changed

+281
-12
lines changed

llvm/lib/Analysis/IVDescriptors.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,10 +270,12 @@ bool RecurrenceDescriptor::AddReductionVar(
270270
// resulting from the type promotion performed by InstCombine. Vector
271271
// operations are not limited to the legal integer widths, so we may be able
272272
// to evaluate the reduction in the narrower width.
273-
if (RecurrenceType->isFloatingPointTy()) {
273+
// Check the scalar type to handle both scalar and vector FP/integer types.
274+
Type *ScalarTy = RecurrenceType->getScalarType();
275+
if (ScalarTy->isFloatingPointTy()) {
274276
if (!isFloatingPointRecurrenceKind(Kind))
275277
return false;
276-
} else if (RecurrenceType->isIntegerTy()) {
278+
} else if (ScalarTy->isIntegerTy()) {
277279
if (!isIntegerRecurrenceKind(Kind))
278280
return false;
279281
if (!isMinMaxRecurrenceKind(Kind))

llvm/lib/Transforms/Utils/LoopUnroll.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,6 +1094,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
10941094
if (!RdxResult) {
10951095
RdxResult = PartialReductions.front();
10961096
IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
1097+
Builder.setFastMathFlags(Reductions.begin()->second.getFastMathFlags());
10971098
RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
10981099
for (Instruction *RdxPart : drop_begin(PartialReductions)) {
10991100
RdxResult = Builder.CreateBinOp(
@@ -1256,14 +1257,19 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
12561257
return std::nullopt;
12571258
RecurKind RK = RdxDesc.getRecurrenceKind();
12581259
// Skip unsupported reductions.
1259-
// TODO: Handle additional reductions, including FP and min-max
1260-
// reductions.
1261-
if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
1260+
// TODO: Handle additional reductions, including min-max reductions.
1261+
if (!(RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
1262+
RecurrenceDescriptor::isFloatingPointRecurrenceKind(RK)) ||
12621263
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
12631264
RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
12641265
RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
12651266
return std::nullopt;
12661267

1268+
if (RecurrenceDescriptor::isFloatingPointRecurrenceKind(RK)) {
1269+
if (!RdxDesc.getFastMathFlags().allowReassoc())
1270+
return std::nullopt;
1271+
}
1272+
12671273
if (RdxDesc.IntermediateStore)
12681274
return std::nullopt;
12691275

llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -319,27 +319,33 @@ define float @test_fadd_with_ressaoc(ptr %src, i64 %n, float %start) {
319319
; CHECK-NEXT: br label %[[LOOP:.*]]
320320
; CHECK: [[LOOP]]:
321321
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
322-
; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
322+
; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ -0.000000e+00, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
323+
; CHECK-NEXT: [[RDX_2:%.*]] = phi float [ -0.000000e+00, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
324+
; CHECK-NEXT: [[RDX_3:%.*]] = phi float [ -0.000000e+00, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
325+
; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
323326
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
324327
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV]]
325328
; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 1
326-
; CHECK-NEXT: [[RDX_NEXT:%.*]] = fadd float [[RDX]], [[L]]
329+
; CHECK-NEXT: [[RDX_NEXT]] = fadd reassoc float [[RDX]], [[L]]
327330
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
328331
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT]]
329332
; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 1
330-
; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = fadd float [[RDX_NEXT]], [[L_1]]
333+
; CHECK-NEXT: [[RDX_NEXT_1]] = fadd reassoc float [[RDX_1]], [[L_1]]
331334
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
332335
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_1]]
333336
; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 1
334-
; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = fadd float [[RDX_NEXT_1]], [[L_2]]
337+
; CHECK-NEXT: [[RDX_NEXT_2]] = fadd reassoc float [[RDX_2]], [[L_2]]
335338
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
336339
; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_2]]
337340
; CHECK-NEXT: [[L_24:%.*]] = load float, ptr [[GEP_SRC_24]], align 1
338-
; CHECK-NEXT: [[RDX_NEXT_3]] = fadd float [[RDX_NEXT_2]], [[L_24]]
341+
; CHECK-NEXT: [[RDX_NEXT_3]] = fadd reassoc float [[RDX_3]], [[L_24]]
339342
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
340343
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
341344
; CHECK: [[EXIT]]:
342-
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ]
345+
; CHECK-NEXT: [[RDX_NEXT_LCSSA1:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ]
346+
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc float [[RDX_NEXT_1]], [[RDX_NEXT]]
347+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd reassoc float [[RDX_NEXT_2]], [[BIN_RDX]]
348+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = fadd reassoc float [[RDX_NEXT_3]], [[BIN_RDX1]]
343349
; CHECK-NEXT: ret float [[RDX_NEXT_LCSSA]]
344350
;
345351
entry:
@@ -351,13 +357,14 @@ loop:
351357
%iv.next = add i64 %iv, 1
352358
%gep.src = getelementptr float, ptr %src, i64 %iv
353359
%l = load float, ptr %gep.src, align 1
354-
%rdx.next = fadd float %rdx, %l
360+
%rdx.next = fadd reassoc float %rdx, %l
355361
%ec = icmp ne i64 %iv.next, 1000
356362
br i1 %ec, label %loop, label %exit
357363

358364
exit:
359365
ret float %rdx.next
360366
}
367+
361368
define i32 @test_smin(ptr %src, i64 %n) {
362369
; CHECK-LABEL: define i32 @test_smin(
363370
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
@@ -623,3 +630,56 @@ loop:
623630
exit:
624631
ret i32 %rdx.next
625632
}
633+
634+
define <4 x float> @test_vector_fadd(ptr %p, i64 %n, <4 x float> %start) {
635+
; CHECK-LABEL: define <4 x float> @test_vector_fadd(
636+
; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]], <4 x float> [[START:%.*]]) {
637+
; CHECK-NEXT: [[ENTRY:.*]]:
638+
; CHECK-NEXT: br label %[[LOOP:.*]]
639+
; CHECK: [[LOOP]]:
640+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
641+
; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
642+
; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
643+
; CHECK-NEXT: [[RDX_3:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[ENTRY]] ], [ [[RDX_NEXT_24:%.*]], %[[LOOP]] ]
644+
; CHECK-NEXT: [[RDX:%.*]] = phi <4 x float> [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
645+
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
646+
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[P]], i64 [[IV]]
647+
; CHECK-NEXT: [[L:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 16
648+
; CHECK-NEXT: [[RDX_NEXT]] = fadd reassoc <4 x float> [[RDX]], [[L]]
649+
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
650+
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[P]], i64 [[IV_NEXT]]
651+
; CHECK-NEXT: [[L_1:%.*]] = load <4 x float>, ptr [[GEP_SRC_1]], align 16
652+
; CHECK-NEXT: [[RDX_NEXT_3]] = fadd reassoc <4 x float> [[RDX_1]], [[L_1]]
653+
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
654+
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[P]], i64 [[IV_NEXT_1]]
655+
; CHECK-NEXT: [[L_2:%.*]] = load <4 x float>, ptr [[GEP_SRC_2]], align 16
656+
; CHECK-NEXT: [[RDX_NEXT_2]] = fadd reassoc <4 x float> [[RDX_NEXT_1]], [[L_2]]
657+
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
658+
; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[P]], i64 [[IV_NEXT_2]]
659+
; CHECK-NEXT: [[L_24:%.*]] = load <4 x float>, ptr [[GEP_SRC_24]], align 16
660+
; CHECK-NEXT: [[RDX_NEXT_24]] = fadd reassoc <4 x float> [[RDX_3]], [[L_24]]
661+
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
662+
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
663+
; CHECK: [[EXIT]]:
664+
; CHECK-NEXT: [[RDX_NEXT_LCSSA1:%.*]] = phi <4 x float> [ [[RDX_NEXT_24]], %[[LOOP]] ]
665+
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[RDX_NEXT_3]], [[RDX_NEXT]]
666+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd reassoc <4 x float> [[RDX_NEXT_2]], [[BIN_RDX]]
667+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = fadd reassoc <4 x float> [[RDX_NEXT_24]], [[BIN_RDX1]]
668+
; CHECK-NEXT: ret <4 x float> [[RDX_NEXT_LCSSA]]
669+
;
670+
entry:
671+
br label %loop
672+
673+
loop:
674+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
675+
%rdx = phi <4 x float> [ %start, %entry ], [ %rdx.next, %loop ]
676+
%iv.next = add i64 %iv, 1
677+
%gep = getelementptr inbounds nuw <4 x float>, ptr %p, i64 %iv
678+
%l = load <4 x float>, ptr %gep, align 16
679+
%rdx.next = fadd reassoc <4 x float> %rdx, %l
680+
%ec = icmp ne i64 %iv.next, 1000
681+
br i1 %ec, label %loop, label %exit
682+
683+
exit:
684+
ret <4 x float> %rdx.next
685+
}

0 commit comments

Comments
 (0)