Skip to content

Commit b0b0079

Browse files
committed
[IVDescriptors] Support detecting reductions with vector instructions. (llvm#166353)
In combination with llvm#149470, this will introduce parallel accumulators when unrolling reductions with vector instructions. See also llvm#166630, which aims to introduce parallel accumulators for FP reductions. (cherry picked from commit c73de97)
1 parent c94a28d commit b0b0079

File tree

3 files changed

+125
-2
lines changed

3 files changed

+125
-2
lines changed

llvm/lib/Analysis/IVDescriptors.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,10 +268,12 @@ bool RecurrenceDescriptor::AddReductionVar(
268268
// resulting from the type promotion performed by InstCombine. Vector
269269
// operations are not limited to the legal integer widths, so we may be able
270270
// to evaluate the reduction in the narrower width.
271-
if (RecurrenceType->isFloatingPointTy()) {
271+
// Check the scalar type to handle both scalar and vector types.
272+
Type *ScalarTy = RecurrenceType->getScalarType();
273+
if (ScalarTy->isFloatingPointTy()) {
272274
if (!isFloatingPointRecurrenceKind(Kind))
273275
return false;
274-
} else if (RecurrenceType->isIntegerTy()) {
276+
} else if (ScalarTy->isIntegerTy()) {
275277
if (!isIntegerRecurrenceKind(Kind))
276278
return false;
277279
if (!isMinMaxRecurrenceKind(Kind))

llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ loop:
358358
exit:
359359
ret float %rdx.next
360360
}
361+
361362
define i32 @test_smin(ptr %src, i64 %n) {
362363
; CHECK-LABEL: define i32 @test_smin(
363364
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
@@ -623,3 +624,56 @@ loop:
623624
exit:
624625
ret i32 %rdx.next
625626
}
627+
628+
define <4 x i32> @test_vector_add(ptr %p, i64 %n, <4 x i32> %start) {
629+
; CHECK-LABEL: define <4 x i32> @test_vector_add(
630+
; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]], <4 x i32> [[START:%.*]]) {
631+
; CHECK-NEXT: [[ENTRY:.*]]:
632+
; CHECK-NEXT: br label %[[LOOP:.*]]
633+
; CHECK: [[LOOP]]:
634+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
635+
; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
636+
; CHECK-NEXT: [[RDX_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
637+
; CHECK-NEXT: [[RDX_3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
638+
; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
639+
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
640+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV]]
641+
; CHECK-NEXT: [[L:%.*]] = load <4 x i32>, ptr [[GEP]], align 16
642+
; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[L]]
643+
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
644+
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT]]
645+
; CHECK-NEXT: [[L_1:%.*]] = load <4 x i32>, ptr [[GEP_1]], align 16
646+
; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[L_1]]
647+
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
648+
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_1]]
649+
; CHECK-NEXT: [[L_2:%.*]] = load <4 x i32>, ptr [[GEP_2]], align 16
650+
; CHECK-NEXT: [[RDX_NEXT_2]] = add <4 x i32> [[RDX_2]], [[L_2]]
651+
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
652+
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_2]]
653+
; CHECK-NEXT: [[L_3:%.*]] = load <4 x i32>, ptr [[GEP_3]], align 16
654+
; CHECK-NEXT: [[RDX_NEXT_3]] = add <4 x i32> [[RDX_3]], [[L_3]]
655+
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
656+
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
657+
; CHECK: [[EXIT]]:
658+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi <4 x i32> [ [[RDX_NEXT_3]], %[[LOOP]] ]
659+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]]
660+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[RDX_NEXT_2]], [[BIN_RDX]]
661+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[RDX_NEXT_3]], [[BIN_RDX1]]
662+
; CHECK-NEXT: ret <4 x i32> [[BIN_RDX2]]
663+
;
664+
entry:
665+
br label %loop
666+
667+
loop:
668+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
669+
%rdx = phi <4 x i32> [ %start, %entry ], [ %rdx.next, %loop ]
670+
%iv.next = add i64 %iv, 1
671+
%gep = getelementptr inbounds nuw <4 x i32>, ptr %p, i64 %iv
672+
%l = load <4 x i32>, ptr %gep, align 16
673+
%rdx.next = add <4 x i32> %rdx, %l
674+
%ec = icmp ne i64 %iv.next, 1000
675+
br i1 %ec, label %loop, label %exit
676+
677+
exit:
678+
ret <4 x i32> %rdx.next
679+
}

llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,72 @@ exit:
223223
ret i32 %res
224224
}
225225

226+
define <4 x i32> @test_vector_add_reduction(ptr %a, i64 %n) {
227+
; CHECK-LABEL: define <4 x i32> @test_vector_add_reduction(
228+
; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
229+
; CHECK-NEXT: [[ENTRY:.*]]:
230+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
231+
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
232+
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
233+
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
234+
; CHECK: [[ENTRY_NEW]]:
235+
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
236+
; CHECK-NEXT: br label %[[LOOP:.*]]
237+
; CHECK: [[LOOP]]:
238+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
239+
; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
240+
; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
241+
; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
242+
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV]]
243+
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP_A]], align 16
244+
; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[TMP2]]
245+
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
246+
; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_NEXT]]
247+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GEP_A_1]], align 16
248+
; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[TMP3]]
249+
; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
250+
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
251+
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
252+
; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
253+
; CHECK: [[EXIT_UNR_LCSSA]]:
254+
; CHECK-NEXT: [[RES_PH:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ]
255+
; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
256+
; CHECK-NEXT: [[RDX_UNR:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ]
257+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]]
258+
; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
259+
; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]]
260+
; CHECK: [[LOOP_EPIL_PREHEADER]]:
261+
; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ]
262+
; CHECK-NEXT: [[RDX_EPIL_INIT:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ]
263+
; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0
264+
; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]])
265+
; CHECK-NEXT: br label %[[LOOP_EPIL:.*]]
266+
; CHECK: [[LOOP_EPIL]]:
267+
; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_EPIL_INIT]]
268+
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GEP_A_EPIL]], align 16
269+
; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add <4 x i32> [[RDX_EPIL_INIT]], [[TMP4]]
270+
; CHECK-NEXT: br label %[[EXIT]]
271+
; CHECK: [[EXIT]]:
272+
; CHECK-NEXT: [[RES:%.*]] = phi <4 x i32> [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
273+
; CHECK-NEXT: ret <4 x i32> [[RES]]
274+
;
275+
entry:
276+
br label %loop
277+
278+
loop:
279+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
280+
%rdx = phi <4 x i32> [ zeroinitializer, %entry ], [ %rdx.next, %loop ]
281+
%gep.a = getelementptr inbounds nuw <4 x i32>, ptr %a, i64 %iv
282+
%1 = load <4 x i32>, ptr %gep.a, align 16
283+
%rdx.next = add <4 x i32> %rdx, %1
284+
%iv.next = add nuw nsw i64 %iv, 1
285+
%ec = icmp eq i64 %iv.next, %n
286+
br i1 %ec, label %exit, label %loop, !llvm.loop !0
287+
288+
exit:
289+
%res = phi <4 x i32> [ %rdx.next, %loop ]
290+
ret <4 x i32> %res
291+
}
226292

227293

228294
!0 = distinct !{!0, !1}
@@ -237,4 +303,5 @@ exit:
237303
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
238304
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
239305
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
306+
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
240307
;.

0 commit comments

Comments
 (0)