Skip to content

Commit 664b227

Browse files
authored
[LV] Keep duplicate recipes in VPExpressionRecipe (llvm#156976)
The VPExpressionRecipe class uses a set to store its bundled recipes. If repeated recipes are bundled, the duplicates are lost, so the recipes that follow are no longer at the expected position in the set. For example, when printing a reduce.add(mul(ext, ext)) bundle in which both extends are the same recipe, the 3rd element of the set is the reduction rather than the expected mul, causing a cast error. With this change, duplicates are kept, so each recipe is at its expected index. Fixes llvm#156464.
1 parent edb80a8 commit 664b227

File tree

4 files changed

+189
-9
lines changed

4 files changed

+189
-9
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "llvm/ADT/DenseMap.h"
3030
#include "llvm/ADT/SmallBitVector.h"
3131
#include "llvm/ADT/SmallPtrSet.h"
32+
#include "llvm/ADT/SmallSet.h"
3233
#include "llvm/ADT/SmallVector.h"
3334
#include "llvm/ADT/Twine.h"
3435
#include "llvm/ADT/ilist.h"
@@ -2977,7 +2978,8 @@ class LLVM_ABI_FOR_TEST VPBranchOnMaskRecipe : public VPRecipeBase {
29772978
/// the expression is elevated to connect the non-expression recipe with the
29782979
/// VPExpressionRecipe itself.
29792980
class VPExpressionRecipe : public VPSingleDefRecipe {
2980-
/// Recipes included in this VPExpressionRecipe.
2981+
/// Recipes included in this VPExpressionRecipe. This could contain
2982+
/// duplicates.
29812983
SmallVector<VPSingleDefRecipe *> ExpressionRecipes;
29822984

29832985
/// Temporary VPValues used for external operands of the expression, i.e.
@@ -3039,8 +3041,11 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
30393041
}
30403042

30413043
~VPExpressionRecipe() override {
3042-
for (auto *R : reverse(ExpressionRecipes))
3043-
delete R;
3044+
SmallPtrSet<VPSingleDefRecipe *, 4> ExpressionRecipesSeen;
3045+
for (auto *R : reverse(ExpressionRecipes)) {
3046+
if (ExpressionRecipesSeen.insert(R).second)
3047+
delete R;
3048+
}
30443049
for (VPValue *T : LiveInPlaceholders)
30453050
delete T;
30463051
}

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2755,10 +2755,7 @@ VPExpressionRecipe::VPExpressionRecipe(
27552755
ExpressionTypes ExpressionType,
27562756
ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
27572757
: VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2758-
ExpressionRecipes(SetVector<VPSingleDefRecipe *>(
2759-
ExpressionRecipes.begin(), ExpressionRecipes.end())
2760-
.takeVector()),
2761-
ExpressionType(ExpressionType) {
2758+
ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
27622759
assert(!ExpressionRecipes.empty() && "Nothing to combine?");
27632760
assert(
27642761
none_of(ExpressionRecipes,
@@ -2802,14 +2799,22 @@ VPExpressionRecipe::VPExpressionRecipe(
28022799
continue;
28032800
addOperand(Op);
28042801
LiveInPlaceholders.push_back(new VPValue());
2805-
R->setOperand(Idx, LiveInPlaceholders.back());
28062802
}
28072803
}
2804+
2805+
// Replace each external operand with the first one created for it in
2806+
// LiveInPlaceholders.
2807+
for (auto *R : ExpressionRecipes)
2808+
for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
2809+
R->replaceUsesOfWith(LiveIn, Tmp);
28082810
}
28092811

28102812
void VPExpressionRecipe::decompose() {
28112813
for (auto *R : ExpressionRecipes)
2812-
R->insertBefore(this);
2814+
// Since the list could contain duplicates, make sure the recipe hasn't
2815+
// already been inserted.
2816+
if (!R->getParent())
2817+
R->insertBefore(this);
28132818

28142819
for (const auto &[Idx, Op] : enumerate(operands()))
28152820
LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);

llvm/test/Transforms/LoopVectorize/reduction-inloop.ll

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3351,6 +3351,129 @@ for.end: ; preds = %for.body, %entry
33513351
ret i32 %x.0.lcssa
33523352
}
33533353

3354+
; Test that bundling recipes that share an operand into an expression works.
3355+
; In this case the two extends are the recipes that share an operand.
3356+
define i64 @reduction_expression_same_operands(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
3357+
; CHECK-LABEL: define i64 @reduction_expression_same_operands(
3358+
; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]], i32 [[N:%.*]]) {
3359+
; CHECK-NEXT: [[ENTRY:.*]]:
3360+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
3361+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
3362+
; CHECK: [[VECTOR_PH]]:
3363+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
3364+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
3365+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
3366+
; CHECK: [[VECTOR_BODY]]:
3367+
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
3368+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
3369+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[INDEX]]
3370+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 4
3371+
; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
3372+
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i64> [[TMP3]], [[TMP3]]
3373+
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
3374+
; CHECK-NEXT: [[TMP6]] = add i64 [[VEC_PHI]], [[TMP5]]
3375+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
3376+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
3377+
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
3378+
; CHECK: [[MIDDLE_BLOCK]]:
3379+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
3380+
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
3381+
; CHECK: [[SCALAR_PH]]:
3382+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
3383+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
3384+
; CHECK-NEXT: br label %[[LOOP:.*]]
3385+
; CHECK: [[LOOP]]:
3386+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
3387+
; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
3388+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[IV]]
3389+
; CHECK-NEXT: [[LOAD0:%.*]] = load i16, ptr [[ARRAYIDX]], align 4
3390+
; CHECK-NEXT: [[CONV0:%.*]] = sext i16 [[LOAD0]] to i32
3391+
; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD0]] to i32
3392+
; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[CONV0]], [[CONV1]]
3393+
; CHECK-NEXT: [[MUL:%.*]] = sext i32 [[MUL1]] to i64
3394+
; CHECK-NEXT: [[RDX_NEXT]] = add nsw i64 [[RDX]], [[MUL]]
3395+
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
3396+
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
3397+
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
3398+
; CHECK: [[EXIT]]:
3399+
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
3400+
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
3401+
;
3402+
; CHECK-INTERLEAVED-LABEL: define i64 @reduction_expression_same_operands(
3403+
; CHECK-INTERLEAVED-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]], i32 [[N:%.*]]) {
3404+
; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]:
3405+
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
3406+
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
3407+
; CHECK-INTERLEAVED: [[VECTOR_PH]]:
3408+
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 8
3409+
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
3410+
; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]]
3411+
; CHECK-INTERLEAVED: [[VECTOR_BODY]]:
3412+
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
3413+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
3414+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
3415+
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[INDEX]]
3416+
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 4
3417+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 4
3418+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP2]], align 4
3419+
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
3420+
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nsw <4 x i64> [[TMP4]], [[TMP4]]
3421+
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]])
3422+
; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add i64 [[VEC_PHI]], [[TMP6]]
3423+
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_LOAD2]] to <4 x i64>
3424+
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nsw <4 x i64> [[TMP9]], [[TMP9]]
3425+
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP10]])
3426+
; CHECK-INTERLEAVED-NEXT: [[TMP12]] = add i64 [[VEC_PHI1]], [[TMP11]]
3427+
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
3428+
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
3429+
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
3430+
; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
3431+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i64 [[TMP12]], [[TMP7]]
3432+
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
3433+
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
3434+
; CHECK-INTERLEAVED: [[SCALAR_PH]]:
3435+
; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
3436+
; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
3437+
; CHECK-INTERLEAVED-NEXT: br label %[[LOOP:.*]]
3438+
; CHECK-INTERLEAVED: [[LOOP]]:
3439+
; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
3440+
; CHECK-INTERLEAVED-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
3441+
; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[IV]]
3442+
; CHECK-INTERLEAVED-NEXT: [[LOAD0:%.*]] = load i16, ptr [[ARRAYIDX]], align 4
3443+
; CHECK-INTERLEAVED-NEXT: [[CONV0:%.*]] = sext i16 [[LOAD0]] to i32
3444+
; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD0]] to i32
3445+
; CHECK-INTERLEAVED-NEXT: [[MUL1:%.*]] = mul nsw i32 [[CONV0]], [[CONV1]]
3446+
; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = sext i32 [[MUL1]] to i64
3447+
; CHECK-INTERLEAVED-NEXT: [[RDX_NEXT]] = add nsw i64 [[RDX]], [[MUL]]
3448+
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
3449+
; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
3450+
; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
3451+
; CHECK-INTERLEAVED: [[EXIT]]:
3452+
; CHECK-INTERLEAVED-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
3453+
; CHECK-INTERLEAVED-NEXT: ret i64 [[R_0_LCSSA]]
3454+
;
3455+
entry:
3456+
br label %loop
3457+
3458+
loop:
3459+
%iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
3460+
%rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
3461+
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv
3462+
%load0 = load i16, ptr %arrayidx, align 4
3463+
%conv0 = sext i16 %load0 to i32
3464+
%conv1 = sext i16 %load0 to i32
3465+
%mul = mul nsw i32 %conv0, %conv1
3466+
%conv = sext i32 %mul to i64
3467+
%rdx.next = add nsw i64 %rdx, %conv
3468+
%iv.next = add nuw nsw i32 %iv, 1
3469+
%exitcond = icmp eq i32 %iv.next, %n
3470+
br i1 %exitcond, label %exit, label %loop
3471+
3472+
exit:
3473+
%r.0.lcssa = phi i64 [ %rdx.next, %loop ]
3474+
ret i64 %r.0.lcssa
3475+
}
3476+
33543477
declare float @llvm.fmuladd.f32(float, float, float)
33553478

33563479
!6 = distinct !{!6, !7, !8}

llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,3 +753,50 @@ exit:
753753
%r.0.lcssa = phi i64 [ %rdx.next, %loop ]
754754
ret i64 %r.0.lcssa
755755
}
756+
757+
define i64 @print_mulacc_duplicate_extends(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
758+
; CHECK-LABEL: 'print_mulacc_duplicate_extends'
759+
; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
760+
; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
761+
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
762+
; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
763+
; CHECK-NEXT: Live-in ir<%n> = original trip-count
764+
; CHECK-EMPTY:
765+
; CHECK: vector.ph:
766+
; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
767+
; CHECK-NEXT: Successor(s): vector loop
768+
; CHECK-EMPTY:
769+
; CHECK-NEXT: <x1> vector loop: {
770+
; CHECK-NEXT: vector.body:
771+
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
772+
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
773+
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
774+
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
775+
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
776+
; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]>
777+
; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.sub (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD0]]> sext to i64))
778+
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
779+
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
780+
; CHECK-NEXT: No successors
781+
; CHECK-NEXT: }
782+
;
783+
entry:
784+
br label %loop
785+
786+
loop:
787+
%iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
788+
%rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
789+
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv
790+
%load0 = load i16, ptr %arrayidx, align 4
791+
%conv0 = sext i16 %load0 to i32
792+
%mul = mul nsw i32 %conv0, %conv0
793+
%conv = sext i32 %mul to i64
794+
%rdx.next = sub nsw i64 %rdx, %conv
795+
%iv.next = add nuw nsw i32 %iv, 1
796+
%exitcond = icmp eq i32 %iv.next, %n
797+
br i1 %exitcond, label %exit, label %loop
798+
799+
exit:
800+
%r.0.lcssa = phi i64 [ %rdx.next, %loop ]
801+
ret i64 %r.0.lcssa
802+
}

0 commit comments

Comments
 (0)