diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 906fa2f857c21..b7224a33f47b1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7933,6 +7933,26 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))) ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second); } + + // Check that all partial reductions in a chain are only used by other + // partial reductions with the same scale factor. Otherwise we end up creating + // users of scaled reductions where the types of the other operands don't + // match. + for (const auto &[Chain, Scale] : PartialReductionChains) { + auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) { + auto *UI = cast(U); + if (isa(UI) && UI->getParent() == OrigLoop->getHeader()) { + return all_of(UI->users(), [ScaleVal, this](const User *U) { + auto *UI = cast(U); + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal; + }); + } + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal || + !OrigLoop->contains(UI->getParent()); + }; + if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx)) + ScaledReductionMap.erase(Chain.Reduction); + } } bool VPRecipeBuilder::getScaledReductions( @@ -8116,11 +8136,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); - if (std::optional ScaleFactor = getScalingForReduction(Instr)) { - if (auto PartialRed = - tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value())) - return PartialRed; - } + if (std::optional ScaleFactor = getScalingForReduction(Instr)) + return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()); if (!shouldWiden(Instr, Range)) return nullptr; @@ -8154,9 +8171,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, isa(BinOpRecipe)) std::swap(BinOp, Accumulator); - if (ScaleFactor != - vputils::getVFScaleFactor(Accumulator->getDefiningRecipe())) - return nullptr; + assert(ScaleFactor == + vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) && + "all accumulators in chain must have same scale factor"); unsigned ReductionOpcode = Reduction->getOpcode(); if (ReductionOpcode == Instruction::Sub) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll index d80178fde45d9..866487d2620ea 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll @@ -70,3 +70,28 @@ loop: exit: ret i32 %red.next } + +define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B) #0 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i16 [ 0, %entry ], [ %red.next, %loop ] + %l.a = load i8, ptr %A, align 1 + %a.ext = zext i8 %l.a to i16 + store i16 %a.ext, ptr %dst, align 2 + %l.b = load i8, ptr %B, align 1 + %b.ext = zext i8 %l.b to i16 + %add = add i16 %red, %b.ext + %add.1 = add i16 %add, %a.ext + %red.next = add i16 %add.1, %b.ext + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, 1024 + br i1 %ec, label %loop, label %exit + +exit: + ret i16 %red.next +} + +attributes #0 = { "target-cpu"="grace" }