From ad1b8772cf6b16c1162bb8ff425679f5ff046ae9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 10 Jan 2022 08:39:12 +0000 Subject: [PATCH] [SCEVExpander] Only create multiplication if needed. 9345ab3a4550 updated generateOverflowCheck to skip creating checks that always evaluate to false. This in turn means that we only need to compute |Step| * Trip count if the result of the multiplication is actually used. Sink the multiplication into ComputeEndCheck, so it is only created when there's an actual check. --- .../Utils/ScalarEvolutionExpander.cpp | 31 ++++++++++--------- .../scev-inserted-runtime-check.ll | 14 +++------ .../wrapping-pointer-versioning.ll | 7 ++--- 3 files changed, 22 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index d60462329d4227..70d284f6776fa8 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2494,21 +2494,6 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty); // Compute |Step| * Backedge - Value *MulV, *OfMul; - if (Step->isOne()) { - // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't - // needed, there is never an overflow, so to avoid artificially inflating - // the cost of the check, directly emit the optimized IR. - MulV = TruncTripCount; - OfMul = ConstantInt::getFalse(MulV->getContext()); - } else { - auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), - Intrinsic::umul_with_overflow, Ty); - CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); - MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); - OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); - } - // Compute: // 1. Start + |Step| * Backedge < Start // 2. Start - |Step| * Backedge > Start @@ -2521,6 +2506,22 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, if (!Signed && Start->isZero() && SE.isKnownPositive(Step)) return ConstantInt::getFalse(Loc->getContext()); + Value *MulV, *OfMul; + if (Step->isOne()) { + // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't + // needed, there is never an overflow, so to avoid artificially inflating + // the cost of the check, directly emit the optimized IR. + MulV = TruncTripCount; + OfMul = ConstantInt::getFalse(MulV->getContext()); + } else { + auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), + Intrinsic::umul_with_overflow, Ty); + CallInst *Mul = + Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); + MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); + OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); + } + Value *Add = nullptr, *Sub = nullptr; bool NeedPosCheck = !SE.isKnownNegative(Step); bool NeedNegCheck = !SE.isKnownPositive(Step); diff --git a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll index 9f3ebcc9a9afc1..34f4045228250b 100644 --- a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll +++ b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll @@ -14,9 +14,6 @@ define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias % ; CHECK: for.body.lver.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 -; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP1]]) -; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 ; CHECK-NEXT: [[TMP8:%.*]] = or i1 false, [[TMP7]] ; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]]) @@ -88,10 +85,10 @@ define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias % ; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[MUL_EXT]] ; CHECK-NEXT: store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT3:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end.loopexit6: +; CHECK: for.end.loopexit3: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: ret void @@ -153,9 +150,6 @@ define void @f_with_offset(i32* noalias %b, i32* noalias %c, i32* noalias %d, i3 ; CHECK: for.body.lver.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 -; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP1]]) -; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 ; CHECK-NEXT: [[TMP8:%.*]] = or i1 false, [[TMP7]] ; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]]) @@ -227,10 +221,10 @@ define void @f_with_offset(i32* noalias %b, i32* noalias %c, i32* noalias %d, i3 ; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[MUL_EXT]] ; CHECK-NEXT: store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT5:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT2:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end.loopexit5: +; CHECK: for.end.loopexit2: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll index 7ce134e66cf8a4..f2d46656301974 100644 --- a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll +++ b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll @@ -31,9 +31,6 @@ define void @f1(i16* noalias %a, ; LV-NEXT: [[A5:%.*]] = bitcast i16* [[A:%.*]] to i8* ; LV-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1 ; LV-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 -; LV-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP1]]) -; LV-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; LV-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 ; LV-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 ; LV-NEXT: [[TMP8:%.*]] = or i1 false, [[TMP7]] ; LV-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]]) @@ -78,10 +75,10 @@ define void @f1(i16* noalias %a, ; LV-NEXT: [[INC]] = add nuw nsw i64 [[IND]], 1 ; LV-NEXT: [[INC1]] = add i32 [[IND1]], 1 ; LV-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[N]] -; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]] +; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT3:%.*]], label [[FOR_BODY]] ; LV: for.end.loopexit: ; LV-NEXT: br label [[FOR_END:%.*]] -; LV: for.end.loopexit6: +; LV: for.end.loopexit3: ; LV-NEXT: br label [[FOR_END]] ; LV: for.end: ; LV-NEXT: ret void