From fcec466cf4a263f267550d6982f0b136503ea46d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 25 Apr 2024 09:47:29 +0100 Subject: [PATCH 1/2] [LAA] Support different strides & non constant dep distances using SCEV. #88039 --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 134 ++++++++++++------ .../Transforms/Scalar/LoopLoadElimination.cpp | 4 +- .../non-constant-strides-forward.ll | 10 +- 3 files changed, 95 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index b1ba8e7c0f601..ed8e749f9d6c2 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1920,20 +1920,21 @@ isLoopVariantIndirectAddress(ArrayRef UnderlyingObjects, namespace { struct DepDistanceStrideAndSizeInfo { const SCEV *Dist; - uint64_t Stride; + uint64_t StrideA; + uint64_t StrideB; uint64_t TypeByteSize; bool AIsWrite; bool BIsWrite; - DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t Stride, - uint64_t TypeByteSize, bool AIsWrite, - bool BIsWrite) - : Dist(Dist), Stride(Stride), TypeByteSize(TypeByteSize), - AIsWrite(AIsWrite), BIsWrite(BIsWrite) {} + DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA, + uint64_t StrideB, uint64_t TypeByteSize, + bool AIsWrite, bool BIsWrite) + : Dist(Dist), StrideA(StrideA), StrideB(StrideB), + TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {} }; } // namespace -// Get the dependence distance, stride, type size and whether it is a write for +// Get the dependence distance, strides, type size and whether it is a write for // the dependence between A and B. Returns a DepType, if we can prove there's // no dependence or the analysis fails. Outlined to lambda to limit he scope // of various temporary variables, like A/BPtr, StrideA/BPtr and others. @@ -1995,10 +1996,11 @@ getDependenceDistanceStrideAndSize( InnermostLoop)) return MemoryDepChecker::Dependence::IndirectUnsafe; - // Need accesses with constant stride. We don't want to vectorize - // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap - // in the address space. - if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) { + // Need accesses with constant strides and the same direction. We don't want + // to vectorize "A[B[i]] += ..." and similar code or pointer arithmetic that + // could wrap in the address space. + if (!StrideAPtr || !StrideBPtr || (StrideAPtr > 0 && StrideBPtr < 0) || + (StrideAPtr < 0 && StrideBPtr > 0)) { LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n"); return MemoryDepChecker::Dependence::Unknown; } @@ -2008,9 +2010,9 @@ getDependenceDistanceStrideAndSize( DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy); if (!HasSameSize) TypeByteSize = 0; - uint64_t Stride = std::abs(StrideAPtr); - return DepDistanceStrideAndSizeInfo(Dist, Stride, TypeByteSize, AIsWrite, - BIsWrite); + return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtr), + std::abs(StrideBPtr), TypeByteSize, + AIsWrite, BIsWrite); } MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( @@ -2028,41 +2030,63 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( if (std::holds_alternative(Res)) return std::get(Res); - const auto &[Dist, Stride, TypeByteSize, AIsWrite, BIsWrite] = + const auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] = std::get(Res); bool HasSameSize = TypeByteSize > 0; + std::optional CommonStride = + StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt; + if (isa(Dist)) { + // TODO: Relax requirement that there is a common stride to retry with + // non-constant distance dependencies. + FoundNonConstantDistanceDependence |= !!CommonStride; + LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n"); + return Dependence::Unknown; + } + ScalarEvolution &SE = *PSE.getSE(); auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); + // If the distance between the acecsses is larger than their absolute stride // multiplied by the backedge taken count, the accesses are independet, i.e. // they are far enough appart that accesses won't access the same location // across all loop ierations. - if (!isa(Dist) && HasSameSize && + if (HasSameSize && CommonStride && isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist, - Stride, TypeByteSize)) + *CommonStride, TypeByteSize)) return Dependence::NoDep; const SCEVConstant *C = dyn_cast(Dist); - if (!C) { - LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n"); - FoundNonConstantDistanceDependence = true; - return Dependence::Unknown; - } - const APInt &Val = C->getAPInt(); - int64_t Distance = Val.getSExtValue(); - - // If the distance between accesses and their strides are known constants, - // check whether the accesses interlace each other. - if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize && - areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) { - LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n"); - return Dependence::NoDep; + // Attempt to prove strided accesses independent. + if (C) { + const APInt &Val = C->getAPInt(); + int64_t Distance = Val.getSExtValue(); + + // If the distance between accesses and their strides are known constants, + // check whether the accesses interlace each other. + if (std::abs(Distance) > 0 && CommonStride && *CommonStride > 1 && + HasSameSize && + areStridedAccessesIndependent(std::abs(Distance), *CommonStride, + TypeByteSize)) { + LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n"); + return Dependence::NoDep; + } } // Negative distances are not plausible dependencies. - if (Val.isNegative()) { + if (SE.isKnownNonPositive(Dist)) { + if (SE.isKnownNonNegative(Dist)) { + if (HasSameSize) { + // Write to the same location with the same size. + return Dependence::Forward; + } else { + LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " + "different type sizes\n"); + return Dependence::Unknown; + } + } + bool IsTrueDataDependence = (AIsWrite && !BIsWrite); // Check if the first access writes to a location that is read in a later // iteration, where the distance between them is not a multiple of a vector @@ -2071,27 +2095,37 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a // forward dependency will allow vectorization using any width. - if (IsTrueDataDependence && EnableForwardingConflictDetection && - (!HasSameSize || couldPreventStoreLoadForward(Val.abs().getZExtValue(), - TypeByteSize))) { - LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n"); - return Dependence::ForwardButPreventsForwarding; + + if (IsTrueDataDependence && EnableForwardingConflictDetection) { + if (!C) { + // TODO: Relax requirement that there is a common stride to retry with + // non-constant distance dependencies. + FoundNonConstantDistanceDependence |= !!CommonStride; + return Dependence::Unknown; + } + if (!HasSameSize || + couldPreventStoreLoadForward(C->getAPInt().abs().getZExtValue(), + TypeByteSize)) { + LLVM_DEBUG( + dbgs() << "LAA: Forward but may prevent st->ld forwarding\n"); + return Dependence::ForwardButPreventsForwarding; + } } LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n"); return Dependence::Forward; } - // Write to the same location with the same size. - if (Val == 0) { - if (HasSameSize) - return Dependence::Forward; - LLVM_DEBUG( - dbgs() << "LAA: Zero dependence difference but different type sizes\n"); + if (!C) { + // TODO: Relax requirement that there is a common stride to retry with + // non-constant distance dependencies. + FoundNonConstantDistanceDependence |= !!CommonStride; + LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n"); return Dependence::Unknown; } - assert(Val.isStrictlyPositive() && "Expect a positive value"); + if (!SE.isKnownPositive(Dist)) + return Dependence::Unknown; if (!HasSameSize) { LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " @@ -2099,6 +2133,14 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( return Dependence::Unknown; } + // The logic below currently only supports StrideA == StrideB, i.e. there's a + // common stride. + if (!CommonStride) + return Dependence::Unknown; + + const APInt &Val = C->getAPInt(); + int64_t Distance = Val.getSExtValue(); + // Bail out early if passed-in parameters make vectorization not feasible. unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? VectorizerParams::VectorizationFactor : 1); @@ -2134,7 +2176,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( // the minimum distance needed is 28, which is greater than distance. It is // not safe to do vectorization. uint64_t MinDistanceNeeded = - TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize; + TypeByteSize * (*CommonStride) * (MinNumIter - 1) + TypeByteSize; if (MinDistanceNeeded > static_cast(Distance)) { LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance " << Distance << '\n'); @@ -2183,7 +2225,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits // since there is a backwards dependency. - uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * Stride); + uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * (*CommonStride)); LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxVF << '\n'); uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index edddfb1b92402..059900f357e64 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -126,8 +126,10 @@ struct StoreToLoadForwardingCandidate { // We don't need to check non-wrapping here because forward/backward // dependence wouldn't be valid if these weren't monotonic accesses. - auto *Dist = cast( + auto *Dist = dyn_cast( PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); + if (!Dist) + return false; const APInt &Val = Dist->getAPInt(); return Val == TypeByteSize * StrideLoad; } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll index 51755314896bb..5f4c732dc19df 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll @@ -8,10 +8,9 @@ declare void @llvm.assume(i1) define void @different_non_constant_strides_known_forward(ptr %A) { ; CHECK-LABEL: 'different_non_constant_strides_known_forward' ; CHECK-NEXT: loop: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: +; CHECK-NEXT: Forward: ; CHECK-NEXT: %l = load i32, ptr %gep.mul.2, align 4 -> ; CHECK-NEXT: store i32 %add, ptr %gep, align 4 ; CHECK-EMPTY: @@ -45,10 +44,9 @@ exit: define void @different_non_constant_strides_known_forward_min_distance_3(ptr %A) { ; CHECK-LABEL: 'different_non_constant_strides_known_forward_min_distance_3' ; CHECK-NEXT: loop: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: +; CHECK-NEXT: Forward: ; CHECK-NEXT: %l = load i32, ptr %gep.mul.2, align 4 -> ; CHECK-NEXT: store i32 %add, ptr %gep, align 4 ; CHECK-EMPTY: From 54a94b7c3173ae334c4dbc555f0b35ac7cd05636 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 25 Apr 2024 10:34:57 +0100 Subject: [PATCH 2/2] [LAA] Pass maximum stride to isSafeDependenceDistance. As discussed in https://github.com/llvm/llvm-project/pull/88039, support different strides with isSafeDependenceDistance by passing the maximum of both strides. isSafeDependenceDistance tries to prove that |Dist| > BackedgeTakenCount * Step holds. Chosing the maximum stride computes the maximum range accesed by the loop for all strides. --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 31 ++++++++++--------- ...es-safe-dep-due-to-backedge-taken-count.ll | 19 +++--------- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index ed8e749f9d6c2..1015cc77890cd 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1805,20 +1805,20 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { } /// Given a dependence-distance \p Dist between two -/// memory accesses, that have the same stride whose absolute value is given -/// in \p Stride, and that have the same type size \p TypeByteSize, -/// in a loop whose takenCount is \p BackedgeTakenCount, check if it is -/// possible to prove statically that the dependence distance is larger -/// than the range that the accesses will travel through the execution of -/// the loop. If so, return true; false otherwise. This is useful for -/// example in loops such as the following (PR31098): +/// memory accesses, that have strides in the same direction whose absolute +/// value of the maximum stride is given in \p MaxStride, and that have the same +/// type size \p TypeByteSize, in a loop whose takenCount is \p +/// BackedgeTakenCount, check if it is possible to prove statically that the +/// dependence distance is larger than the range that the accesses will travel +/// through the execution of the loop. If so, return true; false otherwise. This +/// is useful for example in loops such as the following (PR31098): /// for (i = 0; i < D; ++i) { /// = out[i]; /// out[i+D] = /// } static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, const SCEV &BackedgeTakenCount, - const SCEV &Dist, uint64_t Stride, + const SCEV &Dist, uint64_t MaxStride, uint64_t TypeByteSize) { // If we can prove that @@ -1838,7 +1838,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, // will be executed only if LoopCount >= VF, proving distance >= LoopCount // also guarantees that distance >= VF. // - const uint64_t ByteStride = Stride * TypeByteSize; + const uint64_t ByteStride = MaxStride * TypeByteSize; const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride); const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step); @@ -2046,14 +2046,15 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( ScalarEvolution &SE = *PSE.getSE(); auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); + uint64_t MaxStride = std::max(StrideA, StrideB); - // If the distance between the acecsses is larger than their absolute stride - // multiplied by the backedge taken count, the accesses are independet, i.e. - // they are far enough appart that accesses won't access the same location - // across all loop ierations. - if (HasSameSize && CommonStride && + // If the distance between the acecsses is larger than their maximum absolute + // stride multiplied by the backedge taken count, the accesses are independet, + // i.e. they are far enough appart that accesses won't access the same + // location across all loop ierations. + if (HasSameSize && isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist, - *CommonStride, TypeByteSize)) + MaxStride, TypeByteSize)) return Dependence::NoDep; const SCEVConstant *C = dyn_cast(Dist); diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll index 932129bbb957f..8c7df4bdf5a5a 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll @@ -6,13 +6,8 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define void @forward_dep_known_safe_due_to_backedge_taken_count(ptr %A) { ; CHECK-LABEL: 'forward_dep_known_safe_due_to_backedge_taken_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %l = load i32, ptr %gep.mul.2, align 4 -> -; CHECK-NEXT: store i32 %add, ptr %gep, align 4 -; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: @@ -44,10 +39,9 @@ exit: define void @forward_dep_not_known_safe_due_to_backedge_taken_count(ptr %A) { ; CHECK-LABEL: 'forward_dep_not_known_safe_due_to_backedge_taken_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: +; CHECK-NEXT: Forward: ; CHECK-NEXT: %l = load i32, ptr %gep.mul.2, align 4 -> ; CHECK-NEXT: store i32 %add, ptr %gep, align 4 ; CHECK-EMPTY: @@ -82,13 +76,8 @@ exit: define void @unknown_dep_known_safe_due_to_backedge_taken_count(ptr %A) { ; CHECK-LABEL: 'unknown_dep_known_safe_due_to_backedge_taken_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> -; CHECK-NEXT: store i32 %add, ptr %gep.mul.2, align 4 -; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: