Skip to content

Commit afb657e

Browse files
committed
LAA: generalize strides over unequal type sizes
getDepdenceDistanceStrideAndSize currently returns a non-zero TypeByteSize only if the type-sizes of the source and sink are equal. The non-zero TypeByteSize is then used by isDependent to scale the strides, the distance between the accesses, and the VF. This restriction is very artificial, as the stride sizes can be scaled by the respective type-sizes in advance, freeing isDependent of this responsibility, and removing the ugly special-case of zero-TypeByteSize. The patch also has the side-effect of fixing the long-standing TODO of requesting runtime-checks when the strides are unequal. The test impact of this patch is that several false depdendencies are eliminated, and several unknown depdendencies now come with runtime-checks instead.
1 parent d432e22 commit afb657e

File tree

9 files changed

+239
-336
lines changed

9 files changed

+239
-336
lines changed

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 63 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1805,8 +1805,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
18051805
/// }
18061806
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18071807
const SCEV &MaxBTC, const SCEV &Dist,
1808-
uint64_t MaxStride,
1809-
uint64_t TypeByteSize) {
1808+
uint64_t MaxStride) {
18101809

18111810
// If we can prove that
18121811
// (**) |Dist| > MaxBTC * Step
@@ -1825,8 +1824,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18251824
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
18261825
// also guarantees that distance >= VF.
18271826
//
1828-
const uint64_t ByteStride = MaxStride * TypeByteSize;
1829-
const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
1827+
const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
18301828
const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
18311829

18321830
const SCEV *CastedDist = &Dist;
@@ -1870,9 +1868,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
18701868
if (Distance % TypeByteSize)
18711869
return false;
18721870

1873-
uint64_t ScaledDist = Distance / TypeByteSize;
1874-
1875-
// No dependence if the scaled distance is not multiple of the stride.
1871+
// No dependence if the distance is not multiple of the stride.
18761872
// E.g.
18771873
// for (i = 0; i < 1024 ; i += 4)
18781874
// A[i+2] = A[i] + 1;
@@ -1888,7 +1884,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
18881884
// Two accesses in memory (scaled distance is 4, stride is 3):
18891885
// | A[0] | | | A[3] | | | A[6] | | |
18901886
// | | | | | A[4] | | | A[7] | |
1891-
return ScaledDist % Stride;
1887+
return Distance % Stride;
18921888
}
18931889

18941890
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1927,6 +1923,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
19271923
if (StrideAPtr && *StrideAPtr < 0) {
19281924
std::swap(Src, Sink);
19291925
std::swap(AInst, BInst);
1926+
std::swap(ATy, BTy);
19301927
std::swap(StrideAPtr, StrideBPtr);
19311928
}
19321929

@@ -1970,31 +1967,51 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
19701967
return MemoryDepChecker::Dependence::IndirectUnsafe;
19711968
}
19721969

1973-
int64_t StrideAPtrInt = *StrideAPtr;
1974-
int64_t StrideBPtrInt = *StrideBPtr;
1975-
LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
1976-
<< " Sink induction step: " << StrideBPtrInt << "\n");
1970+
LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << *StrideAPtr
1971+
<< " Sink induction step: " << *StrideBPtr << "\n");
1972+
1973+
// Note that store size is different from alloc size, which is dependent on
1974+
// store size. We use the former for checking illegal cases, and the latter
1975+
// for scaling strides.
1976+
TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
1977+
BStoreSz = DL.getTypeStoreSize(BTy);
1978+
1979+
// When the distance is zero, we're reading/writing the same memory location:
1980+
// check that the store sizes are equal. Otherwise, fail with an unknown
1981+
// dependence for which we should not generate runtime checks.
1982+
if (Dist->isZero() && AStoreSz != BStoreSz)
1983+
return MemoryDepChecker::Dependence::Unknown;
1984+
1985+
// We can't get get a uint64_t for the AllocSize if either of the store sizes
1986+
// are scalable.
1987+
if (AStoreSz.isScalable() || BStoreSz.isScalable())
1988+
return MemoryDepChecker::Dependence::Unknown;
1989+
1990+
// The TypeByteSize is used to scale Distance and VF. In these contexts, the
1991+
// only size that matters is the size of the Sink.
1992+
uint64_t ASz = alignTo(AStoreSz, DL.getABITypeAlign(ATy).value()),
1993+
TypeByteSize = alignTo(BStoreSz, DL.getABITypeAlign(BTy).value());
1994+
1995+
// We scale the strides by the alloc-type-sizes, so we can check that the
1996+
// common distance is equal when ASz != BSz.
1997+
int64_t StrideAScaled = *StrideAPtr * ASz;
1998+
int64_t StrideBScaled = *StrideBPtr * TypeByteSize;
1999+
19772000
// At least Src or Sink are loop invariant and the other is strided or
19782001
// invariant. We can generate a runtime check to disambiguate the accesses.
1979-
if (StrideAPtrInt == 0 || StrideBPtrInt == 0)
2002+
if (!StrideAScaled || !StrideBScaled)
19802003
return MemoryDepChecker::Dependence::Unknown;
19812004

19822005
// Both Src and Sink have a constant stride, check if they are in the same
19832006
// direction.
1984-
if ((StrideAPtrInt > 0 && StrideBPtrInt < 0) ||
1985-
(StrideAPtrInt < 0 && StrideBPtrInt > 0)) {
2007+
if (StrideAScaled > 0 != StrideBScaled > 0) {
19862008
LLVM_DEBUG(
19872009
dbgs() << "Pointer access with strides in different directions\n");
19882010
return MemoryDepChecker::Dependence::Unknown;
19892011
}
19902012

1991-
uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1992-
bool HasSameSize =
1993-
DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
1994-
if (!HasSameSize)
1995-
TypeByteSize = 0;
1996-
return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtrInt),
1997-
std::abs(StrideBPtrInt), TypeByteSize,
2013+
return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAScaled),
2014+
std::abs(StrideBScaled), TypeByteSize,
19982015
AIsWrite, BIsWrite);
19992016
}
20002017

@@ -2012,15 +2029,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20122029

20132030
auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
20142031
std::get<DepDistanceStrideAndSizeInfo>(Res);
2015-
bool HasSameSize = TypeByteSize > 0;
20162032

20172033
std::optional<uint64_t> CommonStride =
20182034
StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt;
20192035
if (isa<SCEVCouldNotCompute>(Dist)) {
2020-
// TODO: Relax requirement that there is a common stride to retry with
2021-
// non-constant distance dependencies.
2022-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
20232036
LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
2037+
FoundNonConstantDistanceDependence = true;
20242038
return Dependence::Unknown;
20252039
}
20262040

@@ -2033,24 +2047,20 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20332047
// upper bound of the number of iterations), the accesses are independet, i.e.
20342048
// they are far enough appart that accesses won't access the same location
20352049
// across all loop ierations.
2036-
if (HasSameSize && isSafeDependenceDistance(
2037-
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
2038-
*Dist, MaxStride, TypeByteSize))
2050+
if (isSafeDependenceDistance(
2051+
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
20392052
return Dependence::NoDep;
20402053

2041-
const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2054+
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
20422055

20432056
// Attempt to prove strided accesses independent.
2044-
if (C) {
2045-
const APInt &Val = C->getAPInt();
2046-
int64_t Distance = Val.getSExtValue();
2057+
if (ConstDist) {
2058+
int64_t Distance = std::abs(ConstDist->getAPInt().getSExtValue());
20472059

20482060
// If the distance between accesses and their strides are known constants,
20492061
// check whether the accesses interlace each other.
2050-
if (std::abs(Distance) > 0 && CommonStride && *CommonStride > 1 &&
2051-
HasSameSize &&
2052-
areStridedAccessesIndependent(std::abs(Distance), *CommonStride,
2053-
TypeByteSize)) {
2062+
if (Distance > 0 && CommonStride && CommonStride > 1 &&
2063+
areStridedAccessesIndependent(Distance, *CommonStride, TypeByteSize)) {
20542064
LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
20552065
return Dependence::NoDep;
20562066
}
@@ -2063,15 +2073,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20632073

20642074
// Negative distances are not plausible dependencies.
20652075
if (SE.isKnownNonPositive(Dist)) {
2066-
if (SE.isKnownNonNegative(Dist)) {
2067-
if (HasSameSize) {
2068-
// Write to the same location with the same size.
2069-
return Dependence::Forward;
2070-
}
2071-
LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2072-
"different type sizes\n");
2073-
return Dependence::Unknown;
2074-
}
2076+
if (SE.isKnownNonNegative(Dist))
2077+
// Write to the same location.
2078+
return Dependence::Forward;
20752079

20762080
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
20772081
// Check if the first access writes to a location that is read in a later
@@ -2083,17 +2087,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20832087
// forward dependency will allow vectorization using any width.
20842088

20852089
if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2086-
if (!C) {
2087-
// TODO: FoundNonConstantDistanceDependence is used as a necessary
2088-
// condition to consider retrying with runtime checks. Historically, we
2089-
// did not set it when strides were different but there is no inherent
2090-
// reason to.
2091-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2090+
if (!ConstDist) {
2091+
FoundNonConstantDistanceDependence = true;
20922092
return Dependence::Unknown;
20932093
}
2094-
if (!HasSameSize ||
2095-
couldPreventStoreLoadForward(C->getAPInt().abs().getZExtValue(),
2096-
TypeByteSize)) {
2094+
if (couldPreventStoreLoadForward(
2095+
ConstDist->getAPInt().abs().getZExtValue(), TypeByteSize)) {
20972096
LLVM_DEBUG(
20982097
dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
20992098
return Dependence::ForwardButPreventsForwarding;
@@ -2107,27 +2106,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21072106
int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue();
21082107
// Below we only handle strictly positive distances.
21092108
if (MinDistance <= 0) {
2110-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2109+
FoundNonConstantDistanceDependence = true;
21112110
return Dependence::Unknown;
21122111
}
21132112

2114-
if (!isa<SCEVConstant>(Dist)) {
2115-
// Previously this case would be treated as Unknown, possibly setting
2116-
// FoundNonConstantDistanceDependence to force re-trying with runtime
2117-
// checks. Until the TODO below is addressed, set it here to preserve
2118-
// original behavior w.r.t. re-trying with runtime checks.
2119-
// TODO: FoundNonConstantDistanceDependence is used as a necessary
2120-
// condition to consider retrying with runtime checks. Historically, we
2121-
// did not set it when strides were different but there is no inherent
2122-
// reason to.
2123-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2124-
}
2125-
2126-
if (!HasSameSize) {
2127-
LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2128-
"different type sizes\n");
2129-
return Dependence::Unknown;
2130-
}
2113+
if (!ConstDist)
2114+
FoundNonConstantDistanceDependence = true;
21312115

21322116
if (!CommonStride)
21332117
return Dependence::Unknown;
@@ -2142,8 +2126,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21422126

21432127
// It's not vectorizable if the distance is smaller than the minimum distance
21442128
// needed for a vectroized/unrolled version. Vectorizing one iteration in
2145-
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2146-
// TypeByteSize (No need to plus the last gap distance).
2129+
// front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
2130+
// (No need to plus the last gap distance).
21472131
//
21482132
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
21492133
// foo(int *A) {
@@ -2170,10 +2154,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21702154
// We know that Dist is positive, but it may not be constant. Use the signed
21712155
// minimum for computations below, as this ensures we compute the closest
21722156
// possible dependence distance.
2173-
uint64_t MinDistanceNeeded =
2174-
TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
2157+
uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
21752158
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2176-
if (!isa<SCEVConstant>(Dist)) {
2159+
if (!ConstDist) {
21772160
// For non-constant distances, we checked the lower bound of the
21782161
// dependence distance and the distance may be larger at runtime (and safe
21792162
// for vectorization). Classify it as Unknown, so we re-try with runtime
@@ -2228,12 +2211,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
22282211

22292212
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
22302213
// since there is a backwards dependency.
2231-
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
2214+
uint64_t MaxVF = MinDepDistBytes / *CommonStride;
22322215
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
22332216
<< " with max VF = " << MaxVF << '\n');
22342217

22352218
uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2236-
if (!isa<SCEVConstant>(Dist) && MaxVFInBits < MaxTargetVectorWidthInBits) {
2219+
if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
22372220
// For non-constant distances, we checked the lower bound of the dependence
22382221
// distance and the distance may be larger at runtime (and safe for
22392222
// vectorization). Classify it as Unknown, so we re-try with runtime checks.

llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,8 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
130130
; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
131131
; CHECK-NEXT: loop:
132132
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
133-
; CHECK-NEXT: Unknown data dependence.
133+
; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
134134
; CHECK-NEXT: Dependences:
135-
; CHECK-NEXT: Unknown:
136-
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
137-
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
138-
; CHECK-EMPTY:
139-
; CHECK-NEXT: Unknown:
140-
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
141-
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
142-
; CHECK-EMPTY:
143135
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
144136
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
145137
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8

llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -106,18 +106,24 @@ exit:
106106
ret void
107107
}
108108

109-
define void @unknown_dep_not_known_safe_due_to_backedge_taken_count(ptr %A) {
110-
; CHECK-LABEL: 'unknown_dep_not_known_safe_due_to_backedge_taken_count'
109+
define void @unknown_dep_safe_with_rtchecks(ptr %A) {
110+
; CHECK-LABEL: 'unknown_dep_safe_with_rtchecks'
111111
; CHECK-NEXT: loop:
112-
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
113-
; CHECK-NEXT: Unknown data dependence.
112+
; CHECK-NEXT: Memory dependences are safe with run-time checks
114113
; CHECK-NEXT: Dependences:
115-
; CHECK-NEXT: Unknown:
116-
; CHECK-NEXT: %l = load i32, ptr %gep, align 4 ->
117-
; CHECK-NEXT: store i32 %add, ptr %gep.mul.2, align 4
118-
; CHECK-EMPTY:
119114
; CHECK-NEXT: Run-time memory checks:
115+
; CHECK-NEXT: Check 0:
116+
; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]):
117+
; CHECK-NEXT: %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv.mul.2
118+
; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]):
119+
; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %A, i64 %iv
120120
; CHECK-NEXT: Grouped accesses:
121+
; CHECK-NEXT: Group [[GRP1]]:
122+
; CHECK-NEXT: (Low: (2040 + %A)<nuw> High: (4084 + %A))
123+
; CHECK-NEXT: Member: {(2040 + %A)<nuw>,+,8}<nuw><%loop>
124+
; CHECK-NEXT: Group [[GRP2]]:
125+
; CHECK-NEXT: (Low: %A High: (1024 + %A))
126+
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop>
121127
; CHECK-EMPTY:
122128
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
123129
; CHECK-NEXT: SCEV assumptions:

llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,6 @@ define void @forward_different_access_sizes(ptr readnone %end, ptr %start) {
7070
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
7171
; CHECK-NEXT: %l = load i24, ptr %gep.1, align 1
7272
; CHECK-EMPTY:
73-
; CHECK-NEXT: Forward:
74-
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
75-
; CHECK-NEXT: store i24 %l, ptr %ptr.iv, align 1
76-
; CHECK-EMPTY:
7773
; CHECK-NEXT: Run-time memory checks:
7874
; CHECK-NEXT: Grouped accesses:
7975
; CHECK-EMPTY:

0 commit comments

Comments
 (0)