-
Notifications
You must be signed in to change notification settings - Fork 13.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SROA]: Only defer trying partial sized ptr or ptr vector types #82279
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Jeffrey Byrnes (jrbyrnes) Changes#77678 introduced regressions of the following type: LoadStoreTys had the better candidate type, but it was not selected since considering LoadStoreTys was deferred until after full consideration of CandidateTys (those that are the same size as the partition). See: https://godbolt.org/z/G5sxbf1sx Full diff: https://github.com/llvm/llvm-project/pull/82279.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 6c8785d52c4eab..b12d156e067edd 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2271,6 +2271,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// we have different element types.
SmallVector<VectorType *, 4> CandidateTys;
SetVector<Type *> LoadStoreTys;
+ SetVector<Type *> DeferredTys;
Type *CommonEltTy = nullptr;
VectorType *CommonVecPtrTy = nullptr;
bool HaveVecPtrTy = false;
@@ -2305,6 +2306,35 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
}
};
+ auto createAndCheckVectorTypesForPromotion =
+ [&](SetVector<Type *> OtherTys,
+ SmallVector<VectorType *, 4> CandidateTysCopy) {
+ // Consider additional vector types where the element type size is a
+ // multiple of load/store element size.
+ for (Type *Ty : OtherTys) {
+ if (!VectorType::isValidElementType(Ty))
+ continue;
+ unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
+ // Make a copy of CandidateTys and iterate through it, because we
+ // might append to CandidateTys in the loop.
+ for (VectorType *&VTy : CandidateTysCopy) {
+ unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
+ unsigned ElementSize =
+ DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
+ if (TypeSize != VectorSize && TypeSize != ElementSize &&
+ VectorSize % TypeSize == 0) {
+ VectorType *NewVTy =
+ VectorType::get(Ty, VectorSize / TypeSize, false);
+ CheckCandidateType(NewVTy);
+ }
+ }
+ }
+
+ return checkVectorTypesForPromotion(
+ P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
+ HaveCommonVecPtrTy, CommonVecPtrTy);
+ };
+
// Put load and store types into a set for de-duplication.
for (const Slice &S : P) {
Type *Ty;
@@ -2314,44 +2344,31 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
Ty = SI->getValueOperand()->getType();
else
continue;
+
+ auto CandTy =
+ isa<VectorType>(Ty) ? cast<VectorType>(Ty)->getElementType() : Ty;
+ if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
+ S.endOffset() != P.endOffset())) {
+ DeferredTys.insert(Ty);
+ continue;
+ }
+
LoadStoreTys.insert(Ty);
// Consider any loads or stores that are the exact size of the slice.
if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
CheckCandidateType(Ty);
}
- if (auto *VTy = checkVectorTypesForPromotion(
- P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
- HaveCommonVecPtrTy, CommonVecPtrTy))
+ SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
+ if (auto *VTy =
+ createAndCheckVectorTypesForPromotion(LoadStoreTys, CandidateTysCopy))
return VTy;
- // Consider additional vector types where the element type size is a
- // multiple of load/store element size.
- for (Type *Ty : LoadStoreTys) {
- if (!VectorType::isValidElementType(Ty))
- continue;
- unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
- // Make a copy of CandidateTys and iterate through it, because we might
- // append to CandidateTys in the loop.
- SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
- CandidateTys.clear();
- for (VectorType *&VTy : CandidateTysCopy) {
- unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
- unsigned ElementSize =
- DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
- if (TypeSize != VectorSize && TypeSize != ElementSize &&
- VectorSize % TypeSize == 0) {
- VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
- CheckCandidateType(NewVTy);
- }
- }
- }
-
- return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy,
- CommonEltTy, HaveVecPtrTy,
- HaveCommonVecPtrTy, CommonVecPtrTy);
+ CandidateTys.clear();
+ return createAndCheckVectorTypesForPromotion(DeferredTys, CandidateTysCopy);
}
+
/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll
index e48dd5bb392082..dc70520fc5ca70 100644
--- a/llvm/test/Transforms/SROA/vector-promotion.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion.ll
@@ -1392,6 +1392,38 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) {
ret <4 x ptr> %sroaval
}
+define <4 x i32> @validLoadStoreTy([2 x i64] %cond.coerce) {
+; CHECK-LABEL: @validLoadStoreTy(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0
+; CHECK-NEXT: [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0
+; CHECK-NEXT: [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1
+; CHECK-NEXT: [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[TMP0]]
+;
+; DEBUG-LABEL: @validLoadStoreTy(
+; DEBUG-NEXT: entry:
+; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META553:![0-9]+]], metadata !DIExpression()), !dbg [[DBG557:![0-9]+]]
+; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META554:![0-9]+]], metadata !DIExpression()), !dbg [[DBG558:![0-9]+]]
+; DEBUG-NEXT: [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0, !dbg [[DBG559:![0-9]+]]
+; DEBUG-NEXT: [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0, !dbg [[DBG559]]
+; DEBUG-NEXT: [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1, !dbg [[DBG559]]
+; DEBUG-NEXT: [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1, !dbg [[DBG559]]
+; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META555:![0-9]+]], metadata !DIExpression()), !dbg [[DBG560:![0-9]+]]
+; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32>, !dbg [[DBG561:![0-9]+]]
+; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[TMP0]], metadata [[META556:![0-9]+]], metadata !DIExpression()), !dbg [[DBG561]]
+; DEBUG-NEXT: ret <4 x i32> [[TMP0]], !dbg [[DBG562:![0-9]+]]
+;
+entry:
+ %cond = alloca <4 x i32>, align 8
+ %coerce.dive2 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0
+ store [2 x i64] %cond.coerce, ptr %coerce.dive2, align 8
+ %m5 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0
+ %0 = load <4 x i32>, ptr %m5, align 8
+ ret <4 x i32> %0
+}
+
declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
declare void @llvm.lifetime.end.p0(i64, ptr)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
@topperc This is what I was referring to |
✅ With the latest revision this PR passed the C/C++ code formatter. |
Change-Id: Iea5d60b12e2de7033fc1a71e80aa96c261e998bf
Change-Id: Ic77f87290905addadd5819dff2d0c62f031022ab
Formatting |
…tion Change-Id: I7f63cfe0fdb9f08fd94e40c33c060af492bdd26a
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
// Make a copy of CandidateTys and iterate through it, because we | ||
// might append to CandidateTys in the loop. | ||
for (VectorType *&VTy : CandidateTysCopy) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't being mutated though? can you just use ArrayRef of CandidateTys?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think CheckCandidateType modifies CandidateTys
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But if it's append only the ArrayRef won't see the new elements
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the purpose of the loop ArrayRef is fine I think since it is treated as fixed size, but we need the actual copy for the second call to createAndCheckVectorTypesForPromotion
since we CandidateTys.clear();
.
The clear() is done to save iterations in checkVectorTypesForPromotion
by not retrying things, but we still need these Tys for the size-based candidateTys
generation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, this is causing a failure.
Looks like if push_back exceeds the capacity of the allocation for small_vector, push_back will call grow
void SmallVectorTemplateBase<T, TriviallyCopyable>::grow(size_t MinSize) { |
This will malloc new memory, move the existing elements and reclaim their prior memory.
Thus the ArrayRef will be working with bad data.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Besides, we may CandidateTys.clear()
from CheckCandidateType
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a test that hit this failure?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no good way to capture the actual failure. This is because the failure depends on overwriting the reclaimed memory -- this is dependent upon malloc as well as hardware.
Instead, I've added an assert which enforces memory invariance to support the test. This is one of the conditions for the failure.
Change-Id: Iea1367c878118b6f1dcac3c43b53b98be0ca57e3
Change-Id: Ib00edfe8bd3517effb7f58bf9631bd2700dc84c1
Change-Id: I8f00f67fcd9db825b74ed98b260b8c720e17f653
Change-Id: Idff43d522274474a6b719548eaefcf675fdb66e2
Change-Id: Ic77f87290905addadd5819dff2d0c62f031022ab
This seems like an addition to the previous PR, although this is not a critical bug. /cherry-pick 1e828f8 |
/pull-request #86114 |
#77678 introduced regressions of the following type: LoadStoreTys had the better candidate type, but it was not selected since considering LoadStoreTys was deferred until after full consideration of CandidateTys (those that are the same size as the partition).
See: https://godbolt.org/z/G5sxbf1sx
Since the issue 77678 tried to resolve only required to deferring consideration of ptr or ptr vector types, this patch implements that (and resolves the class of regressions).