Skip to content

Commit de3445e

Browse files
aryansw authored and MatzeB committed
[SROA] Create additional vector type candidates based on store and load slices
This patch adds additional vector types to be considered when doing promotion in SROA, based on the types of the store and load slices. This provides more promotion opportunities, by potentially using an optimal "intermediate" vector type.

For example, the following code would currently not be promoted to a vector, since `__m128i` is a `<2 x i64>` vector.

```
__m128i packfoo0(int a, int b, int c, int d) {
  int r[4] = {a, b, c, d};
  __m128i rm;
  std::memcpy(&rm, r, sizeof(rm));
  return rm;
}
```

```
packfoo0(int, int, int, int):
        mov     dword ptr [rsp - 24], edi
        mov     dword ptr [rsp - 20], esi
        mov     dword ptr [rsp - 16], edx
        mov     dword ptr [rsp - 12], ecx
        movaps  xmm0, xmmword ptr [rsp - 24]
        ret
```

By also considering the types of the elements, we could find that the `<4 x i32>` type would be valid for promotion, hence removing the memory accesses for this function.

In other words, we can explore other new vector types, with the same size but different element types based on the load and store instructions from the Slices, which can provide us more promotion opportunities.

Additionally, the step for removing duplicate elements from the `CandidateTys` vector was not using an equality comparator, which has been fixed.

Differential Revision: https://reviews.llvm.org/D132096
1 parent d96f526 commit de3445e

File tree

2 files changed

+96
-7
lines changed

2 files changed

+96
-7
lines changed

llvm/lib/Transforms/Scalar/SROA.cpp

+29-3
Original file line numberDiff line numberDiff line change
@@ -1922,6 +1922,28 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
19221922
if (CandidateTys.empty())
19231923
return nullptr;
19241924

1925+
// Generate new candidate type based on load/store size.
1926+
for (const Slice &S : P) {
1927+
Type *Ty;
1928+
if (LoadInst *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
1929+
Ty = LI->getType();
1930+
else if (StoreInst *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
1931+
Ty = SI->getValueOperand()->getType();
1932+
else
1933+
continue;
1934+
if (isa<VectorType>(Ty))
1935+
continue;
1936+
// Create Vector with size of V, and each element of type Ty
1937+
VectorType *V = CandidateTys[0];
1938+
uint64_t ElementSize = DL.getTypeStoreSizeInBits(Ty).getFixedSize();
1939+
uint64_t VectorSize = DL.getTypeSizeInBits(V).getFixedSize();
1940+
if ((ElementSize != VectorSize) && (VectorSize % ElementSize == 0)) {
1941+
VectorType *VTy = VectorType::get(Ty, VectorSize / ElementSize, false);
1942+
CandidateTys.push_back(VTy);
1943+
if (CommonEltTy != Ty)
1944+
HaveCommonEltTy = false;
1945+
}
1946+
}
19251947
// Remove non-integer vector types if we had multiple common element types.
19261948
// FIXME: It'd be nice to replace them with integer vector types, but we can't
19271949
// do that until all the backends are known to produce good code for all
@@ -1949,10 +1971,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
19491971
return cast<FixedVectorType>(RHSTy)->getNumElements() <
19501972
cast<FixedVectorType>(LHSTy)->getNumElements();
19511973
};
1974+
auto RankVectorTypesEq = [&](VectorType *LHSTy, VectorType *RHSTy) {
1975+
return cast<FixedVectorType>(LHSTy)->getNumElements() ==
1976+
cast<FixedVectorType>(RHSTy)->getNumElements();
1977+
};
19521978
llvm::sort(CandidateTys, RankVectorTypes);
1953-
CandidateTys.erase(
1954-
std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
1955-
CandidateTys.end());
1979+
CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(),
1980+
RankVectorTypesEq),
1981+
CandidateTys.end());
19561982
} else {
19571983
// The only way to have the same element type in every vector type is to
19581984
// have the same vector type. Check that and remove all but one.

llvm/test/Transforms/SROA/vector-promotion.ll

+67-4
Original file line numberDiff line numberDiff line change
@@ -534,10 +534,9 @@ define <2 x float> @test11(<4 x i16> %x, i32 %y) {
534534
; heuristic for making a deterministic decision.
535535
; CHECK-LABEL: @test11(
536536
; CHECK-NEXT: entry:
537-
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[Y:%.*]] to <2 x i16>
538-
; CHECK-NEXT: [[A_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
539-
; CHECK-NEXT: [[A_SROA_0_4_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i16> [[A_SROA_0_4_VEC_EXPAND]], <4 x i16> [[X:%.*]]
540-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[A_SROA_0_4_VECBLEND]] to <2 x float>
537+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[X:%.*]] to <2 x i32>
538+
; CHECK-NEXT: [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Y:%.*]], i32 1
539+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[A_SROA_0_4_VEC_INSERT]] to <2 x float>
541540
; CHECK-NEXT: ret <2 x float> [[TMP1]]
542541
;
543542
entry:
@@ -565,3 +564,67 @@ define <4 x float> @test12(<4 x i32> %val) {
565564

566565
ret <4 x float> %vec
567566
}
567+
568+
define <2 x i64> @test13(i32 %a, i32 %b, i32 %c, i32 %d) {
569+
; Ensure that we can promote an alloca that needs to be
570+
; cast to a different vector type
571+
; CHECK-LABEL: @test13(
572+
; CHECK-NEXT: entry:
573+
; CHECK-NEXT: [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
574+
; CHECK-NEXT: [[X_SROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_0_VEC_INSERT]], i32 [[B:%.*]], i32 1
575+
; CHECK-NEXT: [[X_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_4_VEC_INSERT]], i32 [[C:%.*]], i32 2
576+
; CHECK-NEXT: [[X_SROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_8_VEC_INSERT]], i32 [[D:%.*]], i32 3
577+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[X_SROA_0_12_VEC_INSERT]] to <2 x i64>
578+
; CHECK-NEXT: ret <2 x i64> [[TMP0]]
579+
;
580+
entry:
581+
%x = alloca [4 x i32]
582+
583+
store i32 %a, ptr %x
584+
%x.tmp2 = getelementptr inbounds i32, ptr %x, i64 1
585+
store i32 %b, ptr %x.tmp2
586+
%x.tmp3 = getelementptr inbounds i32, ptr %x, i64 2
587+
store i32 %c, ptr %x.tmp3
588+
%x.tmp4 = getelementptr inbounds i32, ptr %x, i64 3
589+
store i32 %d, ptr %x.tmp4
590+
591+
592+
%result = load <2 x i64>, ptr %x
593+
594+
ret <2 x i64> %result
595+
}
596+
597+
define i32 @test14(<2 x i64> %x) {
598+
; Ensure that we can promote an alloca that needs to be
599+
; cast to a different vector type
600+
; CHECK-LABEL: @test14(
601+
; CHECK-NEXT: entry:
602+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[X:%.*]] to <4 x i32>
603+
; CHECK-NEXT: [[X_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
604+
; CHECK-NEXT: [[X_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
605+
; CHECK-NEXT: [[X_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
606+
; CHECK-NEXT: [[X_SROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
607+
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X_SROA_0_0_VEC_EXTRACT]], [[X_SROA_0_4_VEC_EXTRACT]]
608+
; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[X_SROA_0_8_VEC_EXTRACT]], [[X_SROA_0_12_VEC_EXTRACT]]
609+
; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD]], [[ADD1]]
610+
; CHECK-NEXT: ret i32 [[ADD2]]
611+
;
612+
entry:
613+
614+
%x.addr = alloca <2 x i64>, align 16
615+
store <2 x i64> %x, <2 x i64>* %x.addr, align 16
616+
%x.cast = bitcast <2 x i64>* %x.addr to i32*
617+
618+
%a = load i32, ptr %x.cast
619+
%x.tmp2 = getelementptr inbounds i32, ptr %x.cast, i64 1
620+
%b = load i32, ptr %x.tmp2
621+
%x.tmp3 = getelementptr inbounds i32, ptr %x.cast, i64 2
622+
%c = load i32, ptr %x.tmp3
623+
%x.tmp4 = getelementptr inbounds i32, ptr %x.cast, i64 3
624+
%d = load i32, ptr %x.tmp4
625+
626+
%add = add i32 %a, %b
627+
%add1 = add i32 %c, %d
628+
%add2 = add i32 %add, %add1
629+
ret i32 %add2
630+
}

0 commit comments

Comments
 (0)