-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP][REVEC] Fix the number of elements in the mask of a ShuffleVectorInst is not a power of 2. #119689
[SLP][REVEC] Fix the number of elements in the mask of a ShuffleVectorInst is not a power of 2. #119689
Conversation
ShuffleVectorInst is not a power of 2. The following shufflevector should not be vectorized when slp-vectorize-non-power-of-2 is enabled. shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Han-Kuan Chen (HanKuanChen) ChangesThe following shufflevector should not be vectorized when shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> Full diff: https://github.com/llvm/llvm-project/pull/119689.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a1d7515f031cfc..611a42eb0b764f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -344,6 +344,8 @@ static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
unsigned SVNumElements =
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
unsigned ShuffleMaskSize = SV->getShuffleMask().size();
+ if (SVNumElements % ShuffleMaskSize != 0)
+ return 0;
unsigned GroupSize = SVNumElements / ShuffleMaskSize;
if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
return 0;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index b312688b7932dc..61ff4f5766d309 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck --check-prefixes=CHECK,POWEROF2 %s
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck --check-prefixes=CHECK,NONPOWEROF2 %s
define i32 @test() {
; CHECK-LABEL: @test(
@@ -134,3 +135,99 @@ for.body:
%6 = select <2 x i1> %4, <2 x float> %3, <2 x float> zeroinitializer
br label %for.cond.cleanup
}
+
+define ptr @test4() {
+; POWEROF2-LABEL: @test4(
+; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
+; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 0>
+; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
+; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
+; POWEROF2-NEXT: br label [[TMP8:%.*]]
+; POWEROF2: 7:
+; POWEROF2-NEXT: br label [[TMP8]]
+; POWEROF2: 8:
+; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
+; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2-NEXT: br label [[TMP11:%.*]]
+; POWEROF2: 11:
+; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
+; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
+; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
+; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
+; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]]
+; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
+; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
+; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
+; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
+; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
+; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
+; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
+; POWEROF2-NEXT: ret ptr null
+;
+; NONPOWEROF2-LABEL: @test4(
+; NONPOWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
+; NONPOWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NONPOWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; NONPOWEROF2-NEXT: [[TMP4:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> poison, <3 x float> [[TMP2]], i64 0)
+; NONPOWEROF2-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> [[TMP4]], <3 x float> [[TMP3]], i64 3)
+; NONPOWEROF2-NEXT: br label [[TMP7:%.*]]
+; NONPOWEROF2: 6:
+; NONPOWEROF2-NEXT: br label [[TMP7]]
+; NONPOWEROF2: 7:
+; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
+; NONPOWEROF2-NEXT: br label [[TMP9:%.*]]
+; NONPOWEROF2: 9:
+; NONPOWEROF2-NEXT: [[TMP10:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 0)
+; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
+; NONPOWEROF2-NEXT: [[TMP12:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 3)
+; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
+; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
+; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
+; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
+; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
+; NONPOWEROF2-NEXT: ret ptr null
+;
+ %1 = fadd <8 x float> zeroinitializer, zeroinitializer
+ %2 = extractelement <8 x float> %1, i64 0
+ %3 = extractelement <8 x float> %1, i64 1
+ %4 = extractelement <8 x float> %1, i64 2
+ %5 = extractelement <8 x float> %1, i64 4
+ %6 = extractelement <8 x float> %1, i64 5
+ %7 = extractelement <8 x float> %1, i64 6
+ br label %9
+
+8:
+ br label %9
+
+9:
+ %10 = phi float [ 0.000000e+00, %8 ], [ %7, %0 ]
+ %11 = phi float [ 0.000000e+00, %8 ], [ %6, %0 ]
+ %12 = phi float [ 0.000000e+00, %8 ], [ %5, %0 ]
+ %13 = phi float [ 0.000000e+00, %8 ], [ %4, %0 ]
+ %14 = phi float [ 0.000000e+00, %8 ], [ %3, %0 ]
+ %15 = phi float [ 0.000000e+00, %8 ], [ %2, %0 ]
+ br label %16
+
+16:
+ %17 = fmul float %14, 0.000000e+00
+ %18 = fmul float 0.000000e+00, %11
+ %19 = fmul float 0.000000e+00, %15
+ %20 = fmul float %12, 0.000000e+00
+ %21 = fadd reassoc nsz float %17, %19
+ %22 = fadd reassoc nsz float %18, %20
+ %23 = fmul float %13, 0.000000e+00
+ %24 = fmul float %10, 0.000000e+00
+ %25 = fadd reassoc nsz float %21, %23
+ %26 = fadd reassoc nsz float %22, %24
+ %27 = tail call float @llvm.sqrt.f32(float %25)
+ %28 = tail call float @llvm.sqrt.f32(float %26)
+ ret ptr null
+}
|
The following shufflevector should not be vectorized when
slp-vectorize-non-power-of-2 is enabled.
shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>