Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SLP][REVEC] Fix the case where the number of elements in the mask of a ShuffleVectorInst is not a power of 2. #119689

Merged

Conversation

HanKuanChen
Copy link
Contributor

The following shufflevector instructions should not be vectorized when
slp-vectorize-non-power-of-2 is enabled.

shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>

Fix the case where the number of elements in the mask of a ShuffleVectorInst is not a power of 2.

The following shufflevector instructions should not be vectorized when
slp-vectorize-non-power-of-2 is enabled.

shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
@llvmbot
Copy link
Member

llvmbot commented Dec 12, 2024

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Han-Kuan Chen (HanKuanChen)

Changes

The following shufflevector instructions should not be vectorized when
slp-vectorize-non-power-of-2 is enabled.

shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
shufflevector <8 x float> %1, <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>


Full diff: https://github.com/llvm/llvm-project/pull/119689.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+2)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll (+98-1)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a1d7515f031cfc..611a42eb0b764f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -344,6 +344,8 @@ static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
   unsigned SVNumElements =
       cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
   unsigned ShuffleMaskSize = SV->getShuffleMask().size();
+  if (SVNumElements % ShuffleMaskSize != 0)
+    return 0;
   unsigned GroupSize = SVNumElements / ShuffleMaskSize;
   if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
     return 0;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index b312688b7932dc..61ff4f5766d309 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck --check-prefixes=CHECK,POWEROF2 %s
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck --check-prefixes=CHECK,NONPOWEROF2 %s
 
 define i32 @test() {
 ; CHECK-LABEL: @test(
@@ -134,3 +135,99 @@ for.body:
   %6 = select <2 x i1> %4, <2 x float> %3, <2 x float> zeroinitializer
   br label %for.cond.cleanup
 }
+
+define ptr @test4() {
+; POWEROF2-LABEL: @test4(
+; POWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
+; POWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; POWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 0>
+; POWEROF2-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
+; POWEROF2-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
+; POWEROF2-NEXT:    br label [[TMP8:%.*]]
+; POWEROF2:       7:
+; POWEROF2-NEXT:    br label [[TMP8]]
+; POWEROF2:       8:
+; POWEROF2-NEXT:    [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
+; POWEROF2-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2-NEXT:    br label [[TMP11:%.*]]
+; POWEROF2:       11:
+; POWEROF2-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
+; POWEROF2-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
+; POWEROF2-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
+; POWEROF2-NEXT:    [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
+; POWEROF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]]
+; POWEROF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
+; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
+; POWEROF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; POWEROF2-NEXT:    [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
+; POWEROF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; POWEROF2-NEXT:    [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
+; POWEROF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; POWEROF2-NEXT:    [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
+; POWEROF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; POWEROF2-NEXT:    [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT:    [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
+; POWEROF2-NEXT:    [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
+; POWEROF2-NEXT:    ret ptr null
+;
+; NONPOWEROF2-LABEL: @test4(
+; NONPOWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
+; NONPOWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NONPOWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; NONPOWEROF2-NEXT:    [[TMP4:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> poison, <3 x float> [[TMP2]], i64 0)
+; NONPOWEROF2-NEXT:    [[TMP5:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> [[TMP4]], <3 x float> [[TMP3]], i64 3)
+; NONPOWEROF2-NEXT:    br label [[TMP7:%.*]]
+; NONPOWEROF2:       6:
+; NONPOWEROF2-NEXT:    br label [[TMP7]]
+; NONPOWEROF2:       7:
+; NONPOWEROF2-NEXT:    [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
+; NONPOWEROF2-NEXT:    br label [[TMP9:%.*]]
+; NONPOWEROF2:       9:
+; NONPOWEROF2-NEXT:    [[TMP10:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 0)
+; NONPOWEROF2-NEXT:    [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
+; NONPOWEROF2-NEXT:    [[TMP12:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 3)
+; NONPOWEROF2-NEXT:    [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
+; NONPOWEROF2-NEXT:    [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
+; NONPOWEROF2-NEXT:    [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
+; NONPOWEROF2-NEXT:    [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
+; NONPOWEROF2-NEXT:    [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
+; NONPOWEROF2-NEXT:    ret ptr null
+;
+  %1 = fadd <8 x float> zeroinitializer, zeroinitializer
+  %2 = extractelement <8 x float> %1, i64 0
+  %3 = extractelement <8 x float> %1, i64 1
+  %4 = extractelement <8 x float> %1, i64 2
+  %5 = extractelement <8 x float> %1, i64 4
+  %6 = extractelement <8 x float> %1, i64 5
+  %7 = extractelement <8 x float> %1, i64 6
+  br label %9
+
+8:
+  br label %9
+
+9:
+  %10 = phi float [ 0.000000e+00, %8 ], [ %7, %0 ]
+  %11 = phi float [ 0.000000e+00, %8 ], [ %6, %0 ]
+  %12 = phi float [ 0.000000e+00, %8 ], [ %5, %0 ]
+  %13 = phi float [ 0.000000e+00, %8 ], [ %4, %0 ]
+  %14 = phi float [ 0.000000e+00, %8 ], [ %3, %0 ]
+  %15 = phi float [ 0.000000e+00, %8 ], [ %2, %0 ]
+  br label %16
+
+16:
+  %17 = fmul float %14, 0.000000e+00
+  %18 = fmul float 0.000000e+00, %11
+  %19 = fmul float 0.000000e+00, %15
+  %20 = fmul float %12, 0.000000e+00
+  %21 = fadd reassoc nsz float %17, %19
+  %22 = fadd reassoc nsz float %18, %20
+  %23 = fmul float %13, 0.000000e+00
+  %24 = fmul float %10, 0.000000e+00
+  %25 = fadd reassoc nsz float %21, %23
+  %26 = fadd reassoc nsz float %22, %24
+  %27 = tail call float @llvm.sqrt.f32(float %25)
+  %28 = tail call float @llvm.sqrt.f32(float %26)
+  ret ptr null
+}

@HanKuanChen HanKuanChen merged commit 2546ae4 into llvm:main Dec 12, 2024
11 checks passed
@HanKuanChen HanKuanChen deleted the slp-revec-non-power-of-2-shufflevector branch December 12, 2024 18:22
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants