Skip to content

Commit

Permalink
[SLP] Vectorize non-power-of-2 ops with padding.
Browse files Browse the repository at this point in the history
This patch introduces a new VectorizeWithPadding node type for root and
leaf nodes to allow vectorizing loads/stores with a non-power-of-2 number
of elements.

VectorizeWithPadding load nodes will pad the result to the next power of 2
with poison elements.

Non-leaf nodes will operate on normal power-of-2 vectors. For those
non-leaf nodes, we still track the number of padding elements needed to
go to the next power-of-2, to be used in various places, like cost
computation.

VectorizeWithPadding store nodes strip away the padding elements and
store the non-power-of-2 number of data elements.

Note that re-ordering and shuffling are not yet implemented for nodes
requiring padding, to keep the initial implementation simpler.

The initial implementation also only tries to vectorize with padding if
the original number of elements + 1 is a power-of-2, i.e. if only a single
padding element is needed.

The feature is guarded by a new flag, off by default for now.
  • Loading branch information
fhahn committed Jan 11, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 3b3da7c commit 2e095fd
Showing 7 changed files with 807 additions and 348 deletions.
281 changes: 233 additions & 48 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Large diffs are not rendered by default.

90 changes: 59 additions & 31 deletions llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll
Original file line number Diff line number Diff line change
@@ -1,35 +1,65 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-with-padding -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=PADDING %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-with-padding=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NO-PADDING %s

define void @v15_load_i8_mul_by_constant_store(ptr %src, ptr noalias %dst) {
; CHECK-LABEL: define void @v15_load_i8_mul_by_constant_store(
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
; CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
; CHECK-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
; CHECK-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
; CHECK-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
; CHECK-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
; CHECK-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
; CHECK-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
; CHECK-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
; CHECK-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
; CHECK-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
; CHECK-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
; CHECK-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
; CHECK-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
; CHECK-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
; CHECK-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
; CHECK-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
; CHECK-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
; CHECK-NEXT: ret void
; PADDING-LABEL: define void @v15_load_i8_mul_by_constant_store(
; PADDING-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; PADDING-NEXT: entry:
; PADDING-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
; PADDING-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
; PADDING-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
; PADDING-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
; PADDING-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
; PADDING-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
; PADDING-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
; PADDING-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
; PADDING-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
; PADDING-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
; PADDING-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
; PADDING-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
; PADDING-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
; PADDING-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
; PADDING-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
; PADDING-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
; PADDING-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
; PADDING-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
; PADDING-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
; PADDING-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
; PADDING-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
; PADDING-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
; PADDING-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
; PADDING-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
; PADDING-NEXT: ret void
;
; NO-PADDING-LABEL: define void @v15_load_i8_mul_by_constant_store(
; NO-PADDING-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; NO-PADDING-NEXT: entry:
; NO-PADDING-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
; NO-PADDING-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
; NO-PADDING-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
; NO-PADDING-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
; NO-PADDING-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
; NO-PADDING-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
; NO-PADDING-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
; NO-PADDING-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
; NO-PADDING-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
; NO-PADDING-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
; NO-PADDING-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
; NO-PADDING-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
; NO-PADDING-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
; NO-PADDING-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
; NO-PADDING-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
; NO-PADDING-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
; NO-PADDING-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
; NO-PADDING-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
; NO-PADDING-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
; NO-PADDING-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
; NO-PADDING-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
; NO-PADDING-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
; NO-PADDING-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
; NO-PADDING-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
; NO-PADDING-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds i8, ptr %src, i8 0
@@ -123,5 +153,3 @@ entry:

ret void
}


Loading

0 comments on commit 2e095fd

Please sign in to comment.