forked from llvm/llvm-project
-
Notifications
You must be signed in to change notification settings - Fork 333
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
In MVETailPredication, clear the vector before running on a new loop. Differential Revision: https://reviews.llvm.org/D73048
- Loading branch information
1 parent
45538b5
commit c04b9ba
Showing
2 changed files
with
146 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
145 changes: 145 additions & 0 deletions
145
llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py | ||
; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve.fp -mve-tail-predication -disable-mve-tail-predication=false %s -S -o - | FileCheck %s | ||
|
||
; Test function containing two separate vector loops that are reached from the same | ||
; preheader block (for.body.lr.ph) depending on whether %x is zero: | ||
;   * vector.body   - masked store of a splat of %x into %d. | ||
;   * vector.body75 - masked loads from %s1 and %s2, unsigned saturating subtract, | ||
;                     masked store of the result into %d. | ||
; Both loops build their lane mask as (index + <0,1,2,3>) ule splat(%n - 1). | ||
; In the expected output below, only vector.body has its mask replaced by a call to | ||
; @llvm.arm.mve.vctp32 driven by an element counter that starts at %n and drops by 4 | ||
; per iteration; vector.body75 keeps its original icmp-based mask. | ||
; NOTE(review): presumably this guards against per-loop state (the collected masked | ||
; instructions) leaking from one loop into the next inside the tail-predication pass - | ||
; confirm against the commit description. | ||
define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* noalias nocapture readonly %s2, i32 %x, i32* noalias nocapture %d, i32 %n) { | ||
; CHECK-LABEL: @_Z4loopPiPjiS0_i( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[CMP63:%.*]] = icmp sgt i32 [[N:%.*]], 0 | ||
; CHECK-NEXT: br i1 [[CMP63]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] | ||
; CHECK: for.body.lr.ph: | ||
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 | ||
; CHECK-NEXT: [[N_RND_UP77:%.*]] = add nuw i32 [[N]], 3 | ||
; CHECK-NEXT: [[N_VEC79:%.*]] = and i32 [[N_RND_UP77]], -4 | ||
; CHECK-NEXT: [[TRIP_COUNT_MINUS_183:%.*]] = add nsw i32 [[N]], -1 | ||
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC79]], -4 | ||
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 | ||
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 | ||
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], 1 | ||
; CHECK-NEXT: br i1 [[TOBOOL]], label [[VECTOR_BODY75_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] | ||
; CHECK: vector.body75.preheader: | ||
; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) | ||
; CHECK-NEXT: br label [[VECTOR_BODY75:%.*]] | ||
; CHECK: vector.ph: | ||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0 | ||
; CHECK-NEXT: [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer | ||
; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP3]]) | ||
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] | ||
; CHECK: vector.body: | ||
; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ] | ||
; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] | ||
; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] | ||
; CHECK-NEXT: [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>* | ||
; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP5]]) | ||
; CHECK-NEXT: [[TMP7]] = sub i32 [[TMP5]], 4 | ||
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT72]], <4 x i32>* [[LSR_IV911]], i32 4, <4 x i1> [[TMP6]]) | ||
; CHECK-NEXT: [[SCEVGEP10]] = getelementptr i32, i32* [[LSR_IV9]], i32 4 | ||
; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP4]], i32 1) | ||
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 | ||
; CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]] | ||
; CHECK: vector.body75: | ||
; CHECK-NEXT: [[LSR_IV6:%.*]] = phi i32* [ [[S1:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP7:%.*]], [[VECTOR_BODY75]] ] | ||
; CHECK-NEXT: [[LSR_IV3:%.*]] = phi i32* [ [[S2:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP4:%.*]], [[VECTOR_BODY75]] ] | ||
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[D]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[VECTOR_BODY75]] ] | ||
; CHECK-NEXT: [[INDEX80:%.*]] = phi i32 [ [[INDEX_NEXT81:%.*]], [[VECTOR_BODY75]] ], [ 0, [[VECTOR_BODY75_PREHEADER]] ] | ||
; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP15:%.*]], [[VECTOR_BODY75]] ] | ||
; CHECK-NEXT: [[LSR_IV68:%.*]] = bitcast i32* [[LSR_IV6]] to <4 x i32>* | ||
; CHECK-NEXT: [[LSR_IV35:%.*]] = bitcast i32* [[LSR_IV3]] to <4 x i32>* | ||
; CHECK-NEXT: [[LSR_IV2:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* | ||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT84:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX80]], i32 0 | ||
; CHECK-NEXT: [[BROADCAST_SPLAT85:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT84]], <4 x i32> undef, <4 x i32> zeroinitializer | ||
; CHECK-NEXT: [[INDUCTION86:%.*]] = add <4 x i32> [[BROADCAST_SPLAT85]], <i32 0, i32 1, i32 2, i32 3> | ||
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 | ||
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> zeroinitializer | ||
; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP12]] | ||
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV68]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef) | ||
; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV35]], i32 4, <4 x i1> [[TMP13]], <4 x i32> undef) | ||
; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> [[WIDE_MASKED_LOAD]]) | ||
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP14]], <4 x i32>* [[LSR_IV2]], i32 4, <4 x i1> [[TMP13]]) | ||
; CHECK-NEXT: [[INDEX_NEXT81]] = add i32 [[INDEX80]], 4 | ||
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 | ||
; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i32, i32* [[LSR_IV3]], i32 4 | ||
; CHECK-NEXT: [[SCEVGEP7]] = getelementptr i32, i32* [[LSR_IV6]], i32 4 | ||
; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP10]], i32 1) | ||
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 | ||
; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY75]], label [[FOR_COND_CLEANUP]] | ||
; CHECK: for.cond.cleanup: | ||
; CHECK-NEXT: ret i32 0 | ||
; | ||
; Skip both loops entirely unless %n is strictly positive (signed compare). | ||
entry: | ||
%cmp63 = icmp sgt i32 %n, 0 | ||
br i1 %cmp63, label %for.body.lr.ph, label %for.cond.cleanup | ||
|
||
; Shared setup for both loops: %n rounded up to a multiple of 4, %n - 1 (the last valid | ||
; element index used by the lane masks), and the iteration count ((n.vec - 4) >> 2) + 1, | ||
; which is computed twice as %2 and %3, one copy per loop. %x == 0 selects vector.body75. | ||
for.body.lr.ph: ; preds = %entry | ||
%tobool = icmp eq i32 %x, 0 | ||
%n.rnd.up77 = add nuw i32 %n, 3 | ||
%n.vec79 = and i32 %n.rnd.up77, -4 | ||
%trip.count.minus.183 = add nsw i32 %n, -1 | ||
%0 = add i32 %n.vec79, -4 | ||
%1 = lshr i32 %0, 2 | ||
%2 = add nuw nsw i32 %1, 1 | ||
%3 = add nuw nsw i32 %1, 1 | ||
br i1 %tobool, label %vector.body75.preheader, label %vector.ph | ||
|
||
; Preheader of the load/usub.sat/store loop: sets its iteration count to %2. | ||
vector.body75.preheader: ; preds = %for.body.lr.ph | ||
call void @llvm.set.loop.iterations.i32(i32 %2) | ||
br label %vector.body75 | ||
|
||
; Preheader of the splat-store loop: broadcasts %x to all four lanes and sets the | ||
; iteration count to %3. | ||
vector.ph: ; preds = %for.body.lr.ph | ||
%broadcast.splatinsert71 = insertelement <4 x i32> undef, i32 %x, i32 0 | ||
%broadcast.splat72 = shufflevector <4 x i32> %broadcast.splatinsert71, <4 x i32> undef, <4 x i32> zeroinitializer | ||
call void @llvm.set.loop.iterations.i32(i32 %3) | ||
br label %vector.body | ||
|
||
; First loop: stores the splat of %x to %d, four i32 lanes per iteration, with lanes | ||
; enabled while (%index + lane) ule (%n - 1). The expected output above drops the | ||
; %index/%induction/icmp chain here and feeds the store from @llvm.arm.mve.vctp32. | ||
vector.body: ; preds = %vector.body, %vector.ph | ||
%lsr.iv9 = phi i32* [ %scevgep10, %vector.body ], [ %d, %vector.ph ] | ||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] | ||
%4 = phi i32 [ %3, %vector.ph ], [ %8, %vector.body ] | ||
%lsr.iv911 = bitcast i32* %lsr.iv9 to <4 x i32>* | ||
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 | ||
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> | ||
%5 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0 | ||
%6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%7 = icmp ule <4 x i32> %induction, %6 | ||
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %broadcast.splat72, <4 x i32>* %lsr.iv911, i32 4, <4 x i1> %7) | ||
%index.next = add i32 %index, 4 | ||
%scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 4 | ||
%8 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %4, i32 1) | ||
%9 = icmp ne i32 %8, 0 | ||
br i1 %9, label %vector.body, label %for.cond.cleanup | ||
|
||
; Second loop: masked loads from %s1 and %s2, usub.sat(%s2 lanes, %s1 lanes), masked | ||
; store to %d, all under the same style of icmp-ule lane mask (%13). The expected | ||
; output above leaves this loop's mask computation unchanged (no vctp32 call). | ||
vector.body75: ; preds = %vector.body75, %vector.body75.preheader | ||
%lsr.iv6 = phi i32* [ %s1, %vector.body75.preheader ], [ %scevgep7, %vector.body75 ] | ||
%lsr.iv3 = phi i32* [ %s2, %vector.body75.preheader ], [ %scevgep4, %vector.body75 ] | ||
%lsr.iv = phi i32* [ %d, %vector.body75.preheader ], [ %scevgep, %vector.body75 ] | ||
%index80 = phi i32 [ %index.next81, %vector.body75 ], [ 0, %vector.body75.preheader ] | ||
%10 = phi i32 [ %2, %vector.body75.preheader ], [ %15, %vector.body75 ] | ||
%lsr.iv68 = bitcast i32* %lsr.iv6 to <4 x i32>* | ||
%lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>* | ||
%lsr.iv2 = bitcast i32* %lsr.iv to <4 x i32>* | ||
%broadcast.splatinsert84 = insertelement <4 x i32> undef, i32 %index80, i32 0 | ||
%broadcast.splat85 = shufflevector <4 x i32> %broadcast.splatinsert84, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%induction86 = add <4 x i32> %broadcast.splat85, <i32 0, i32 1, i32 2, i32 3> | ||
%11 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0 | ||
%12 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%13 = icmp ule <4 x i32> %induction86, %12 | ||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv68, i32 4, <4 x i1> %13, <4 x i32> undef) | ||
%wide.masked.load89 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv35, i32 4, <4 x i1> %13, <4 x i32> undef) | ||
%14 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %wide.masked.load89, <4 x i32> %wide.masked.load) | ||
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %14, <4 x i32>* %lsr.iv2, i32 4, <4 x i1> %13) | ||
%index.next81 = add i32 %index80, 4 | ||
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4 | ||
%scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 4 | ||
%scevgep7 = getelementptr i32, i32* %lsr.iv6, i32 4 | ||
%15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %10, i32 1) | ||
%16 = icmp ne i32 %15, 0 | ||
br i1 %16, label %vector.body75, label %for.cond.cleanup | ||
|
||
; Common exit for the early-out path and both loops; the function always returns 0. | ||
for.cond.cleanup: ; preds = %vector.body, %vector.body75, %entry | ||
ret i32 0 | ||
} | ||
; Intrinsics called by the input IR in this file: masked store/load on <4 x i32>, | ||
; unsigned saturating subtract, and the loop-counter intrinsics (set.loop.iterations / | ||
; loop.decrement.reg). @llvm.arm.mve.vctp32 is not declared here; it appears only in | ||
; the expected output. | ||
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) | ||
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) | ||
declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) | ||
declare void @llvm.set.loop.iterations.i32(i32) | ||
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) |