[AArch64] Sink vscale calls into loops for better isel (#70304)
For more recent SVE-capable CPUs it is beneficial to use the inc* instructions to increment a value by vscale (potentially shifted or multiplied), even in short loops.

This patch tells CodeGenPrepare to sink appropriate vscale calls into the blocks where they are used, so that isel can match them.
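
As a rough illustration (a hand-written sketch, not IR taken from the patch; %vs, %step, %iv and %n are made-up names), the scalable step is typically hoisted out of the loop before this change:

  entry:
    %vs   = call i64 @llvm.vscale.i64()
    %step = shl i64 %vs, 2                  ; vscale * 4
    br label %loop

  loop:
    %iv      = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %iv.next = add i64 %iv, %step           ; %step is defined in another block
    %done    = icmp eq i64 %iv.next, %n
    br i1 %done, label %exit, label %loop

Because instruction selection works one basic block at a time, it cannot fold %step into the add while the vscale computation sits in the entry block; once the vscale and shl are duplicated next to their user, the increment can be matched as a single inc*/addvl-style instruction instead of keeping the step live in a register. The new test file sve2-vscale-sinking.ll below exercises loops of exactly this shape.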
huntergr-arm authored Nov 7, 2023
1 parent fd48044 commit a850dbc
Showing 6 changed files with 229 additions and 33 deletions.
1 change: 0 additions & 1 deletion llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8230,7 +8230,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
    if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
      return true;
    }
    return false;
  }

  if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
29 changes: 29 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14552,6 +14552,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
  return true;
}

/// We want to sink following cases:
/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
  if (match(Op, m_VScale()))
    return true;
  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
    return true;
  }
  return false;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14668,6 +14681,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
    }
  }

  // Sink vscales closer to uses for better isel
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::Add:
  case Instruction::Sub:
    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
        Ops.push_back(&I->getOperandUse(Op));
        return true;
      }
    }
    break;
  default:
    break;
  }

  if (!I->getType()->isVectorTy())
    return false;

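Together with the CodeGenPrepare change above, which drops the early return so that GEP instructions reach the generic operand-sinking code later in optimizeInst, the hook now reports operands like the following as sinkable (again a sketch with made-up names, not IR from the tests):

  entry:
    %vs  = call i64 @llvm.vscale.i64()
    %off = shl i64 %vs, 4                   ; vscale * 16, matches m_Shl(m_VScale(), m_ConstantInt())
    br label %loop

  loop:
    %p      = phi ptr [ %base, %entry ], [ %p.next, %loop ]
    %p.next = getelementptr inbounds i8, ptr %p, i64 %off
    ...

Once %vs and %off are duplicated into %loop next to the getelementptr, the backend can fold the address update into a single scalable increment (for example an addvl/incb-style form) rather than keeping vscale * 16 live in a register across the loop; the gep test in sve2-vscale-sinking.ll below checks exactly this duplication at the IR level.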
@@ -18,10 +18,10 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cntd x9
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: neg x9, x9
; CHECK-NEXT: mov w10, #100 // =0x64
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: and x10, x9, x10
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: adds x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -106,11 +106,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: fmov d2, #2.00000000
; CHECK-NEXT: cntd x9
; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: mov w10, #100 // =0x64
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: neg x9, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: and x10, x9, x10
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
; CHECK-NEXT: mov z1.d, p0/m, z2.d
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: adds x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -193,34 +193,34 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cntw x9
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: mov w11, #1000 // =0x3e8
; CHECK-NEXT: rdvl x13, #2
; CHECK-NEXT: neg x9, x9
; CHECK-NEXT: mov w10, #1000 // =0x3e8
; CHECK-NEXT: rdvl x12, #2
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: and x10, x9, x10
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: rdvl x11, #4
; CHECK-NEXT: add x12, x1, x13
; CHECK-NEXT: add x13, x0, x13
; CHECK-NEXT: add x11, x1, x12
; CHECK-NEXT: add x12, x0, x12
; CHECK-NEXT: rdvl x13, #4
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x14, x0, x8
; CHECK-NEXT: add x15, x13, x8
; CHECK-NEXT: add x15, x12, x8
; CHECK-NEXT: add x16, x1, x8
; CHECK-NEXT: add x17, x12, x8
; CHECK-NEXT: add x17, x11, x8
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
; CHECK-NEXT: ld1b { z6.b }, p1/z, [x13, x8]
; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
; CHECK-NEXT: ld1d { z7.d }, p0/z, [x15, #1, mul vl]
; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z17.d }, p0/z, [x16, #1, mul vl]
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x12, x8]
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: adds x10, x10, x9
; CHECK-NEXT: add x8, x8, x13
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z4.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z17.d, z5.d, #0
; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
; CHECK-NEXT: b.lt .LBB70_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, w3
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z0.s, #1 // =0x1
; CHECK-NEXT: whilelo p0.s, xzr, x9
; CHECK-NEXT: whilelo p1.s, xzr, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntw x10
; CHECK-NEXT: .LBB70_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
; CHECK-NEXT: mad z1.s, p1/m, z2.s, z0.s
; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
; CHECK-NEXT: ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
; CHECK-NEXT: mad z1.s, p0/m, z2.s, z0.s
; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p0.s, x8, x9
; CHECK-NEXT: whilelo p1.s, x8, x9
; CHECK-NEXT: b.mi .LBB70_2
; CHECK-NEXT: .LBB70_3: // %for.cond.cleanup
; CHECK-NEXT: ret
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
; CHECK-NEXT: whilelt p0.s, wzr, w0
; CHECK-NEXT: b.pl .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, wzr
; CHECK-NEXT: cntw x8
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: cntw x9
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: whilelt p0.s, w9, w0
; CHECK-NEXT: add w9, w9, w8
; CHECK-NEXT: whilelt p0.s, w8, w0
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: b.mi .LBB0_2
; CHECK-NEXT: .LBB0_3: // %exit
; CHECK-NEXT: ret
168 changes: 168 additions & 0 deletions llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -codegenprepare -S -o - %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
; CHECK-LABEL: define void @inc_add
; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
entry:
%wide.trip.count = zext i32 %N to i64
%0 = tail call i64 @llvm.vscale.i64()
%1 = shl nuw nsw i64 %0, 2
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%2 = getelementptr inbounds float, ptr %in1, i64 %index
%wide.load = load <vscale x 4 x float>, ptr %2, align 4
%3 = getelementptr inbounds float, ptr %in2, i64 %index
%wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
%4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
%5 = getelementptr inbounds float, ptr %out, i64 %index
store <vscale x 4 x float> %4, ptr %5, align 4
%index.next = add nuw i64 %index, %1
%6 = icmp eq i64 %index.next, %wide.trip.count
br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
ret void
}

define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
; CHECK-LABEL: define void @dec_sub
; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
; CHECK-NEXT: [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
; CHECK-NEXT: [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
; CHECK-NEXT: [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
; CHECK-NEXT: [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
entry:
%0 = zext i32 %N to i64
%1 = tail call i64 @llvm.vscale.i64()
%2 = shl nuw nsw i64 %1, 2
%3 = sub nsw i64 1, %2
%invariant.gep = getelementptr float, ptr %in1, i64 %3
%invariant.gep20 = getelementptr float, ptr %in2, i64 %3
%invariant.gep22 = getelementptr float, ptr %out, i64 %3
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%offset.idx = sub i64 %0, %index
%gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
%wide.load = load <vscale x 4 x float>, ptr %gep, align 4
%gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
%wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
%4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
%gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
store <vscale x 4 x float> %4, ptr %gep23, align 4
%index.next = add nuw i64 %index, %2
%5 = icmp eq i64 %index.next, %0
br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
ret void
}

define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
; CHECK-LABEL: define void @gep
; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr nocapture noundef writeonly [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4
; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 4
; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 4
; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
; CHECK: for.exit:
; CHECK-NEXT: ret void
;
entry:
%0 = tail call i64 @llvm.vscale.i64()
%1 = shl i64 %0, 4
br label %for.body

for.body: ; preds = %for.body, %for.body.lr.ph.new
%lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
%ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
%add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
%add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
%add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
%add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
%lsr.iv.next = add i32 %lsr.iv, -4
%cmp = icmp eq i32 %lsr.iv.next, 0
br i1 %cmp, label %for.exit, label %for.body

for.exit:
ret void
}

declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)

declare i64 @llvm.vscale.i64()

attributes #0 = { "target-features"="+sve2" }
