diff --git a/llvm/test/CodeGen/AArch64/pr161420.ll b/llvm/test/CodeGen/AArch64/pr161420.ll new file mode 100644 index 0000000000000..515a1bf47cc1e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr161420.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx15.0.0" + +; From: https://github.com/llvm/llvm-project/issues/161420. This test checks that +; two `luti4` instructions are emitted. FIXME: This is currently broken; only one is emitted. +define void @pluto(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) #0 { +; CHECK-LABEL: pluto: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: mov w8, #0 ; =0x0 +; CHECK-NEXT: ldr zt0, [x1] +; CHECK-NEXT: ldr z0, [x3] +; CHECK-NEXT: ptrue pn8.h +; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0] +; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0] +; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z4.h - z7.h }, { z0.h - z3.h } +; CHECK-NEXT: ret +bb: + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg1) + %load = load , ptr %arg3, align 16 + %call = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() + %call4 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %call, ptr %arg) + %extractvalue = extractvalue { , , , } %call4, 0 + %extractvalue5 = extractvalue { , , , } %call4, 1 + %extractvalue6 = extractvalue { , , , } %call4, 2 + %extractvalue7 = extractvalue { , , , } %call4, 3 + %call8 = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, %load, i32 0) + %extractvalue9 = extractvalue { , , , } %call8, 0 + %extractvalue10 = extractvalue { , , , } %call8, 1 + %extractvalue11 = extractvalue { , , , } %call8, 2 + %extractvalue12 = extractvalue { , , , } %call8, 3 + tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 0, %extractvalue, %extractvalue5, %extractvalue6, %extractvalue7, %extractvalue9, 
%extractvalue10, %extractvalue11, %extractvalue12) + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg2) + %call13 = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, %load, i32 0) + %extractvalue14 = extractvalue { , , , } %call13, 0 + %extractvalue15 = extractvalue { , , , } %call13, 1 + %extractvalue16 = extractvalue { , , , } %call13, 2 + %extractvalue17 = extractvalue { , , , } %call13, 3 + tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 2, %extractvalue, %extractvalue5, %extractvalue6, %extractvalue7, %extractvalue14, %extractvalue15, %extractvalue16, %extractvalue17) + ret void +} + +declare void @llvm.aarch64.sme.ldr.zt(i32, ptr) +declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 immarg, , i32 immarg) +declare void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32, , , , , , , , ) + +attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn uwtable(sync) "aarch64_inout_za" "aarch64_inout_zt0" "aarch64_pstate_sm_enabled" "target-cpu"="apple-m1" "target-features"="+fp-armv8,+lse,+neon,+sme,+sme-f16f16,+sme2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a" } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll index 92d3e1182bf34..cf306e5238018 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll @@ -48,6 +48,24 @@ define {, , , , , , } %res } +; Tests multiple identical luti4 intrinsics with ZT0 loads interspersed, are not CSE'd. +; FIXME: This is currently broken; only one luti4 is emitted. 
+define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, %x) { +; CHECK-LABEL: test_multiple_luti4_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1] +; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3 +; CHECK-NEXT: ret + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrA) + %res1 = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, %x, i32 1) + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrB) + %res2 = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, %x, i32 1) + + call void (...) @llvm.fake.use({, , , } %res1) + call void (...) @llvm.fake.use({, , , } %res2) + ret void +} + declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32, , i32) declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32, , i32) declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8bf16(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll index 778f31194baf4..0024b70bd7c8f 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll @@ -14,4 +14,24 @@ define {, , , , , , } %res } +; Tests multiple identical luti4 intrinsics with ZT0 loads interspersed, are not CSE'd. +; FIXME: This is currently broken; only one luti4 is emitted. 
+define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, %v0, %v1) #0 { +; CHECK-LABEL: test_multiple_luti4_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 } +; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3 +; CHECK-NEXT: ret + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrA) + %res1 = call {, , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, %v0, %v1) + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrB) + %res2 = call {, , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, %v0, %v1) + + call void (...) @llvm.fake.use({ , , , } %res1) + call void (...) @llvm.fake.use({ , , , } %res2) + ret void +} + attributes #0 = { "target-features"="+sme2,+sme-lutv2"}