From 239abac2a1bbe3d210862968db19f79669380b6d Mon Sep 17 00:00:00 2001
From: Andreu Carminati <andreu.carminati@amd.com>
Date: Mon, 9 Sep 2024 16:21:55 +0100
Subject: [PATCH] [AIEX] Add WAW Sticky Register dep. mutator

Now we can break some WAW dependencies related to sticky status registers
when such registers are neither read nor explicitly written in the region.
---
 llvm/lib/Target/AIE/AIE2RegisterInfo.cpp      |  15 +
 llvm/lib/Target/AIE/AIE2RegisterInfo.h        |   1 +
 llvm/lib/Target/AIE/AIEBaseRegisterInfo.h     |   6 +
 llvm/lib/Target/AIE/AIEBaseSubtarget.cpp      | 107 ++++-
 .../AIE/aie2/end-to-end/TahnTemplated-swp.ll  | 449 ++++++++++++++++++
 .../AIE/aie2/schedule/swp/swp-srflags.mir     |  37 +-
 6 files changed, 589 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/CodeGen/AIE/aie2/end-to-end/TahnTemplated-swp.ll
diff --git a/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp b/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp
index 21d592b50ced..6b5936c41f60 100644
--- a/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp
+++ b/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp
@@ -126,6 +126,21 @@ bool AIE2RegisterInfo::isSimplifiableReservedReg(MCRegister PhysReg) const {
                               AIE2::mSRmRegClass.contains(PhysReg));
 }
 
+// Tells whether PhysReg is one of the AIE2 status registers treated as sticky.
+bool AIE2RegisterInfo::isReservedStickyReg(MCRegister PhysReg) const {
+  // Compare raw register numbers; any register not listed is not sticky.
+  const unsigned RegNo = PhysReg;
+  // Status registers carrying a "_uf" / "_of" suffix.
+  const bool IsUfOfReg = RegNo == AIE2::srCompr_uf ||
+                         RegNo == AIE2::srSparse_of ||
+                         RegNo == AIE2::srSRS_of || RegNo == AIE2::srUPS_of;
+  // Status registers carrying a "Flags" suffix.
+  const bool IsFlagsReg = RegNo == AIE2::srF2FFlags ||
+                          RegNo == AIE2::srF2IFlags ||
+                          RegNo == AIE2::srFPFlags;
+  return IsUfOfReg || IsFlagsReg;
+}
+
 const uint32_t *AIE2RegisterInfo::getNoPreservedMask() const {
   return CSR_NoRegs_RegMask;
 }
diff --git a/llvm/lib/Target/AIE/AIE2RegisterInfo.h b/llvm/lib/Target/AIE/AIE2RegisterInfo.h
index 3a076c729fd4..fa170ea87413 100644
--- a/llvm/lib/Target/AIE/AIE2RegisterInfo.h
+++ b/llvm/lib/Target/AIE/AIE2RegisterInfo.h
@@ -81,6 +81,7 @@ struct AIE2RegisterInfo : public AIE2GenRegisterInfo {
   SmallSet<int, 8>
   getCoveringSubRegs(const TargetRegisterClass &RC) const override;
   bool isSimplifiableReservedReg(MCRegister PhysReg) const override;
+  bool isReservedStickyReg(MCRegister PhysReg) const override;
 
   const TargetRegisterClass *get2DIteratorRegClass() const override {
     return &AIE2::eDRegClass;
diff --git a/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h b/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h
index 4a29243cf250..4d617d79f179 100644
--- a/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h
+++ b/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h
@@ -52,6 +52,12 @@ struct AIEBaseRegisterInfo : public TargetRegisterInfo {
     llvm_unreachable("Target didn't implement isVecOrAccRegClass()");
   }
 
+  // Whether the reserved register PhysReg is sticky, i.e. it is written by
+  // implicit defs of instructions and keeps its result(s) (or flag status)
+  // until an explicit reset. Must be overridden: the default is unreachable.
+  virtual bool isReservedStickyReg(MCRegister PhysReg) const {
+    llvm_unreachable("Target didn't implement isReservedStickyReg!");
+  }
 #if 0
   /// Returns a BitVector of the intersection of GPR RegClass
   /// and CalleeSaved Registers
diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
index d7977ba6a1a5..04966c5d0075 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AIEBaseSubtarget.h"
+#include "AIE.h"
 #include "AIE2Subtarget.h"
 #include "AIEBaseRegisterInfo.h"
 #include "AIEInterBlockScheduling.h"
@@ -45,6 +46,14 @@ static cl::opt<bool> EnablePipelinerSchedPropagateIncomingLatencies(
     "aie-pipeliner-propagate-incoming-latencies", cl::Hidden, cl::init(true),
     cl::desc(
         "Move input latency of copy-like instructions to their successors"));
+// Testing options for the WAW sticky-register dependency removal mutator.
+static cl::opt<bool> EnableWAWStickyRegisters(
+    "aie-pipeliner-waw-sticky-registers", cl::Hidden, cl::init(true),
+    cl::desc("Apply sticky registers WAW dependency removal"));
+static cl::opt<unsigned> WAWStickyRegistersMemOpsThreshold(
+    "aie-waw-sticky-register-mem-threshold", cl::Hidden, cl::init(4),
+    cl::desc("Number of memory instructions to enable the register exclusion "
+             "heuristic in WAW sticky registers dep. removal"));
 
 // These are debugging/testing options.
 
@@ -439,6 +448,14 @@ void dumpDependencies(ScheduleDAGInstrs *DAG, SDep::Kind depType,
   }
 }
 
+// Take a snapshot of SU's predecessor edges. Iterating over the returned
+// copy lets the caller add or remove entries of SU.Preds without
+// invalidating the iterators of the loop.
+static SmallVector<SDep, 4> getPreds(SUnit &SU) {
+  SmallVector<SDep, 4> Snapshot(SU.Preds.begin(), SU.Preds.end());
+  return Snapshot;
+}
+
 /// Prevent WAW dependencies on physical register writes. Instructions that
 /// write a register have very limited scheduler freedom. That could be improved
 /// by ignoring the writes that don't reach a read. Algorithm starts with the
@@ -447,13 +464,6 @@ void dumpDependencies(ScheduleDAGInstrs *DAG, SDep::Kind depType,
 class WAWEdges : public ScheduleDAGMutation {
 
   AIEPostRASchedStrategy *Scheduler = nullptr;
-  // Collect all edges in a separate vector. This allows modifying SU.Preds
-  // without invalidating iterators.
-  SmallVector<SDep, 4> getPreds(SUnit &SU) {
-    SmallVector<SDep, 4> Preds;
-    copy(SU.Preds, std::back_inserter(Preds));
-    return Preds;
-  }
   // Updates the dependency to the instruction with last live write of the same
   // register
   void updateOutputDeps(SUnit *SU, Register Reg,
@@ -548,6 +558,87 @@ class MachineSchedWAWEdges : public WAWEdges {
 class SWPWAWEdges : public WAWEdges {
   void apply(ScheduleDAGInstrs *DAG) override { WAWEdges::apply(DAG); }
 };
+
+/// DAG mutation that removes WAW (output) dependencies on sticky status
+/// registers. An output edge on a sticky register is dropped only when, in
+/// the region, that register is never read nor explicitly defined, and the
+/// memory-op heuristic in apply() does not ask to keep its edges.
+class WAWStickyRegistersEdges : public ScheduleDAGMutation {
+  void apply(ScheduleDAGInstrs *DAG) override {
+    MachineFunction &MF = DAG->MF;
+    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+    auto *RI = static_cast<const AIEBaseRegisterInfo *>(TRI);
+
+    // Sticky registers whose output edges must be kept (all bits start clear).
+    BitVector KeepDepRegs(RI->getNumRegs());
+    // Here, we analyze which sticky registers are read or explicitly
+    // redefined. Instructions that only implicitly define such registers
+    // are recorded per register for the heuristic below.
+    std::map<const Register, SmallVector<const MachineInstr *, 16>> RegMIsMap;
+    for (const MachineInstr &MI : make_range(DAG->begin(), DAG->end())) {
+      for (const MachineOperand &MOP : MI.operands()) {
+        if (!MOP.isReg())
+          continue;
+
+        const Register Reg = MOP.getReg();
+        if (!Reg.isPhysical() || !RI->isReservedStickyReg(Reg))
+          continue;
+
+        if ((!MOP.isImplicit() && (MOP.isDef() || MOP.readsReg())) ||
+            (MOP.isImplicit() && MOP.readsReg())) {
+          // Any read, or an explicit def: the edges of Reg must stay.
+          KeepDepRegs.set(Reg);
+        } else if (MOP.isImplicit() && MOP.isDef()) {
+          // Instruction that could have a dependency removal.
+          // We track it because of the next heuristic.
+          RegMIsMap[Reg].push_back(&MI);
+        }
+      }
+    }
+
+    auto IsLoad = [&](const MachineInstr *MI) -> bool { return MI->mayLoad(); };
+    auto IsStore = [&](const MachineInstr *MI) -> bool {
+      return MI->mayStore();
+    };
+
+    // This is the heuristic component. We catch cases where a sticky
+    // register is only defined by a few loads, or only by a few stores,
+    // within the region, e.g.:
+    //   [sequence of instructions not defining sticky regs.]
+    //   VST.CONV ...
+    //   VST.CONV ...
+    // Removing the dependencies between pairs of VST.CONVs would give the
+    // scheduler too much freedom, allowing good but also bad choices, so
+    // the edges of such registers are kept.
+    for (const auto &[Reg, MIs] : RegMIsMap) {
+      // The first thing to test is the tuning parameter: we only consider
+      // cases where the number of memory ops is <= the threshold.
+      if (MIs.size() <= WAWStickyRegistersMemOpsThreshold &&
+          (all_of(MIs, IsLoad) || all_of(MIs, IsStore)))
+        KeepDepRegs.set(Reg);
+    }
+
+    // Finally, drop every output edge on a sticky register that was not
+    // marked above, i.e. a register that is neither read nor explicitly
+    // defined in the region, nor kept by the heuristic.
+    for (SUnit &SU : DAG->SUnits) {
+      for (const SDep &Dep : getPreds(SU)) {
+        if (Dep.getKind() != SDep::Kind::Output)
+          continue;
+
+        Register Reg = Dep.getReg();
+        if (!Reg.isPhysical() || !RI->isReservedStickyReg(Reg))
+          continue;
+
+        if (!KeepDepRegs.test(Reg))
+          SU.removePred(Dep);
+      }
+    }
+
+    LLVM_DEBUG(dumpDependencies(DAG, SDep::Output, "WAW"));
+  }
+};
+
 } // namespace
 
 std::vector<std::unique_ptr<ScheduleDAGMutation>>
@@ -590,6 +681,8 @@ AIEBaseSubtarget::getSMSMutationsImpl(const Triple &TT) {
   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
   if (!TT.isAIE1()) {
     Mutations.emplace_back(std::make_unique<SWPWAWEdges>());
+    if (EnableWAWStickyRegisters)
+      Mutations.emplace_back(std::make_unique<WAWStickyRegistersEdges>());
     if (EnablePipelinerSchedPropagateIncomingLatencies)
       Mutations.emplace_back(std::make_unique<PropagateIncomingLatencies>());
   }
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TahnTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TahnTemplated-swp.ll
new file mode 100644
index 000000000000..98417f324c92
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TahnTemplated-swp.ll
@@ -0,0 +1,449 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+; RUN: llc -O2 -mtriple=aie2 %s -o - | FileCheck %s
+
+; The test is meant as a quick way to spot QoR regressions. In this test, the
+; code can only be pipelined (Pre-SWP) because of the removal of some WAW
+; dependencies related to sticky status registers.
+
+; Function Attrs: nounwind memory(none)
+declare <8 x i64> @llvm.aie2.v16accfloat() #0
+
+; Function Attrs: nounwind memory(none)
+declare <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat) #0
+
+; Function Attrs: nounwind memory(none)
+declare <32 x bfloat> @llvm.aie2.v32bfloat16() #0
+
+; Function Attrs: nounwind memory(none)
+declare <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat>, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: read)
+declare <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat>, <32 x bfloat>, i32) #1
+
+; Function Attrs: nounwind memory(inaccessiblemem: read)
+declare <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat>, <32 x bfloat>, <8 x i64>, i32) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat>, <32 x bfloat>) #2
+
+; Function Attrs: nounwind memory(none)
+declare <16 x bfloat> @llvm.aie2.v16bfloat16() #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64>) #1
+
+; Function Attrs: nounwind memory(none)
+declare <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat>, i32) #0
+
+; Function Attrs: nounwind memory(none)
+declare <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat>) #0
+
+; Function Attrs: nounwind memory(none)
+declare <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat>, <16 x bfloat>, i32) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64>, <8 x i64>, i32) #3
+
+; Function Attrs: nounwind memory(inaccessiblemem: read)
+declare <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat>, <32 x bfloat>, <8 x i64>, i32) #1
+
+; Function Attrs: nounwind memory(none)
+declare <32 x i16> @llvm.aie2.vbroadcast16.I512(i32) #0
+
+; Function Attrs: mustprogress noinline
+define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 32 dereferenceable(64) %params) align 2 {
+; WAW-STICKY-ON-LABEL: TanhTemplated:
+; WAW-STICKY-ON:         .p2align 4
+; WAW-STICKY-ON-NEXT:  // %bb.0: // %for.body.lr.ph
+; WAW-STICKY-ON-NEXT:    nopx ; mov r9, r16
+; WAW-STICKY-ON-NEXT:    movxm r3, #16512
+; WAW-STICKY-ON-NEXT:    movxm r0, #16256
+; WAW-STICKY-ON-NEXT:    movxm r1, #16384
+; WAW-STICKY-ON-NEXT:    lda r5, [p2, #0]; movxm r2, #16128
+; WAW-STICKY-ON-NEXT:    vbcst.16 x0, r1
+; WAW-STICKY-ON-NEXT:    vbcst.16 x3, r0
+; WAW-STICKY-ON-NEXT:    vldb wl3, [p0], #32; vbcst.16 x2, r2
+; WAW-STICKY-ON-NEXT:    mova r0, #0; vconv.fp32.bf16 bmh0, wl2
+; WAW-STICKY-ON-NEXT:    vbcst.16 x2, r0
+; WAW-STICKY-ON-NEXT:    vconv.fp32.bf16 bmh1, wl3
+; WAW-STICKY-ON-NEXT:    mova r1, #-5; vmov wh0, wl2
+; WAW-STICKY-ON-NEXT:    lshl r1, r5, r1; vmov wh3, wl2
+; WAW-STICKY-ON-NEXT:    mova r1, #60; vldb wl3, [p0], #32; add.nc r2, r1, #-2
+; WAW-STICKY-ON-NEXT:    movxm r4, #-16256; vmul.f bmh2, x0, x3, r1
+; WAW-STICKY-ON-NEXT:    movxm r6, #32767
+; WAW-STICKY-ON-NEXT:    movxm r7, #15616
+; WAW-STICKY-ON-NEXT:    movxm r8, #16000
+; WAW-STICKY-ON-NEXT:    vbcst.16 x1, r3
+; WAW-STICKY-ON-NEXT:    vbcst.16 x10, r4
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl3, bmh2; vbcst.16 x8, r6; vmul.f bmh3, x0, x3, r1
+; WAW-STICKY-ON-NEXT:    vbcst.16 x6, r7
+; WAW-STICKY-ON-NEXT:    vmin_ge.bf16 x3, r16, x3, x1
+; WAW-STICKY-ON-NEXT:    vmax_lt.bf16 x3, r16, x3, x10
+; WAW-STICKY-ON-NEXT:    vmov wh3, wl2
+; WAW-STICKY-ON-NEXT:    vmov wh6, wl2
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3
+; WAW-STICKY-ON-NEXT:    vldb wl7, [p0], #32; vmov wh7, wl2
+; WAW-STICKY-ON-NEXT:    vmin_ge.bf16 x5, r16, x5, x1
+; WAW-STICKY-ON-NEXT:    vmax_lt.bf16 x5, r16, x5, x10
+; WAW-STICKY-ON-NEXT:    vldb wl7, [p0], #32; vband x7, x8, x5
+; WAW-STICKY-ON-NEXT:    vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
+; WAW-STICKY-ON-NEXT:    vbcst.16 x4, r8
+; WAW-STICKY-ON-NEXT:    vmov wh4, wl2; vmul.f bmh4, x6, x7, r1
+; WAW-STICKY-ON-NEXT:    vmov wh5, wl2; vmul.f bmh5, x0, x7, r1
+; WAW-STICKY-ON-NEXT:    vmac.f bmh3, bmh0, x3, x4, r1
+; WAW-STICKY-ON-NEXT:    movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1
+; WAW-STICKY-ON-NEXT:    add.nc lc, r2, #0
+; WAW-STICKY-ON-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl3, bmh4; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1
+; WAW-STICKY-ON-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv
+; WAW-STICKY-ON-NEXT:    nopb ; nopa ; nops ; nopxm ; vmsc.f bmh2, bmh6, x3, x5, r1
+; WAW-STICKY-ON-NEXT:    nopb ; nopa ; nops ; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv
+; WAW-STICKY-ON-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv
+; WAW-STICKY-ON-NEXT:    nopb ; nopa ; nops ; nopx ; vmov wh3, wl2; nopv
+; WAW-STICKY-ON-NEXT:    nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
+; WAW-STICKY-ON-NEXT:    .p2align 4
+; WAW-STICKY-ON-NEXT:  .LBB0_1: // %for.body
+; WAW-STICKY-ON-NEXT:    // =>This Inner Loop Header: Depth=1
+; WAW-STICKY-ON-NEXT:    nopa ; nopb ; nopx ; vband x9, x8, x3; nops
+; WAW-STICKY-ON-NEXT:    vmov wh6, wl2
+; WAW-STICKY-ON-NEXT:    vmax_lt.bf16 x5, r16, x5, x10
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl7, bmh2; vmov wh7, wl2
+; WAW-STICKY-ON-NEXT:    vldb wl7, [p0], #32; vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
+; WAW-STICKY-ON-NEXT:    vmov wh5, wl2; vmul.f bmh2, x7, x0, r1
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh3, x6, x9, r1
+; WAW-STICKY-ON-NEXT:    vmov wh9, wl2; vsub.f bmh6, bmh6, bmh1, r0
+; WAW-STICKY-ON-NEXT:    vsub.f bml0, bmh2, bmh1, r0
+; WAW-STICKY-ON-NEXT:    vmov wh4, wl2; vmul.f bmh4, x6, x9, r1
+; WAW-STICKY-ON-NEXT:    vmul.f bmh7, x0, x7, r1
+; WAW-STICKY-ON-NEXT:    vmac.f bmh5, bmh0, x3, x4, r1
+; WAW-STICKY-ON-NEXT:    vmac.f bmh2, bmh0, x5, x4, r1
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl3, bmh4; vmov wh3, wl2; vmsc.f bmh3, bmh5, x7, x3, r1
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl3, bmh7
+; WAW-STICKY-ON-NEXT:    vmsc.f bmh2, bmh2, x3, x5, r1
+; WAW-STICKY-ON-NEXT:    vst.conv.bf16.fp32 bmh6, [p1], #32; vmin_ge.bf16 x3, r16, x3, x1
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl5, bmh8; vmax_lt.bf16 x3, r16, x3, x10
+; WAW-STICKY-ON-NEXT:    vst.conv.bf16.fp32 bml0, [p1], #32; vmov wh3, wl2
+; WAW-STICKY-ON-NEXT:  .L_LEnd0:
+; WAW-STICKY-ON-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
+; WAW-STICKY-ON-NEXT:  // %bb.2:
+; WAW-STICKY-ON-NEXT:    nopx
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl1, bmh2; vmov wh1, wl2
+; WAW-STICKY-ON-NEXT:    vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
+; WAW-STICKY-ON-NEXT:    vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1
+; WAW-STICKY-ON-NEXT:    vband x1, x8, x3
+; WAW-STICKY-ON-NEXT:    vband x8, x8, x10
+; WAW-STICKY-ON-NEXT:    vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
+; WAW-STICKY-ON-NEXT:    vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0
+; WAW-STICKY-ON-NEXT:    vmul.f bmh2, x6, x1, r1
+; WAW-STICKY-ON-NEXT:    vmul.f bmh3, x6, x8, r1
+; WAW-STICKY-ON-NEXT:    vmov wh4, wl2
+; WAW-STICKY-ON-NEXT:    vmov wh10, wl2
+; WAW-STICKY-ON-NEXT:    vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1
+; WAW-STICKY-ON-NEXT:    vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl4, bmh2
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl4, bmh3
+; WAW-STICKY-ON-NEXT:    vmsc.f bmh2, bmh4, x4, x3, r1
+; WAW-STICKY-ON-NEXT:    vmsc.f bmh0, bmh0, x4, x10, r1
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl4, bmh2
+; WAW-STICKY-ON-NEXT:    vconv.bf16.fp32 wl4, bmh0
+; WAW-STICKY-ON-NEXT:    vmul.f bmh2, x4, x0, r1
+; WAW-STICKY-ON-NEXT:    vmul.f bmh0, x4, x0, r1
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    vsub.f bmh2, bmh2, bmh1, r0
+; WAW-STICKY-ON-NEXT:    vsub.f bmh0, bmh0, bmh1, r0
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    nop
+; WAW-STICKY-ON-NEXT:    ret lr
+; WAW-STICKY-ON-NEXT:    nop // Delay Slot 5
+; WAW-STICKY-ON-NEXT:    vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 4
+; WAW-STICKY-ON-NEXT:    vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 3
+; WAW-STICKY-ON-NEXT:    nop // Delay Slot 2
+; WAW-STICKY-ON-NEXT:    mov r16, r9 // Delay Slot 1
+; CHECK-LABEL: TanhTemplated:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %for.body.lr.ph
+; CHECK-NEXT:    nopa ; nopb ; nopx ; mov r8, r16; nops
+; CHECK-NEXT:    movxm r3, #16512
+; CHECK-NEXT:    movxm r0, #16256
+; CHECK-NEXT:    movxm r1, #16384
+; CHECK-NEXT:    lda r0, [p2, #0]; movxm r2, #16128
+; CHECK-NEXT:    vbcst.16 x0, r1
+; CHECK-NEXT:    vldb wl3, [p0], #32; vbcst.16 x3, r0
+; CHECK-NEXT:    vbcst.16 x2, r2
+; CHECK-NEXT:    mova r1, #0; vconv.fp32.bf16 bmh0, wl2
+; CHECK-NEXT:    vbcst.16 x2, r1
+; CHECK-NEXT:    vmov wh0, wl2
+; CHECK-NEXT:    mova r1, #-5; vmov wh3, wl2
+; CHECK-NEXT:    mova r1, #60; vldb wl3, [p0], #32; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3
+; CHECK-NEXT:    movxm r4, #-16256; vmul.f bmh2, x0, x3, r1
+; CHECK-NEXT:    movxm r5, #32767
+; CHECK-NEXT:    movxm r6, #15616
+; CHECK-NEXT:    movxm r7, #16000
+; CHECK-NEXT:    vbcst.16 x1, r3
+; CHECK-NEXT:    vbcst.16 x10, r4
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh2; vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1
+; CHECK-NEXT:    vbcst.16 x6, r6
+; CHECK-NEXT:    vmin_ge.bf16 x3, r16, x3, x1
+; CHECK-NEXT:    vmax_lt.bf16 x3, r16, x3, x10
+; CHECK-NEXT:    vmov wh3, wl2
+; CHECK-NEXT:    vmov wh6, wl2
+; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3
+; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh7, wl2
+; CHECK-NEXT:    vmin_ge.bf16 x5, r16, x5, x1
+; CHECK-NEXT:    vmax_lt.bf16 x5, r16, x5, x10
+; CHECK-NEXT:    vldb wl7, [p0], #32; vband x7, x8, x5
+; CHECK-NEXT:    vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
+; CHECK-NEXT:    vbcst.16 x4, r7
+; CHECK-NEXT:    vmov wh4, wl2; vmul.f bmh4, x6, x7, r1
+; CHECK-NEXT:    vmov wh5, wl2; vmul.f bmh5, x0, x7, r1
+; CHECK-NEXT:    vmac.f bmh3, bmh0, x3, x4, r1
+; CHECK-NEXT:    movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1
+; CHECK-NEXT:    add.nc lc, r2, #-2
+; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl3, bmh4; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1
+; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv
+; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; vmsc.f bml4, bmh6, x3, x5, r1
+; CHECK-NEXT:    nopb ; nopa ; nops ; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv
+; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv
+; CHECK-NEXT:    nopb ; nopa ; nops ; nopx ; vmov wh3, wl2; nopv
+; CHECK-NEXT:    nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vband x9, x8, x3
+; CHECK-NEXT:    vmov wh6, wl2
+; CHECK-NEXT:    vmax_lt.bf16 x5, r16, x5, x10
+; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh4, wl2
+; CHECK-NEXT:    vmov wh7, wl2
+; CHECK-NEXT:    vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh5, wl2; vmac.f bmh5, bmh0, x3, x4, r1
+; CHECK-NEXT:    vmul.f bmh3, x6, x9, r1
+; CHECK-NEXT:    vband x9, x8, x5; vmul.f bmh2, x7, x0, r1
+; CHECK-NEXT:    vmov wh9, wl2; vsub.f bml1, bmh6, bmh1, r0
+; CHECK-NEXT:    vmul.f bmh7, x0, x7, r1
+; CHECK-NEXT:    vmul.f bmh4, x6, x9, r1
+; CHECK-NEXT:    vsub.f bml0, bmh2, bmh1, r0
+; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1
+; CHECK-NEXT:    vmac.f bml2, bmh0, x5, x4, r1
+; CHECK-NEXT:    vmsc.f bml3, bmh5, x7, x3, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl11, bmh7
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4; vmov wh3, wl2
+; CHECK-NEXT:    vst.conv.bf16.fp32 bml1, [p1], #32; vmin_ge.bf16 x3, r16, x11, x1
+; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh8; vmax_lt.bf16 x3, r16, x3, x10; vmsc.f bml4, bml2, x3, x5, r1
+; CHECK-NEXT:    vst.conv.bf16.fp32 bml0, [p1], #32; vmov wh3, wl2
+; CHECK-NEXT:  .L_LEnd0:
+; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
+; CHECK-NEXT:  // %bb.2:
+; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
+; CHECK-NEXT:    vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
+; CHECK-NEXT:    vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1
+; CHECK-NEXT:    vband x1, x8, x3
+; CHECK-NEXT:    vband x8, x8, x10
+; CHECK-NEXT:    vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
+; CHECK-NEXT:    vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0
+; CHECK-NEXT:    vmul.f bmh2, x6, x1, r1
+; CHECK-NEXT:    vmul.f bmh3, x6, x8, r1
+; CHECK-NEXT:    vmov wh4, wl2
+; CHECK-NEXT:    vmov wh10, wl2
+; CHECK-NEXT:    vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1
+; CHECK-NEXT:    vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh2
+; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh3
+; CHECK-NEXT:    vmsc.f bmh2, bmh4, x4, x3, r1
+; CHECK-NEXT:    vmsc.f bmh0, bmh0, x4, x10, r1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh2
+; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh0
+; CHECK-NEXT:    vmul.f bmh2, x4, x0, r1
+; CHECK-NEXT:    vmul.f bmh0, x4, x0, r1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vsub.f bmh2, bmh2, bmh1, r0
+; CHECK-NEXT:    vsub.f bmh0, bmh0, bmh1, r0
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 4
+; CHECK-NEXT:    vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 3
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    mov r16, r8 // Delay Slot 1
+for.body.lr.ph:
+  %0 = tail call noundef <16 x bfloat> @llvm.aie2.v16bfloat16()
+  %1 = tail call noundef <8 x i64> @llvm.aie2.v16accfloat()
+  %2 = tail call noundef <32 x bfloat> @llvm.aie2.v32bfloat16()
+  %3 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3F80)
+  %4 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %3, i32 0)
+  %5 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR4000)
+  %6 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %5, i32 0)
+  %7 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %6)
+  %8 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3F00)
+  %9 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %8, i32 0)
+  %10 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %9)
+  %11 = load i32, ptr %params, align 32, !tbaa !4
+  %div16 = lshr i32 %11, 5
+  %12 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR0000)
+  %13 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %12, i32 0)
+  %14 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %6, i32 0)
+  %15 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %14, <16 x bfloat> %13, i32 1)
+  call void @llvm.set.loop.iterations.i32(i32 %div16)
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %p_out.0.in17 = phi ptr [ %ofm, %for.body.lr.ph ], [ %add.ptr.i9.i, %for.body ]
+  %p_in.0.in16 = phi ptr [ %ifm, %for.body.lr.ph ], [ %add.ptr.i.i.i, %for.body ]
+  %p_out.0 = addrspacecast ptr %p_out.0.in17 to ptr addrspace(6)
+  %p_in.0 = addrspacecast ptr %p_in.0.in16 to ptr addrspace(5)
+  %16 = load <16 x bfloat>, ptr addrspace(5) %p_in.0, align 32, !tbaa !11
+  %add.ptr.i.i = getelementptr inbounds i8, ptr %p_in.0.in16, i20 32
+  %add.ptr.ascast.i.i = addrspacecast ptr %add.ptr.i.i to ptr addrspace(5)
+  %17 = load <16 x bfloat>, ptr addrspace(5) %add.ptr.ascast.i.i, align 32, !tbaa !11
+  %add.ptr.i.i.i = getelementptr inbounds i8, ptr %p_in.0.in16, i20 64
+  %18 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %16, i32 0)
+  %19 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %18, <16 x bfloat> %13, i32 1)
+  %20 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %15, <32 x bfloat> %19, i32 60)
+  %21 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %20)
+  %22 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR4080)
+  %23 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %22, i32 0)
+  %24 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %21, i32 0)
+  %25 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %23, i32 0)
+  %26 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat> %24, <32 x bfloat> %25)
+  %27 = extractvalue { <32 x bfloat>, i32 } %26, 0
+  %28 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %27, i32 0)
+  %29 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xRC080)
+  %30 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %29, i32 0)
+  %31 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %28, i32 0)
+  %32 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %30, i32 0)
+  %33 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat> %31, <32 x bfloat> %32)
+  %34 = extractvalue { <32 x bfloat>, i32 } %33, 0
+  %35 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %34, i32 0)
+  %36 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %35, i32 0)
+  %37 = bitcast <32 x bfloat> %36 to <32 x i16>
+  %38 = tail call noundef <32 x i16> @llvm.aie2.vbroadcast16.I512(i32 32767)
+  %and.i.i.i.i.i.i.i.i.i.i.i = and <32 x i16> %38, %37
+  %39 = bitcast <32 x i16> %and.i.i.i.i.i.i.i.i.i.i.i to <32 x bfloat>
+  %40 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %39, i32 0)
+  %41 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3D00)
+  %42 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %41, i32 0)
+  %43 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %42, i32 0)
+  %44 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %43, <16 x bfloat> %13, i32 1)
+  %45 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %40, i32 0)
+  %46 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %45, <16 x bfloat> %13, i32 1)
+  %47 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %44, <32 x bfloat> %46, i32 60)
+  %48 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3E80)
+  %49 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %48, i32 0)
+  %50 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %36, <16 x bfloat> %13, i32 1)
+  %51 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %49, i32 0)
+  %52 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %51, <16 x bfloat> %13, i32 1)
+  %53 = tail call noundef <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat> %50, <32 x bfloat> %52, <8 x i64> %10, i32 60)
+  %54 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %47)
+  %55 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %54, i32 0)
+  %56 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %55, <16 x bfloat> %13, i32 1)
+  %57 = tail call noundef <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat> %56, <32 x bfloat> %50, <8 x i64> %53, i32 60)
+  %58 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %57)
+  %59 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %58, i32 0)
+  %60 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %59, <16 x bfloat> %13, i32 1)
+  %61 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %60, <32 x bfloat> %15, i32 60)
+  %62 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %4)
+  %63 = tail call noundef <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64> %61, <8 x i64> %62, i32 28)
+  %64 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %63)
+  %65 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %17, i32 0)
+  %66 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %65, <16 x bfloat> %13, i32 1)
+  %67 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %15, <32 x bfloat> %66, i32 60)
+  %68 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %67)
+  %69 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %68, i32 0)
+  %70 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat> %69, <32 x bfloat> %25)
+  %71 = extractvalue { <32 x bfloat>, i32 } %70, 0
+  %72 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %71, i32 0)
+  %73 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %72, i32 0)
+  %74 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat> %73, <32 x bfloat> %32)
+  %75 = extractvalue { <32 x bfloat>, i32 } %74, 0
+  %76 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %75, i32 0)
+  %77 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %76, i32 0)
+  %78 = bitcast <32 x bfloat> %77 to <32 x i16>
+  %and.i.i.i.i.i.i.i.i.i.i.i.i = and <32 x i16> %38, %78
+  %79 = bitcast <32 x i16> %and.i.i.i.i.i.i.i.i.i.i.i.i to <32 x bfloat>
+  %80 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %79, i32 0)
+  %81 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %80, i32 0)
+  %82 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %81, <16 x bfloat> %13, i32 1)
+  %83 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %44, <32 x bfloat> %82, i32 60)
+  %84 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %77, <16 x bfloat> %13, i32 1)
+  %85 = tail call noundef <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat> %84, <32 x bfloat> %52, <8 x i64> %10, i32 60)
+  %86 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %83)
+  %87 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %86, i32 0)
+  %88 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %87, <16 x bfloat> %13, i32 1)
+  %89 = tail call noundef <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat> %88, <32 x bfloat> %84, <8 x i64> %85, i32 60)
+  %90 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %89)
+  %91 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %90, i32 0)
+  %92 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %91, <16 x bfloat> %13, i32 1)
+  %93 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %92, <32 x bfloat> %15, i32 60)
+  %94 = tail call noundef <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64> %93, <8 x i64> %62, i32 28)
+  %95 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %94)
+  store <16 x bfloat> %64, ptr addrspace(6) %p_out.0, align 32, !tbaa !11
+  %add.ptr.i7.i.i = getelementptr inbounds i8, ptr %p_out.0.in17, i20 32
+  %add.ptr.ascast.i8.i.i = addrspacecast ptr %add.ptr.i7.i.i to ptr addrspace(6)
+  store <16 x bfloat> %95, ptr addrspace(6) %add.ptr.ascast.i8.i.i, align 32, !tbaa !11
+  %add.ptr.i9.i = getelementptr inbounds i8, ptr %p_out.0.in17, i20 64
+  %96 = call i1 @llvm.loop.decrement.i32(i32 1)
+  br i1 %96, label %for.body, label %for.cond.cleanup, !llvm.loop !12
+}
+
+; AIE2 vmin.gebf16 intrinsic: takes two <32 x bfloat> vectors and returns a { <32 x bfloat>, i32 } pair. Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat>, <32 x bfloat>) #2
+
+; Hardware-loop setup: records the loop iteration count given as the i32 operand (see LLVM LangRef, llvm.set.loop.iterations). Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+declare void @llvm.set.loop.iterations.i32(i32) #5
+
+; Hardware-loop step: subtracts the i32 operand from the loop counter and returns true while the loop should continue; its i1 result drives the back-edge branch to %for.body. Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+declare i1 @llvm.loop.decrement.i32(i32) #5
+
+attributes #0 = { nounwind memory(none) }
+attributes #1 = { nounwind memory(inaccessiblemem: read) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } ; used by the @llvm.aie2.vmin.gebf16 declaration
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress noinline "no-builtin-memcpy" "no-jump-tables"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } ; NOTE(review): presumably the attribute group of the function under test - confirm
+attributes #5 = { nocallback noduplicate nofree nosync nounwind willreturn } ; used by the @llvm.set.loop.iterations.i32 and @llvm.loop.decrement.i32 declarations
+
+!llvm.linker.options = !{}
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 7, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{!"clang version 19.0.0git (/scratch/llvm-aie/clang 640962db16e997d4aaf9dadcf09d9a4fc7e06fe4)"} ; NOTE(review): ident string embeds a local checkout path; only referenced by !llvm.ident
+!4 = !{!5, !6, i64 0}
+!5 = !{!"TanhTemplated", !6, i64 0, !7, i64 4, !7, i64 5, !7, i64 6, !7, i64 7, !7, i64 8, !7, i64 9, !9, i64 10, !6, i64 12, !6, i64 16, !6, i64 20, !6, i64 24, !6, i64 28, !10, i64 32}
+!6 = !{!"int", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"} ; TBAA root node
+!9 = !{!"short", !7, i64 0}
+!10 = !{!"_ZTS23tanh_templated_params_tIu6__bf16E"}
+!11 = !{!7, !7, i64 0} ; TBAA access tag ("omnipotent char") attached to both output stores in the loop body
+!12 = distinct !{!12, !13, !14} ; !llvm.loop metadata attached to the loop back-edge branch
+!13 = !{!"llvm.loop.mustprogress"}
+!14 = !{!"llvm.loop.itercount.range", i64 4} ; NOTE(review): presumably a target-specific iteration-count hint (value 4) - confirm its exact meaning
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-srflags.mir b/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-srflags.mir
index 9000b8d615a7..bd3bc0fbb99c 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-srflags.mir
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-srflags.mir
@@ -142,41 +142,40 @@ body:             |
   ; CHECK-NEXT:   [[VADD_F2:%[0-9]+]]:acc512 = VADD_F [[VADD_F1]], [[COPY4]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
   ; CHECK-NEXT:   [[VADD_F3:%[0-9]+]]:acc512 = VADD_F [[COPY6]], [[COPY2]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
   ; CHECK-NEXT:   [[VADD_F4:%[0-9]+]]:acc512 = VADD_F [[VADD_F3]], [[COPY3]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
-  ; CHECK-NEXT:   [[VADD_F5:%[0-9]+]]:acc512 = VADD_F [[VADD_F4]], [[COPY4]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
   ; CHECK-NEXT:   [[ADD_add_r_ri:%[0-9]+]]:er = nsw ADD_add_r_ri [[COPY1]], -1, implicit-def $srcarry
   ; CHECK-NEXT:   PseudoJ_jump_imm %bb.4
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:er = PHI [[ADD_add_r_ri]], %bb.3, %26, %bb.4
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:er = PHI [[ADD_add_r_ri]], %bb.3, %25, %bb.4
   ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:acc512 = PHI [[VADD_F2]], %bb.3, %30, %bb.4
-  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:acc512 = PHI [[VADD_F5]], %bb.3, %31, %bb.4
-  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:acc512 = PHI [[VADD_F2]], %bb.3, %30, %bb.4
-  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:acc512 = PHI [[VADD_F5]], %bb.3, %31, %bb.4
-  ; CHECK-NEXT:   [[VADD_F6:%[0-9]+]]:acc512 = VADD_F [[PHI1]], [[COPY2]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:acc512 = PHI [[VADD_F2]], %bb.3, %30, %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:acc512 = PHI [[VADD_F4]], %bb.3, %29, %bb.4
+  ; CHECK-NEXT:   [[VADD_F5:%[0-9]+]]:acc512 = VADD_F [[PHI1]], [[COPY2]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
   ; CHECK-NEXT:   [[ADD_add_r_ri1:%[0-9]+]]:er = nsw ADD_add_r_ri [[PHI]], -1, implicit-def $srcarry
-  ; CHECK-NEXT:   [[VADD_F7:%[0-9]+]]:acc512 = VADD_F [[PHI2]], [[COPY2]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI3]].sub_256_lo, [[COPY]], 0
-  ; CHECK-NEXT:   [[VADD_F8:%[0-9]+]]:acc512 = VADD_F [[VADD_F6]], [[COPY3]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI3]].sub_256_hi, [[COPY]], 0
+  ; CHECK-NEXT:   [[VADD_F6:%[0-9]+]]:acc512 = VADD_F [[PHI3]], [[COPY4]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
+  ; CHECK-NEXT:   [[VADD_F7:%[0-9]+]]:acc512 = VADD_F [[VADD_F6]], [[COPY2]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
+  ; CHECK-NEXT:   [[VADD_F8:%[0-9]+]]:acc512 = VADD_F [[VADD_F5]], [[COPY3]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI2]].sub_256_lo, [[COPY]], 0
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI2]].sub_256_hi, [[COPY]], 0
   ; CHECK-NEXT:   [[VADD_F9:%[0-9]+]]:acc512 = VADD_F [[VADD_F7]], [[COPY3]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI4]].sub_256_lo, [[COPY]], 0
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI4]].sub_256_hi, [[COPY]], 0
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[VADD_F6]].sub_256_lo, [[COPY]], 0
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[VADD_F6]].sub_256_hi, [[COPY]], 0
   ; CHECK-NEXT:   [[VADD_F10:%[0-9]+]]:acc512 = VADD_F [[VADD_F8]], [[COPY4]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
-  ; CHECK-NEXT:   [[VADD_F11:%[0-9]+]]:acc512 = VADD_F [[VADD_F9]], [[COPY4]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
   ; CHECK-NEXT:   PseudoJNZ [[ADD_add_r_ri1]], %bb.4
   ; CHECK-NEXT:   PseudoJ_jump_imm %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:acc512 = PHI [[VADD_F10]], %bb.4
-  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:acc512 = PHI [[VADD_F11]], %bb.4
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI5]].sub_256_lo, [[COPY]], 0
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI5]].sub_256_hi, [[COPY]], 0
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI6]].sub_256_lo, [[COPY]], 0
-  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI6]].sub_256_hi, [[COPY]], 0
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:acc512 = PHI [[VADD_F10]], %bb.4
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:acc512 = PHI [[VADD_F9]], %bb.4
+  ; CHECK-NEXT:   [[VADD_F11:%[0-9]+]]:acc512 = VADD_F [[PHI5]], [[COPY4]], [[MOV_RLC_imm10_pseudo]], implicit-def $srfpflags, implicit $crfpmask
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI4]].sub_256_lo, [[COPY]], 0
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[PHI4]].sub_256_hi, [[COPY]], 0
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[VADD_F11]].sub_256_lo, [[COPY]], 0
+  ; CHECK-NEXT:   VST_dmw_sts_am_ag_idx_imm [[VADD_F11]].sub_256_hi, [[COPY]], 0
   ; CHECK-NEXT:   PseudoJ_jump_imm %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2: