From 9c0ca003db1db1902f0939752e16b56f7cb443ef Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Fri, 24 Jan 2025 13:41:44 +0000
Subject: [PATCH 01/20] Update npu4 Ukernel for 4x8 for
 pack-peel-4-level-tiling pipeline

---
 compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc | 1 +
 1 file changed, 1 insertion(+)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
index 38c364c6d..fe6568d8d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
@@ -289,6 +289,7 @@ extern "C" {
   }
 
 matmul_combos(matmul_vectorized_c_func, 16, 8, 32)
+matmul_combos(matmul_vectorized_c_func, 16, 8, 64)
 matmul_combos(matmul_vectorized_c_func, 16, 16, 32)
 matmul_combos(matmul_vectorized_c_func, 32, 32, 32)
 matmul_combos(matmul_vectorized_c_func, 32, 32, 64)

From be66068df8116f0f1e1566eb20a96555a45c66a7 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Fri, 24 Jan 2025 13:42:31 +0000
Subject: [PATCH 02/20] [WIP] Fix split-logicalobjfifos for 4x8 for
 pack-peel-4-level-tiling

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 74 +++++++++++++++++--
 .../AMDAIELogicalObjFifoSplittingUtils.cpp    | 18 +++--
 .../AMDAIELogicalObjFifoSplittingUtils.h      |  8 +-
 .../test/split_logicalobjfifos.mlir           | 63 ++++++++++++++++
 4 files changed, 147 insertions(+), 16 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 89bc0fd05..e563f4059 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -348,23 +348,87 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
     return signalPassFailure();
   }
 
+  // Fetch and store all unique pairs of L2<->L1 copy ops, keyed by the L2
+  // buffer. This helps us figure out the split factor for LogicalObjectFifos.
+  DenseMap<Operation *, DenseSet<Operation *>> uniqueL2L1Pair;
+  moduleOp->walk([&](Operation *op) -> WalkResult {
+    if (auto copyOp = dyn_cast<CopyOpInterface>(op)) {
+      auto source = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
+          copyOp.getSource().getDefiningOp());
+      auto target = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
+          copyOp.getTarget().getDefiningOp());
+      if (!source || !target) {
+        return WalkResult::interrupt();
+      }
+      auto sourceFromMemrefOp =
+          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+              copyOp.getSource().getDefiningOp());
+      auto targetFromMemrefOp =
+          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+              copyOp.getTarget().getDefiningOp());
+      if (!sourceFromMemrefOp || !targetFromMemrefOp) {
+        return WalkResult::interrupt();
+      }
+      Operation *l2DefOp = nullptr;
+      Operation *l1DefOp = nullptr;
+      if (source.getMemorySpaceAsUInt() == 1 &&
+          target.getMemorySpaceAsUInt() == 2) {
+        l2DefOp = sourceFromMemrefOp.getMemref().getDefiningOp();
+        l1DefOp = targetFromMemrefOp;
+      } else if (target.getMemorySpaceAsUInt() == 1 &&
+                 source.getMemorySpaceAsUInt() == 2) {
+        l2DefOp = targetFromMemrefOp.getMemref().getDefiningOp();
+        l1DefOp = sourceFromMemrefOp;
+      } else {
+        return WalkResult::advance();
+      }
+      uniqueL2L1Pair[l2DefOp].insert(l1DefOp);
+      return WalkResult::advance();
+    }
+    return WalkResult::advance();
+  });
+
   /// Split the DMA and objectFifo ops based on the calcuated splitting
   /// dimensions.
+  DenseMap<Operation *, int64_t> splitFactorOfLOF;
   for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) {
     auto stridedOp =
         cast<AMDAIE::DoublyStridedOpInterface>(dmaOp.getOperation());
-    if (failed(splitDoublyStridedOp(
-            rewriter, stridedOp, dmaSplitInfo.sourceSplitDim,
-            dmaSplitInfo.targetSplitDim, dmaSplitInfo.splitSize,
-            dmaSplitInfo.newSourceStride, dmaSplitInfo.newTargetStride))) {
+    auto dmaCpyNd = cast<AMDAIE::DmaCpyNdOp>(dmaOp.getOperation());
+    int64_t splitFactor = dmaSplitInfo.splitSize;
+    if (stridedOp.getSourceMemorySpaceAsUInt() == 0) {
+      splitFactor =
+          uniqueL2L1Pair
+              [dmaCpyNd.getTarget()
+                   .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>()
+                   .getMemref()
+                   .getDefiningOp()]
+                  .size();
+    } else if (stridedOp.getTargetMemorySpaceAsUInt() == 0) {
+      splitFactor =
+          uniqueL2L1Pair
+              [dmaCpyNd.getSource()
+                   .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>()
+                   .getMemref()
+                   .getDefiningOp()]
+                  .size();
+    }
+    splitFactor = std::gcd(dmaSplitInfo.splitSize, splitFactor);
+    FailureOr<int64_t> maybeSplitFactor = splitDoublyStridedOp(
+        rewriter, stridedOp, dmaSplitInfo.sourceSplitDim,
+        dmaSplitInfo.targetSplitDim, splitFactor, dmaSplitInfo.newSourceStride,
+        dmaSplitInfo.newTargetStride);
+    if (failed(maybeSplitFactor)) {
       LLVM_DEBUG(llvm::dbgs()
                  << "Failed to perform splitting of the DMA op: " << dmaOp);
       return signalPassFailure();
     }
+    splitFactorOfLOF[dmaCpyNd.getTarget().getDefiningOp()] = *maybeSplitFactor;
+    splitFactorOfLOF[dmaCpyNd.getSource().getDefiningOp()] = *maybeSplitFactor;
   }
   for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) {
     if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim,
-                                      splitInfo.splitSize,
+                                      splitFactorOfLOF[objFifo],
                                       splitInfo.splitStride))) {
       LLVM_DEBUG(llvm::dbgs()
                  << "Failed to perform splitting of objectFifo op");
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
index eb92185d5..3a3b67460 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -751,13 +751,15 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
 }
 
 /// Split doubly strided operations on a source and target split dimension with
-/// the provided split factor.
-LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
-                                   AMDAIE::DoublyStridedOpInterface op,
-                                   size_t sourceSplitDim, size_t targetSplitDim,
-                                   std::optional<size_t> maybeSplitFactor,
-                                   int64_t sourceSplitStride,
-                                   int64_t targetSplitStride) {
+/// the provided split factor, which might get updated. On success, returns the
+/// split factor to the caller; otherwise returns failure.
+FailureOr<int64_t> splitDoublyStridedOp(IRRewriter &rewriter,
+                                        AMDAIE::DoublyStridedOpInterface op,
+                                        size_t sourceSplitDim,
+                                        size_t targetSplitDim,
+                                        std::optional<size_t> maybeSplitFactor,
+                                        int64_t sourceSplitStride,
+                                        int64_t targetSplitStride) {
   if (!op->use_empty())
     return op.emitOpError() << "can't be split because it has uses";
   SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
@@ -857,7 +859,7 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
     targetOffsets[targetSplitDim] = newTargetOffset.value();
   }
   rewriter.eraseOp(op);
-  return success();
+  return splitFactor;
 }
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
index fee4e510c..8f529feee 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
@@ -38,9 +38,11 @@ LogicalResult splitLogicalObjectFifo(
     int64_t splitStride = 1);
 
 /// Split doubly strided operations on a source and target split dimension with
-/// the provided split factor. If no split factor is provided, the doubly
-/// strided operation will be split on the size of the dimension being split.
-LogicalResult splitDoublyStridedOp(
+/// the provided split factor, which might get updated. On success, returns the
+/// split factor to the caller; otherwise returns failure.
+/// NOTE: If no split factor is provided, the doubly strided operation will be
+///       split on the size of the dimension being split.
+FailureOr<int64_t> splitDoublyStridedOp(
     IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
     size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
     std::optional<size_t> splitFactor = std::nullopt,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index f930ad5d0..6af6a0ea8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -471,3 +471,66 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
     return
   }
 }
+
+// -----
+
+#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}>
+#translation = #iree_codegen.translation_info<pipeline = Custom>
+module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
+  func.func @pack_peel_4_level_4x8_Strix() attributes {translation_info = #translation} {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<1x1x8x8x8x4xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x8x8x4x8xi32, 2 : i32>
+    %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32>
+    %tile_0_1 = amdaie.tile(%c0, %c1)
+    %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>
+    %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32>
+    %tile_1_1 = amdaie.tile(%c1, %c1)
+    %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>
+    %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32>
+    %tile_2_1 = amdaie.tile(%c2, %c1)
+    %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>
+    %alloc_5 = memref.alloc() : memref<512x512xi32>
+    %tile_0_0 = amdaie.tile(%c0, %c0)
+    %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo<memref<512x512xi32>>
+    %alloc_6 = memref.alloc() : memref<512x4096xi32>
+    %tile_1_0 = amdaie.tile(%c1, %c0)
+    %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
+    %alloc_7 = memref.alloc() : memref<512x4096xi32>
+    %tile_2_0 = amdaie.tile(%c2, %c0)
+    %lof_2_0 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_2_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
+    scf.forall (%arg0, %arg1) in (2, 8) {
+      %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x512xi32>>)
+      %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x4096xi32>>)
+      scf.forall (%arg2, %arg3) in (2, 2) {
+        %tile_1_2 = amdaie.tile(%c1, %c2)
+        %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
+        %tile_0_2 = amdaie.tile(%c0, %c2)
+        %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
+        %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_2, %tile_1_2} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>
+        %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
+        %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
+        %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>)
+        %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+        %6 = amdaie.core(%tile_0_2, in : [%5, %3], out : []) {
+          amdaie.end
+        }
+        %7 = amdaie.core(%tile_1_2, in : [%5, %4], out : []) {
+          amdaie.end
+        }
+        %8 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      }
+      %2 = amdaie.dma_cpy_nd(%lof_2_0[0, 0] [256, 512] [4096, 1], %lof_0_1[0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : (!amdaie.logicalobjectfifo<memref<512x4096xi32>>, !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>)
+    }
+    memref.dealloc %alloc_4 : memref<8x8x32x64xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<16x8x64x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<16x8x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<1x1x8x8x4x8xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x1x8x8x8x4xi32, 2 : i32>
+    memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
+    return
+  }
+}

From 2588e1c079bbb5ca0310c74c029f7f233fd921e9 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Mon, 27 Jan 2025 12:13:04 +0000
Subject: [PATCH 03/20] Fix lit test

---
 .../test/split_logicalobjfifos.mlir           | 189 ++++++++++++------
 1 file changed, 123 insertions(+), 66 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index 6af6a0ea8..7708ab9f2 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -42,6 +42,7 @@ module {
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
@@ -49,8 +50,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %2 = affine.apply #map(%arg1)
       %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %0[%2, 0] [64, 32] [128, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<2x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -93,6 +95,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
     %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
@@ -100,8 +103,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %2 = affine.apply #map(%arg2)
       %3 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1], %1[0, %2] [32, 64] [128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
@@ -151,6 +155,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_output(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
     %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
@@ -158,11 +165,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %2 = affine.apply #map(%arg2)
       %3 = affine.apply #map(%arg1)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %9 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %8 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %9 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %10 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
@@ -175,11 +185,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // Test of splitting matmul lhs input objectFifo and dma operations on 4x2 AIE array.
 // L2 buffer size `[4, 1, 32, 32]` is expected to be split into two `[2, 1, 32, 32]` buffers.
 
-// CHECK-label: func.func @split_L2_input_lhs_on_4x2_array
-//       CHECK:   %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A0]], {} :
-//  CHECK-SAME:         memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
-//       CHECK:   %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A1]], {} :
-//  CHECK-SAME:         memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-LABEL: func.func @split_L2_input_lhs_on_4x2_array
+//   CHECK-DAG:   %[[ALLOC_0:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[ALLOC_1:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+//       CHECK:   %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]]
+//       CHECK:   %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]]
 //       CHECK:   scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (4, 2)
 //       CHECK:       %[[DMA_L3_TO_L2_A0:.*]] = amdaie.dma_cpy_nd(
 //  CHECK-SAME:                                   %[[OBJ_L2_A0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1]
@@ -199,22 +209,26 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 //       CHECK:       %[[DMA_L2_TO_L1_A3:.*]] = amdaie.dma_cpy_nd(
 //  CHECK-SAME:                                   {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]
 //  CHECK-SAME:                                   %[[OBJ_L2_A1]][1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]
-//       CHECK:   memref.dealloc %[[ALLOC_A0]] : memref<2x1x32x32xi32, 1 : i32>
-//       CHECK:   memref.dealloc %[[ALLOC_A1]] : memref<2x1x32x32xi32, 1 : i32>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}  {
   func.func @split_L2_input_lhs_on_4x2_array(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
     scf.forall (%arg1, %arg2) in (4, 2) {
       %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [128, 32] [128, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %9 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %10 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -234,14 +248,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1])
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 + 4)>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_producer_with_loop_dependency(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
@@ -249,8 +265,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
     scf.forall (%arg1, %arg2) in (2, 4) {
       %3 = affine.apply #map(%arg2)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -265,9 +282,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
 // CHECK:       }
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
@@ -276,14 +294,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_consumer_with_loop_dependency(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
     scf.forall (%arg1, %arg2) in (2, 4) {
       %3 = affine.apply #map(%arg2)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
@@ -327,15 +347,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1])
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 * 2)>
 #map1 = affine_map<(d0) -> (d0 * 2 + 1)>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_producer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
@@ -344,8 +366,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -363,9 +386,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
 // CHECK:       }
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
@@ -375,6 +399,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_consumer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
@@ -382,8 +407,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
@@ -396,18 +422,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // CHECK-LABEL: func.func @change_split_factor_with_gcd_for_producer
 // CHECK-DAG:   %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
-// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
+// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) {
-// CHECK:         %[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+// CHECK:         %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
 // CHECK:       }
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 * 2)>
@@ -415,6 +438,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @change_split_factor_with_gcd_for_producer(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
@@ -423,8 +447,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -436,25 +461,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // CHECK-LABEL: @change_split_factor_with_gcd_for_consumer
 // CHECK-DAG:   %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
-// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) {
-// CHECK:         %[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+// CHECK:         %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
 // CHECK:       }
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1])
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1])
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 * 2)>
 #map1 = affine_map<(d0) -> (d0 * 2 + 1)>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @change_split_factor_with_gcd_for_consumer(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
@@ -462,8 +485,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
     memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32>
@@ -474,6 +498,39 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // -----
 
+// CHECK-LABEL: @pack_peel_4_level_4x8_Strix
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[LHS_L3:.*]] = memref.alloc() : memref<512x512xi32>
+// CHECK:         %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]])
+// CHECK:         %[[LOF_LHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[LHS_L3]], {%[[TILE_0_0]]} :
+// CHECK:         %[[RHS_L3:.*]] = memref.alloc() : memref<512x4096xi32>
+// CHECK:         %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]])
+// CHECK:         %[[LOF_RHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS_L3]], {%[[TILE_1_0]]} :
+// CHECK:         %[[OUT_L3:.*]] = memref.alloc() : memref<512x4096xi32>
+// CHECK:         %[[TILE_2_0:.*]] = amdaie.tile(%[[C2]], %[[C0]])
+// CHECK:         %[[LOF_OUT_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT_L3]], {%[[TILE_2_0]]} :
+// CHECK:         scf.forall (%{{.*}}, %{{.*}}) in (2, 8) {
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %[[LOF_LHS_L3]][0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>,
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 0] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 256] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
+// CHECK:             scf.forall (%{{.*}}, %{{.*}}) in (2, 2) {
+// CHECK:                 %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+// CHECK:                 %[[LOF_RHS_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_1_2]]} :
+// CHECK:                 %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK:                 %[[LOF_RHS_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} :
+// CHECK:                 %[[LOF_LHS_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]], %[[TILE_1_2]]} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>
+// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_RHS_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_RHS_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_LHS_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) :
+// CHECK:                 %[[LOF_OUT_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+// CHECK:                 amdaie.core
+// CHECK:                 amdaie.core
+// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %[[LOF_OUT_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) :
+// CHECK:             }
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_OUT_L3]][0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) :
+// CHECK:          }
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}>
 #translation = #iree_codegen.translation_info<pipeline = Custom>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {

From 4ed069eade28ca8b0fe88993e9d65e97e27e18e1 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Mon, 27 Jan 2025 12:19:27 +0000
Subject: [PATCH 04/20] Add one e2e test 512x512x256 (MxNxK)

---
 build_tools/ci/cpu_comparison/run.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 61807d232..253f993d8 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -1717,6 +1717,25 @@ def __init__(self):
                 use_chess=False,
             )
         )
+        self.register(
+            Matmul(
+                512,
+                512,
+                256,
+                "i32",
+                "i32",
+                name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling",
+                tile_pipeline="pack-peel-4-level-tiling",
+                run_on_target=["npu4"],
+                aie_compilation_flags=[
+                    "--iree-amdaie-num-rows=4",
+                    "--iree-amdaie-num-cols=8",
+                    "--iree-amd-aie-additional-peano-opt-flags=-O3",
+                    "--iree-amdaie-enable-function-outlining=True",
+                ],
+                use_chess=True,
+            )
+        )
 
         for target in ["npu1_4col", "npu4"]:
             self.register(

From be9c5ba56bf67014ccb06e1ccf9c7fb3a2d45c71 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Mon, 27 Jan 2025 12:22:36 +0000
Subject: [PATCH 05/20] Add ukernel to e2e CI

---
 build_tools/ci/cpu_comparison/run.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 253f993d8..a2d908d2e 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -1801,6 +1801,26 @@ def __init__(self):
                 additional_labels=["I8UKernel"],
             )
         )
+        self.register(
+            Matmul(
+                64,
+                64,
+                64,
+                "bf16",
+                "f32",
+                name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling_ukernel",
+                use_ukernel=True,
+                tile_pipeline="pack-peel-4-level-tiling",
+                run_on_target=["npu4"],
+                aie_compilation_flags=[
+                    "--iree-amdaie-num-rows=4",
+                    "--iree-amdaie-num-cols=8",
+                    "--iree-amd-aie-additional-peano-opt-flags=-O3",
+                    "--iree-amdaie-enable-function-outlining=True",
+                ],
+                use_chess=True,
+            )
+        )
 
         # Matmul test on 2(rows)x2(cols) cores
         self.register(

From 4b91fa31ad60acc66b8d24eee41a1e67db07b585 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Tue, 28 Jan 2025 12:01:24 +0000
Subject: [PATCH 06/20] Review comments v1.0

---
 build_tools/ci/cpu_comparison/run.py          |  6 --
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 75 ++++++++++++-------
 .../test/split_logicalobjfifos.mlir           | 66 ++++++++--------
 3 files changed, 79 insertions(+), 68 deletions(-)

diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index a2d908d2e..d96452e67 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -1730,10 +1730,7 @@ def __init__(self):
                 aie_compilation_flags=[
                     "--iree-amdaie-num-rows=4",
                     "--iree-amdaie-num-cols=8",
-                    "--iree-amd-aie-additional-peano-opt-flags=-O3",
-                    "--iree-amdaie-enable-function-outlining=True",
                 ],
-                use_chess=True,
             )
         )
 
@@ -1815,10 +1812,7 @@ def __init__(self):
                 aie_compilation_flags=[
                     "--iree-amdaie-num-rows=4",
                     "--iree-amdaie-num-cols=8",
-                    "--iree-amd-aie-additional-peano-opt-flags=-O3",
-                    "--iree-amdaie-enable-function-outlining=True",
                 ],
-                use_chess=True,
             )
         )
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index e563f4059..088b77518 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -349,7 +349,23 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
   }
 
   // Fetch and store all unique pairs of L2<->L1 Copy ops. This would helps us
-  // figure out the split factor for all LogicalObjectFifos.
+  // figure out the split factor for all LogicalObjectFifos. Basically we get to
+  // decide how many splits to perform for a particular L2 ObjectFifo based on
+  // the total unique L2<->L1 Copy ops.
+  // Eg:
+  //      %lhs = LOF_on_L2
+  //      %a = LOF_on_L1_0
+  //      %b = LOF_on_L1_1
+  //      %c = LOF_on_L1_2
+  //      DMA(%a, %lhs)
+  //      DMA(%b, %lhs)
+  //      DMA(%c, %lhs)
+  //      DMA(%b, %lhs)
+  //      DMA(%c, %lhs)
+  //
+  //    In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of
+  //    them are unique. Hence we'd split %lhs into 3 unique splits, instead
+  //    of 5.
   DenseMap<Operation *, DenseSet<Operation *>> uniqueL2L1Pair;
   moduleOp->walk([&](Operation *op) -> WalkResult {
     if (auto copyOp = dyn_cast<CopyOpInterface>(op)) {
@@ -371,17 +387,20 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
       }
       Operation *l2DefOp = nullptr;
       Operation *l1DefOp = nullptr;
-      if (source.getMemorySpaceAsUInt() == 1 &&
-          target.getMemorySpaceAsUInt() == 2) {
+      // L2 -> L1.
+      if (target.getMemorySpaceAsUInt() == 2) {
         l2DefOp = sourceFromMemrefOp.getMemref().getDefiningOp();
         l1DefOp = targetFromMemrefOp;
-      } else if (target.getMemorySpaceAsUInt() == 1 &&
-                 source.getMemorySpaceAsUInt() == 2) {
+      } else if (source.getMemorySpaceAsUInt() == 2) {
+        // L1 -> L2.
         l2DefOp = targetFromMemrefOp.getMemref().getDefiningOp();
         l1DefOp = sourceFromMemrefOp;
       } else {
         return WalkResult::advance();
       }
+      if (!l2DefOp || !l1DefOp) {
+        return WalkResult::interrupt();
+      }
       uniqueL2L1Pair[l2DefOp].insert(l1DefOp);
       return WalkResult::advance();
     }
@@ -392,30 +411,31 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
   /// dimensions.
   DenseMap<Operation *, int64_t> splitFactorOfLOF;
   for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) {
-    auto stridedOp =
-        cast<AMDAIE::DoublyStridedOpInterface>(dmaOp.getOperation());
     auto dmaCpyNd = cast<AMDAIE::DmaCpyNdOp>(dmaOp.getOperation());
     int64_t splitFactor = dmaSplitInfo.splitSize;
-    if (stridedOp.getSourceMemorySpaceAsUInt() == 0) {
-      splitFactor =
-          uniqueL2L1Pair
-              [dmaCpyNd.getTarget()
-                   .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>()
-                   .getMemref()
-                   .getDefiningOp()]
-                  .size();
-    } else if (stridedOp.getTargetMemorySpaceAsUInt() == 0) {
-      splitFactor =
-          uniqueL2L1Pair
-              [dmaCpyNd.getSource()
-                   .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>()
-                   .getMemref()
-                   .getDefiningOp()]
-                  .size();
+    auto sourceDefOp =
+        dmaCpyNd.getSource()
+            .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>();
+    auto targetDefOp =
+        dmaCpyNd.getTarget()
+            .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>();
+    if (!sourceDefOp || !targetDefOp) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Expected defining op of source/target for : " << dmaOp);
+      return signalPassFailure();
+    }
+    if (dmaCpyNd.getSourceMemorySpaceAsUInt() == 0) {
+      if (Operation *l2DefOp = targetDefOp.getMemref().getDefiningOp())
+        splitFactor = uniqueL2L1Pair[l2DefOp].size();
+    } else if (dmaCpyNd.getTargetMemorySpaceAsUInt() == 0) {
+      if (Operation *l2DefOp = sourceDefOp.getMemref().getDefiningOp())
+        splitFactor = uniqueL2L1Pair[l2DefOp].size();
     }
+    // In cases where the number of available columns < the inferred split
+    // factor, we'll cap the final split factor by the lower bound.
     splitFactor = std::gcd(dmaSplitInfo.splitSize, splitFactor);
     FailureOr<int64_t> maybeSplitFactor = splitDoublyStridedOp(
-        rewriter, stridedOp, dmaSplitInfo.sourceSplitDim,
+        rewriter, dmaCpyNd, dmaSplitInfo.sourceSplitDim,
         dmaSplitInfo.targetSplitDim, splitFactor, dmaSplitInfo.newSourceStride,
         dmaSplitInfo.newTargetStride);
     if (failed(maybeSplitFactor)) {
@@ -423,8 +443,11 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
                  << "Failed to perform splitting of the DMA op: " << dmaOp);
       return signalPassFailure();
     }
-    splitFactorOfLOF[dmaCpyNd.getTarget().getDefiningOp()] = *maybeSplitFactor;
-    splitFactorOfLOF[dmaCpyNd.getSource().getDefiningOp()] = *maybeSplitFactor;
+    // The above function might change the split factor based on divisibility
+    // with source/target. Therefore here we maintain the final split factor
+    // which we'll use later to split the LogicalObjectFifo.
+    splitFactorOfLOF[targetDefOp] = *maybeSplitFactor;
+    splitFactorOfLOF[sourceDefOp] = *maybeSplitFactor;
   }
   for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) {
     if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index 7708ab9f2..daaf5e81c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -498,38 +498,30 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // -----
 
+// A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although
+// we have 8 columns, the L2 LHS buffer needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8.
+// To keep the test case concise, it demonstrates a similar splitting strategy for 1 row and 2 columns.
+// Therefore L2 LHS will be split into 1 and L2 RHS will be split into 2. This needs to happen because
+// of how the tiles are assigned later in the compilation stack.
+// Refer: https://github.com/nod-ai/iree-amd-aie/pull/1031#discussion_r1920237380
+//
 // CHECK-LABEL: @pack_peel_4_level_4x8_Strix
 // CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
 // CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
 // CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
-// CHECK:         %[[LHS_L3:.*]] = memref.alloc() : memref<512x512xi32>
-// CHECK:         %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK:         %[[LOF_LHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[LHS_L3]], {%[[TILE_0_0]]} :
-// CHECK:         %[[RHS_L3:.*]] = memref.alloc() : memref<512x4096xi32>
-// CHECK:         %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]])
-// CHECK:         %[[LOF_RHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS_L3]], {%[[TILE_1_0]]} :
-// CHECK:         %[[OUT_L3:.*]] = memref.alloc() : memref<512x4096xi32>
-// CHECK:         %[[TILE_2_0:.*]] = amdaie.tile(%[[C2]], %[[C0]])
-// CHECK:         %[[LOF_OUT_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT_L3]], {%[[TILE_2_0]]} :
 // CHECK:         scf.forall (%{{.*}}, %{{.*}}) in (2, 8) {
-// CHECK:             amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %[[LOF_LHS_L3]][0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>,
-// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 0] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
-// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 256] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %{{.*}}[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>,
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 0] [512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 128] [512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
 // CHECK:             scf.forall (%{{.*}}, %{{.*}}) in (2, 2) {
-// CHECK:                 %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
-// CHECK:                 %[[LOF_RHS_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_1_2]]} :
-// CHECK:                 %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]])
-// CHECK:                 %[[LOF_RHS_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} :
-// CHECK:                 %[[LOF_LHS_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]], %[[TILE_1_2]]} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>
-// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_RHS_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
-// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_RHS_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
-// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_LHS_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) :
-// CHECK:                 %[[LOF_OUT_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+// CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_1]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) :
 // CHECK:                 amdaie.core
 // CHECK:                 amdaie.core
-// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %[[LOF_OUT_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) :
 // CHECK:             }
-// CHECK:             amdaie.dma_cpy_nd(%[[LOF_OUT_L3]][0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) :
+// CHECK:             amdaie.dma_cpy_nd(%{{.*}}[0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) :
 // CHECK:          }
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}>
 #translation = #iree_codegen.translation_info<pipeline = Custom>
@@ -542,34 +534,36 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
     %alloc_0 = memref.alloc() : memref<1x1x8x8x8x4xi32, 2 : i32>
     %alloc_1 = memref.alloc() : memref<1x1x8x8x4x8xi32, 2 : i32>
     %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32>
-    %tile_0_1 = amdaie.tile(%c0, %c1)
-    %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>
     %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32>
-    %tile_1_1 = amdaie.tile(%c1, %c1)
-    %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>
     %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32>
-    %tile_2_1 = amdaie.tile(%c2, %c1)
-    %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>
     %alloc_5 = memref.alloc() : memref<512x512xi32>
-    %tile_0_0 = amdaie.tile(%c0, %c0)
-    %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo<memref<512x512xi32>>
     %alloc_6 = memref.alloc() : memref<512x4096xi32>
-    %tile_1_0 = amdaie.tile(%c1, %c0)
-    %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
     %alloc_7 = memref.alloc() : memref<512x4096xi32>
+    %tile_0_1 = amdaie.tile(%c0, %c1)
+    %tile_1_1 = amdaie.tile(%c1, %c1)
+    %tile_2_1 = amdaie.tile(%c2, %c1)
+    %tile_0_0 = amdaie.tile(%c0, %c0)
+    %tile_1_0 = amdaie.tile(%c1, %c0)
     %tile_2_0 = amdaie.tile(%c2, %c0)
+    %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>
+    %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>
+    %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>
+    %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo<memref<512x512xi32>>
+    %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
     %lof_2_0 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_2_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
     scf.forall (%arg0, %arg1) in (2, 8) {
       %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x512xi32>>)
       %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x4096xi32>>)
       scf.forall (%arg2, %arg3) in (2, 2) {
+        %of0 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg2)
+        %of1 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg2)
         %tile_1_2 = amdaie.tile(%c1, %c2)
-        %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
         %tile_0_2 = amdaie.tile(%c0, %c2)
+        %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
         %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
         %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_2, %tile_1_2} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>
-        %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
-        %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
+        %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
+        %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of1, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
         %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>)
         %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
         %6 = amdaie.core(%tile_0_2, in : [%5, %3], out : []) {

From 37771f855f8171daecc53686f18a39633dc0d541 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 06:57:42 +0000
Subject: [PATCH 07/20] Review comment v2.0

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 201 +++++++++---------
 .../AMDAIELogicalObjFifoSplittingUtils.cpp    |  27 +--
 .../AMDAIELogicalObjFifoSplittingUtils.h      |   2 +-
 3 files changed, 109 insertions(+), 121 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 088b77518..298e24c6e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -121,10 +121,77 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
   return splitStride;
 }
 
+/// Count, for every L2 LogicalObjectFifo, the number of distinct L1
+/// LogicalObjectFifos it is connected to through copy ops (L2<->L1). The
+/// returned map is keyed by the L2 objectFifo op and holds that count.
+/// Eg:
+///      %lhs = LOF_on_L2
+///      %a = LOF_on_L1_0
+///      %b = LOF_on_L1_1
+///      %c = LOF_on_L1_2
+///      DMA(%a, %lhs)
+///      DMA(%b, %lhs)
+///      DMA(%c, %lhs)
+///      DMA(%b, %lhs)
+///      DMA(%c, %lhs)
+///
+///    In the above snippet there are 5 L2<->L1 DMA ops but only 3 distinct L1
+///    objectFifos, so %lhs maps to 3. NOTE: the walk stops at the first copy
+///    op whose source/target is not a `LogicalObjectFifoFromMemrefOp`; pairs
+///    collected up to that point are still counted.
+static DenseMap<Operation *, int64_t> fetchUniqueL2L1(ModuleOp moduleOp) {
+  DenseMap<Operation *, DenseSet<Operation *>> uniqueL2L1Pair;
+  moduleOp->walk([&](Operation *op) -> WalkResult {
+    if (auto copyOp = dyn_cast<CopyOpInterface>(op)) {
+      auto source = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
+          copyOp.getSource().getDefiningOp());
+      auto target = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
+          copyOp.getTarget().getDefiningOp());
+      if (!source || !target) {
+        return WalkResult::interrupt();
+      }
+      auto sourceFromMemrefOp =
+          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+              copyOp.getSource().getDefiningOp());
+      auto targetFromMemrefOp =
+          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+              copyOp.getTarget().getDefiningOp());
+      if (!sourceFromMemrefOp || !targetFromMemrefOp) {
+        return WalkResult::interrupt();
+      }
+      Operation *l2LofOp = nullptr;
+      Operation *l1LofOp = nullptr;
+      // L2 -> L1 (source in memory space 1, target in memory space 2).
+      if (source.getMemorySpaceAsUInt() == 1 &&
+          target.getMemorySpaceAsUInt() == 2) {
+        l2LofOp = sourceFromMemrefOp;
+        l1LofOp = targetFromMemrefOp;
+      } else if (source.getMemorySpaceAsUInt() == 2 &&
+                 target.getMemorySpaceAsUInt() == 1) {
+        // L1 -> L2 (source in memory space 2, target in memory space 1).
+        l2LofOp = targetFromMemrefOp;
+        l1LofOp = sourceFromMemrefOp;
+      } else {
+        return WalkResult::advance();
+      }
+      uniqueL2L1Pair[l2LofOp].insert(l1LofOp);
+      return WalkResult::advance();
+    }
+    return WalkResult::advance();
+  });
+
+  DenseMap<Operation *, int64_t> uniqueL2L1Count;
+  for (auto &&[l2Lof, l1Lofs] : uniqueL2L1Pair)
+    uniqueL2L1Count[l2Lof] = l1Lofs.size();
+
+  return uniqueL2L1Count;
+}
+
 /// Find the logical objectFifo and DMA source/target splitting dimensions for
 /// each DMA and objectFifo pair.
 ///
-/// Each pair is handled in the following way:
+/// We first count the unique L2<->L1 pairs for each L2 objectFifo.
+/// Then each DMA and objectFifo pair is handled in the following way:
 /// First, compute the objectFifo splitting dimension based on the last non-unit
 /// shape dimension and the number of available columns. Afterwards, depending
 /// on which logical objectFifo is being split on, find the outermost dimension
@@ -139,11 +206,12 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
 /// splitting because that's the number of elements that should be
 /// produced/consumed on the respective sides before splitting.
 LogicalResult collectSplittingDims(
-    const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
+    ModuleOp &moduleOp, const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
     DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
     DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
         &objFifoSplitInfoMap,
     int64_t numCols) {
+  DenseMap<Operation *, int64_t> uniqueL2L1Pair = fetchUniqueL2L1(moduleOp);
   for (auto [dmaOp, objFifo] : dmaObjFifoPairs) {
     LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n");
     LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n");
@@ -218,6 +286,12 @@ LogicalResult collectSplittingDims(
       // Calculate the new source stride to be used for splitting the DMA.
       int64_t newSourceStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
+      int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols);
+      int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
+      int64_t targetSize = (*targetSizes)[targetSplitDim];
+      if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
+        splitFactor = std::gcd(sourceSize, targetSize);
+      }
       LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs()
@@ -225,10 +299,11 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs()
                  << "objFifoSplitDim: " << objFifoSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim,
-                                1, numCols};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
+                                1, splitFactor};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
+                                      splitStride};
     } else if (dmaOp.getSourceObjectFifo() == objFifo) {
       // Find outermost dimension in the access pattern that has stride ==
       // sizeAfterSplit and size != 1.
@@ -274,6 +349,12 @@ LogicalResult collectSplittingDims(
       // Calculate the new target stride to be used for splitting the DMA.
       int64_t newTargetStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
+      int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols);
+      int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
+      int64_t targetSize = (*targetSizes)[targetSplitDim];
+      if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
+        splitFactor = std::gcd(sourceSize, targetSize);
+      }
       LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs()
@@ -281,10 +362,11 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs()
                  << "objFifoSplitDim: " << objFifoSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim,
-                                newTargetStride, numCols};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
+                                newTargetStride, splitFactor};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
+                                      splitStride};
     }
   }
   return success();
@@ -343,115 +425,28 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
   DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> dmaSplitInfoMap;
   DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
       objFifoSplitInfoMap;
-  if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap,
+  if (failed(collectSplittingDims(moduleOp, dmaObjFifoPairs, dmaSplitInfoMap,
                                   objFifoSplitInfoMap, numColumns))) {
     return signalPassFailure();
   }
 
-  // Fetch and store all unique pairs of L2<->L1 Copy ops. This would helps us
-  // figure out the split factor for all LogicalObjectFifos. Basically we get to
-  // decide how many splits to perform for a particular L2 ObjectFifo based on
-  // the total unique L2<->L1 Copy ops.
-  // Eg:
-  //      %lhs = LOF_on_L2
-  //      %a = LOF_on_L1_0
-  //      %b = LOF_on_L1_1
-  //      %c = LOF_on_L1_2
-  //      DMA(%a, %lhs)
-  //      DMA(%b, %lhs)
-  //      DMA(%c, %lhs)
-  //      DMA(%b, %lhs)
-  //      DMA(%c, %lhs)
-  //
-  //    In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of
-  //    them are unique. Hence we'd split %lhs into 3 unique splits, instead
-  //    of 5.
-  DenseMap<Operation *, DenseSet<Operation *>> uniqueL2L1Pair;
-  moduleOp->walk([&](Operation *op) -> WalkResult {
-    if (auto copyOp = dyn_cast<CopyOpInterface>(op)) {
-      auto source = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-          copyOp.getSource().getDefiningOp());
-      auto target = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-          copyOp.getTarget().getDefiningOp());
-      if (!source || !target) {
-        return WalkResult::interrupt();
-      }
-      auto sourceFromMemrefOp =
-          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-              copyOp.getSource().getDefiningOp());
-      auto targetFromMemrefOp =
-          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-              copyOp.getTarget().getDefiningOp());
-      if (!sourceFromMemrefOp || !targetFromMemrefOp) {
-        return WalkResult::interrupt();
-      }
-      Operation *l2DefOp = nullptr;
-      Operation *l1DefOp = nullptr;
-      // L2 -> L1.
-      if (target.getMemorySpaceAsUInt() == 2) {
-        l2DefOp = sourceFromMemrefOp.getMemref().getDefiningOp();
-        l1DefOp = targetFromMemrefOp;
-      } else if (source.getMemorySpaceAsUInt() == 2) {
-        // L1 -> L2.
-        l2DefOp = targetFromMemrefOp.getMemref().getDefiningOp();
-        l1DefOp = sourceFromMemrefOp;
-      } else {
-        return WalkResult::advance();
-      }
-      if (!l2DefOp || !l1DefOp) {
-        return WalkResult::interrupt();
-      }
-      uniqueL2L1Pair[l2DefOp].insert(l1DefOp);
-      return WalkResult::advance();
-    }
-    return WalkResult::advance();
-  });
-
   /// Split the DMA and objectFifo ops based on the calcuated splitting
   /// dimensions.
-  DenseMap<Operation *, int64_t> splitFactorOfLOF;
   for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) {
-    auto dmaCpyNd = cast<AMDAIE::DmaCpyNdOp>(dmaOp.getOperation());
-    int64_t splitFactor = dmaSplitInfo.splitSize;
-    auto sourceDefOp =
-        dmaCpyNd.getSource()
-            .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>();
-    auto targetDefOp =
-        dmaCpyNd.getTarget()
-            .getDefiningOp<AMDAIE::LogicalObjectFifoFromMemrefOp>();
-    if (!sourceDefOp || !targetDefOp) {
-      LLVM_DEBUG(llvm::dbgs()
-                 << "Expected defining op of source/target for : " << dmaOp);
-      return signalPassFailure();
-    }
-    if (dmaCpyNd.getSourceMemorySpaceAsUInt() == 0) {
-      if (Operation *l2DefOp = targetDefOp.getMemref().getDefiningOp())
-        splitFactor = uniqueL2L1Pair[l2DefOp].size();
-    } else if (dmaCpyNd.getTargetMemorySpaceAsUInt() == 0) {
-      if (Operation *l2DefOp = sourceDefOp.getMemref().getDefiningOp())
-        splitFactor = uniqueL2L1Pair[l2DefOp].size();
-    }
-    // In cases where the number of available columns < the inferred split
-    // factor, we'll cap the final split factor by the lower bound.
-    splitFactor = std::gcd(dmaSplitInfo.splitSize, splitFactor);
-    FailureOr<int64_t> maybeSplitFactor = splitDoublyStridedOp(
-        rewriter, dmaCpyNd, dmaSplitInfo.sourceSplitDim,
-        dmaSplitInfo.targetSplitDim, splitFactor, dmaSplitInfo.newSourceStride,
-        dmaSplitInfo.newTargetStride);
-    if (failed(maybeSplitFactor)) {
+    auto stridedOp =
+        cast<AMDAIE::DoublyStridedOpInterface>(dmaOp.getOperation());
+    if (failed(splitDoublyStridedOp(
+            rewriter, stridedOp, dmaSplitInfo.sourceSplitDim,
+            dmaSplitInfo.targetSplitDim, dmaSplitInfo.splitSize,
+            dmaSplitInfo.newSourceStride, dmaSplitInfo.newTargetStride))) {
       LLVM_DEBUG(llvm::dbgs()
                  << "Failed to perform splitting of the DMA op: " << dmaOp);
       return signalPassFailure();
     }
-    // The above function might change the split factor based on divisibility
-    // with source/target. Therefore here we maintain the final split factor
-    // which we'll use later to split the LogicalObjectFifo.
-    splitFactorOfLOF[targetDefOp] = *maybeSplitFactor;
-    splitFactorOfLOF[sourceDefOp] = *maybeSplitFactor;
   }
   for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) {
     if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim,
-                                      splitFactorOfLOF[objFifo],
+                                      splitInfo.splitSize,
                                       splitInfo.splitStride))) {
       LLVM_DEBUG(llvm::dbgs()
                  << "Failed to perform splitting of objectFifo op");
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
index 3a3b67460..0ec9f0532 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -753,13 +753,12 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
 /// Split doubly strided operations on a source and target split dimension with
 /// the provided split factor which might get updated. On success, return the
 /// split factor to the caller, else return failure.
-FailureOr<int64_t> splitDoublyStridedOp(IRRewriter &rewriter,
-                                        AMDAIE::DoublyStridedOpInterface op,
-                                        size_t sourceSplitDim,
-                                        size_t targetSplitDim,
-                                        std::optional<size_t> maybeSplitFactor,
-                                        int64_t sourceSplitStride,
-                                        int64_t targetSplitStride) {
+LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
+                                   AMDAIE::DoublyStridedOpInterface op,
+                                   size_t sourceSplitDim, size_t targetSplitDim,
+                                   std::optional<size_t> maybeSplitFactor,
+                                   int64_t sourceSplitStride,
+                                   int64_t targetSplitStride) {
   if (!op->use_empty())
     return op.emitOpError() << "can't be split because it has uses";
   SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
@@ -802,15 +801,9 @@ FailureOr<int64_t> splitDoublyStridedOp(IRRewriter &rewriter,
   }
   int64_t sourceSize = maybeSourceSize.value();
   int64_t targetSize = maybeTargetSize.value();
-  int64_t splitFactor = maybeSplitFactor.has_value()
-                            ? maybeSplitFactor.value()
-                            : std::gcd(sourceSize, targetSize);
-  if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
-    int64_t newSplitFactor = std::gcd(sourceSize, targetSize);
-    LLVM_DEBUG(llvm::dbgs() << "split factor has been changed from "
-                            << splitFactor << " to " << newSplitFactor);
-    splitFactor = newSplitFactor;
-  }
+  assert(maybeSplitFactor.has_value() &&
+         "expected split factor to be sent by the caller");
+  int64_t splitFactor = maybeSplitFactor.value();
 
   int64_t newSourceSize = sourceSize / splitFactor;
   int64_t newTargetSize = targetSize / splitFactor;
@@ -859,7 +852,7 @@ FailureOr<int64_t> splitDoublyStridedOp(IRRewriter &rewriter,
     targetOffsets[targetSplitDim] = newTargetOffset.value();
   }
   rewriter.eraseOp(op);
-  return splitFactor;
+  return success();
 }
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
index 8f529feee..a39686602 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
@@ -42,7 +42,7 @@ LogicalResult splitLogicalObjectFifo(
 /// split factor to the caller, else return failure.
 /// NOTE: If no split factor is provided, the doubly strided operation will be
 ///       split on the size of the dimension being split.
-FailureOr<int64_t> splitDoublyStridedOp(
+LogicalResult splitDoublyStridedOp(
     IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
     size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
     std::optional<size_t> splitFactor = std::nullopt,

From fe4f363e0e1408634dff5dc5680f902d3548598e Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 07:30:07 +0000
Subject: [PATCH 08/20] Set use_chess=True for the Strix 4x8 test

---
 build_tools/ci/cpu_comparison/run.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index d96452e67..ef33edc00 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -1813,6 +1813,7 @@ def __init__(self):
                     "--iree-amdaie-num-rows=4",
                     "--iree-amdaie-num-cols=8",
                 ],
+                use_chess=True,
             )
         )
 

From b58611738e8412ac14d3ced67ec80e6a8fffd433 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 10:59:12 +0000
Subject: [PATCH 09/20] Review comment v3.0

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 96 +++++++------------
 .../AMDAIELogicalObjFifoSplittingUtils.cpp    |  8 +-
 .../AMDAIELogicalObjFifoSplittingUtils.h      |  8 +-
 3 files changed, 41 insertions(+), 71 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 298e24c6e..d1b07d772 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -121,7 +121,7 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
   return splitStride;
 }
 
-/// Fetch and store all unique pairs of L2<->L1 Copy ops. This would helps us
+/// Fetch total no. of unique pairs of L2<->L1 Copy ops. This would helps us
 /// figure out the split factor for all LogicalObjectFifos. Basically we get to
 /// decide how many splits to perform for a particular L2 ObjectFifo based on
 /// the total unique L2<->L1 Copy ops.
@@ -137,61 +137,30 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
 ///      DMA(%c, %lhs)
 ///
 ///    In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of
-///    them are unique. Hence we'd split %lhs into 3 unique splits, instead
-///    of 5.
-static DenseMap<Operation *, int64_t> fetchUniqueL2L1(ModuleOp moduleOp) {
-  DenseMap<Operation *, DenseSet<Operation *>> uniqueL2L1Pair;
-  moduleOp->walk([&](Operation *op) -> WalkResult {
-    if (auto copyOp = dyn_cast<CopyOpInterface>(op)) {
-      auto source = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-          copyOp.getSource().getDefiningOp());
-      auto target = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-          copyOp.getTarget().getDefiningOp());
-      if (!source || !target) {
-        return WalkResult::interrupt();
-      }
-      auto sourceFromMemrefOp =
-          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-              copyOp.getSource().getDefiningOp());
-      auto targetFromMemrefOp =
-          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-              copyOp.getTarget().getDefiningOp());
-      if (!sourceFromMemrefOp || !targetFromMemrefOp) {
-        return WalkResult::interrupt();
-      }
-      Operation *l2LofOp = nullptr;
-      Operation *l1LofOp = nullptr;
-      // L2 -> L1.
-      if (source.getMemorySpaceAsUInt() == 1 &&
-          target.getMemorySpaceAsUInt() == 2) {
-        l2LofOp = sourceFromMemrefOp;
-        l1LofOp = targetFromMemrefOp;
-      } else if (source.getMemorySpaceAsUInt() == 2 &&
-                 target.getMemorySpaceAsUInt() == 1) {
-        // L1 -> L2.
-        l2LofOp = targetFromMemrefOp;
-        l1LofOp = sourceFromMemrefOp;
-      } else {
-        return WalkResult::advance();
-      }
-      uniqueL2L1Pair[l2LofOp].insert(l1LofOp);
-      return WalkResult::advance();
+///    them are unique. Hence we'd split %lhs into 3 unique splits, instead of 5.
+static FailureOr<int64_t> fetchTotalUniqueL2L1(SmallVector<CopyOpInterface> copyLikeOps, bool fetchTarget) {
+  DenseSet<Operation*> uniqueLof;
+  for (CopyOpInterface copyOp : copyLikeOps) {
+    AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr;
+    if (fetchTarget) {
+      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+            copyOp.getTarget().getDefiningOp());
+    } else {
+      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+            copyOp.getSource().getDefiningOp());
     }
-    return WalkResult::advance();
-  });
-
-  DenseMap<Operation *, int64_t> uniqueL2L1Count;
-  for (auto &&[l2Lof, l1Lofs] : uniqueL2L1Pair)
-    uniqueL2L1Count[l2Lof] = l1Lofs.size();
-
-  return uniqueL2L1Count;
+    if (!lof) {
+      return copyOp.emitOpError()<< "could not retrieve source/target objectFifo";
+    }
+    uniqueLof.insert(lof);
+  }
+  return uniqueLof.size();
 }
 
 /// Find the logical objectFifo and DMA source/target splitting dimensions for
 /// each DMA and objectFifo pair.
 ///
-/// At first we find count of total unique L2<->L1 pairs for all L2 objectFifos.
-/// Then each DMA and objectFifo pair is handled in the following way:
+/// Each pair is handled in the following way:
 /// First, compute the objectFifo splitting dimension based on the last non-unit
 /// shape dimension and the number of available columns. Afterwards, depending
 /// on which logical objectFifo is being split on, find the outermost dimension
@@ -205,13 +174,14 @@ static DenseMap<Operation *, int64_t> fetchUniqueL2L1(ModuleOp moduleOp) {
 /// that has product size larger than the other side's product size after
 /// splitting because that's the number of elements that should be
 /// produced/consumed on the respective sides before splitting.
+/// Towards the end fetch the count of unique L2<->L1 for the objectFifo which
+/// will be split. This would form the split factor which would be capped by the
+/// total no. of columns OR std::gcd of source/target size.
 LogicalResult collectSplittingDims(
     ModuleOp &moduleOp, const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
     DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
     DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
-        &objFifoSplitInfoMap,
-    int64_t numCols) {
-  DenseMap<Operation *, int64_t> uniqueL2L1Pair = fetchUniqueL2L1(moduleOp);
+        &objFifoSplitInfoMap, int64_t numCols) {
   for (auto [dmaOp, objFifo] : dmaObjFifoPairs) {
     LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n");
     LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n");
@@ -286,7 +256,12 @@ LogicalResult collectSplittingDims(
       // Calculate the new source stride to be used for splitting the DMA.
       int64_t newSourceStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols);
+      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true);
+      if (failed(maybeUniqueL2L1)) {
+        objFifo.emitOpError()
+            << "could not retrieve total unique L2<->L1 pairs";
+      }
+      int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols);
       int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
       int64_t targetSize = (*targetSizes)[targetSplitDim];
       if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
@@ -302,8 +277,7 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim,
                                 1, splitFactor};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
-                                      splitStride};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride};
     } else if (dmaOp.getSourceObjectFifo() == objFifo) {
       // Find outermost dimension in the access pattern that has stride ==
       // sizeAfterSplit and size != 1.
@@ -349,7 +323,12 @@ LogicalResult collectSplittingDims(
       // Calculate the new target stride to be used for splitting the DMA.
       int64_t newTargetStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols);
+      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeProducers(), /*fetchTarget=*/false);
+      if (failed(maybeUniqueL2L1)) {
+        objFifo.emitOpError()
+            << "could not retrieve total unique L2<->L1 pairs";
+      }
+      int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols);
       int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
       int64_t targetSize = (*targetSizes)[targetSplitDim];
       if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
@@ -365,8 +344,7 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim,
                                 newTargetStride, splitFactor};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
-                                      splitStride};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride};
     }
   }
   return success();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
index 0ec9f0532..afde8fe8d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -751,12 +751,11 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
 }
 
 /// Split doubly strided operations on a source and target split dimension with
-/// the provided split factor which might get updated. On success, return the
-/// split factor to the caller, else return failure.
+/// the provided split factor.
 LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
                                    AMDAIE::DoublyStridedOpInterface op,
                                    size_t sourceSplitDim, size_t targetSplitDim,
-                                   std::optional<size_t> maybeSplitFactor,
+                                   int64_t splitFactor,
                                    int64_t sourceSplitStride,
                                    int64_t targetSplitStride) {
   if (!op->use_empty())
@@ -801,9 +800,6 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
   }
   int64_t sourceSize = maybeSourceSize.value();
   int64_t targetSize = maybeTargetSize.value();
-  assert(maybeSplitFactor.has_value() &&
-         "expected split factor to be sent by the caller");
-  int64_t splitFactor = maybeSplitFactor.value();
 
   int64_t newSourceSize = sourceSize / splitFactor;
   int64_t newTargetSize = targetSize / splitFactor;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
index a39686602..a9a22c471 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
@@ -38,15 +38,11 @@ LogicalResult splitLogicalObjectFifo(
     int64_t splitStride = 1);
 
 /// Split doubly strided operations on a source and target split dimension with
-/// the provided split factor which might get updated. On success, return the
-/// split factor to the caller, else return failure.
-/// NOTE: If no split factor is provided, the doubly strided operation will be
-///       split on the size of the dimension being split.
+/// the provided split factor.
 LogicalResult splitDoublyStridedOp(
     IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
     size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
-    std::optional<size_t> splitFactor = std::nullopt,
-    int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
+    int64_t splitFactor, int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
 
 }  // namespace mlir::iree_compiler::AMDAIE
 

From f9d8a990d1230196cc47f1337d0f89164710f248 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 11:00:04 +0000
Subject: [PATCH 10/20] Fix getCopyLikeConsumer/Producer APIs

---
 .../IR/AMDAIELogicalObjFifoOpInterface.cpp       | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
index 48dc97d98..89fb8b57a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
@@ -17,10 +17,10 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeConsumers(
     LogicalObjFifoOpInterface op) {
   SmallVector<mlir::CopyOpInterface> copyLikOps;
   for (Operation *userOp : op->getUsers()) {
-    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
-        dyn_cast_if_present<LogicalObjFifoOpInterface>(
-            copyOp.getSource().getDefiningOp()) == op) {
-      copyLikOps.push_back(copyOp);
+    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp)) {
+      if (dyn_cast_if_present<LogicalObjFifoOpInterface>(
+              copyOp.getSource().getDefiningOp()) == op)
+        copyLikOps.push_back(copyOp);
     }
   }
   return copyLikOps;
@@ -30,10 +30,10 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeProducers(
     LogicalObjFifoOpInterface op) {
   SmallVector<mlir::CopyOpInterface> copyLikOps;
   for (Operation *userOp : op->getUsers()) {
-    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
-        dyn_cast_if_present<LogicalObjFifoOpInterface>(
-            copyOp.getTarget().getDefiningOp()) == op) {
-      copyLikOps.push_back(copyOp);
+    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp)) {
+      if (dyn_cast_if_present<LogicalObjFifoOpInterface>(
+              copyOp.getTarget().getDefiningOp()) == op)
+        copyLikOps.push_back(copyOp);
     }
   }
   return copyLikOps;

From e6f6b780f4e877655f4270de9bb148dbdfcdf2ea Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 11:02:07 +0000
Subject: [PATCH 11/20] Pre-commit fixes

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 30 ++++++++++++-------
 .../AMDAIELogicalObjFifoSplittingUtils.h      |  4 +--
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index d1b07d772..a78d12e08 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -137,20 +137,23 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
 ///      DMA(%c, %lhs)
 ///
 ///    In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of
-///    them are unique. Hence we'd split %lhs into 3 unique splits, instead of 5.
-static FailureOr<int64_t> fetchTotalUniqueL2L1(SmallVector<CopyOpInterface> copyLikeOps, bool fetchTarget) {
-  DenseSet<Operation*> uniqueLof;
+///    them are unique. Hence we'd split %lhs into 3 unique splits, instead
+///    of 5.
+static FailureOr<int64_t> fetchTotalUniqueL2L1(
+    SmallVector<CopyOpInterface> copyLikeOps, bool fetchTarget) {
+  DenseSet<Operation *> uniqueLof;
   for (CopyOpInterface copyOp : copyLikeOps) {
     AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr;
     if (fetchTarget) {
       lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-            copyOp.getTarget().getDefiningOp());
+          copyOp.getTarget().getDefiningOp());
     } else {
       lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-            copyOp.getSource().getDefiningOp());
+          copyOp.getSource().getDefiningOp());
     }
     if (!lof) {
-      return copyOp.emitOpError()<< "could not retrieve source/target objectFifo";
+      return copyOp.emitOpError()
+             << "could not retrieve source/target objectFifo";
     }
     uniqueLof.insert(lof);
   }
@@ -181,7 +184,8 @@ LogicalResult collectSplittingDims(
     ModuleOp &moduleOp, const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
     DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
     DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
-        &objFifoSplitInfoMap, int64_t numCols) {
+        &objFifoSplitInfoMap,
+    int64_t numCols) {
   for (auto [dmaOp, objFifo] : dmaObjFifoPairs) {
     LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n");
     LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n");
@@ -256,7 +260,8 @@ LogicalResult collectSplittingDims(
       // Calculate the new source stride to be used for splitting the DMA.
       int64_t newSourceStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true);
+      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(
+          objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true);
       if (failed(maybeUniqueL2L1)) {
         objFifo.emitOpError()
             << "could not retrieve total unique L2<->L1 pairs";
@@ -277,7 +282,8 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim,
                                 1, splitFactor};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
+                                      splitStride};
     } else if (dmaOp.getSourceObjectFifo() == objFifo) {
       // Find outermost dimension in the access pattern that has stride ==
       // sizeAfterSplit and size != 1.
@@ -323,7 +329,8 @@ LogicalResult collectSplittingDims(
       // Calculate the new target stride to be used for splitting the DMA.
       int64_t newTargetStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeProducers(), /*fetchTarget=*/false);
+      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(
+          objFifo.getCopyLikeProducers(), /*fetchTarget=*/false);
       if (failed(maybeUniqueL2L1)) {
         objFifo.emitOpError()
             << "could not retrieve total unique L2<->L1 pairs";
@@ -344,7 +351,8 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim,
                                 newTargetStride, splitFactor};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
+                                      splitStride};
     }
   }
   return success();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
index a9a22c471..0905795fd 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
@@ -41,8 +41,8 @@ LogicalResult splitLogicalObjectFifo(
 /// the provided split factor.
 LogicalResult splitDoublyStridedOp(
     IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
-    size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
-    int64_t splitFactor, int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
+    size_t sourceSplitDim = 0, size_t targetSplitDim = 0, int64_t splitFactor,
+    int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
 
 }  // namespace mlir::iree_compiler::AMDAIE
 

From 17032714196ead5ba69dbde824f1cd846416f225 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 11:07:14 +0000
Subject: [PATCH 12/20] Remove moduleOp - not needed anymore

---
 .../iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index a78d12e08..00a743e77 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -181,7 +181,7 @@ static FailureOr<int64_t> fetchTotalUniqueL2L1(
 /// will be split. This would form the split factor which would be capped by the
 /// total no. of columns OR std::gcd of source/target size.
 LogicalResult collectSplittingDims(
-    ModuleOp &moduleOp, const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
+    const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
     DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
     DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
         &objFifoSplitInfoMap,
@@ -411,7 +411,7 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
   DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> dmaSplitInfoMap;
   DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
       objFifoSplitInfoMap;
-  if (failed(collectSplittingDims(moduleOp, dmaObjFifoPairs, dmaSplitInfoMap,
+  if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap,
                                   objFifoSplitInfoMap, numColumns))) {
     return signalPassFailure();
   }

From 509fb6d40379e39ee7d0ee0e4ded75450f531d41 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 14:27:04 +0000
Subject: [PATCH 13/20] Review v4.0

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 26 ++++++++++---------
 .../AMDAIELogicalObjFifoSplittingUtils.h      | 11 +++++---
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 00a743e77..785708563 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -121,11 +121,10 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
   return splitStride;
 }
 
-/// Fetch total no. of unique pairs of L2<->L1 Copy ops. This would helps us
-/// figure out the split factor for all LogicalObjectFifos. Basically we get to
-/// decide how many splits to perform for a particular L2 ObjectFifo based on
-/// the total unique L2<->L1 Copy ops.
-/// Eg:
+/// Given a list of Copy Ops, fetch the total no. of unique consumer/producer
+/// LogicalObjectFifos. This would help us figure out the split factor for
+/// LogicalObjectFifos.
+/// An example case which necessitated this feature :-
 ///      %lhs = LOF_on_L2
 ///      %a = LOF_on_L1_0
 ///      %b = LOF_on_L1_1
@@ -139,12 +138,13 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
 ///    In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of
 ///    them are unique. Hence we'd split %lhs into 3 unique splits, instead
 ///    of 5.
-static FailureOr<int64_t> fetchTotalUniqueL2L1(
-    SmallVector<CopyOpInterface> copyLikeOps, bool fetchTarget) {
+template <CopyOpOperateOn OperateOn>
+static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifos(
+    SmallVector<CopyOpInterface> copyLikeOps) {
   DenseSet<Operation *> uniqueLof;
   for (CopyOpInterface copyOp : copyLikeOps) {
     AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr;
-    if (fetchTarget) {
+    if constexpr (OperateOn == CopyOpOperateOn::Target) {
       lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
           copyOp.getTarget().getDefiningOp());
     } else {
@@ -260,8 +260,9 @@ LogicalResult collectSplittingDims(
       // Calculate the new source stride to be used for splitting the DMA.
       int64_t newSourceStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(
-          objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true);
+      FailureOr<int64_t> maybeUniqueL2L1 =
+          fetchTotalUniqueLogicalObjFifos<CopyOpOperateOn::Target>(
+              objFifo.getCopyLikeConsumers());
       if (failed(maybeUniqueL2L1)) {
         objFifo.emitOpError()
             << "could not retrieve total unique L2<->L1 pairs";
@@ -329,8 +330,9 @@ LogicalResult collectSplittingDims(
       // Calculate the new target stride to be used for splitting the DMA.
       int64_t newTargetStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      FailureOr<int64_t> maybeUniqueL2L1 = fetchTotalUniqueL2L1(
-          objFifo.getCopyLikeProducers(), /*fetchTarget=*/false);
+      FailureOr<int64_t> maybeUniqueL2L1 =
+          fetchTotalUniqueLogicalObjFifos<CopyOpOperateOn::Source>(
+              objFifo.getCopyLikeProducers());
       if (failed(maybeUniqueL2L1)) {
         objFifo.emitOpError()
             << "could not retrieve total unique L2<->L1 pairs";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
index 0905795fd..7a428df66 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
@@ -39,10 +39,13 @@ LogicalResult splitLogicalObjectFifo(
 
 /// Split doubly strided operations on a source and target split dimension with
 /// the provided split factor.
-LogicalResult splitDoublyStridedOp(
-    IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
-    size_t sourceSplitDim = 0, size_t targetSplitDim = 0, int64_t splitFactor,
-    int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
+LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
+                                   AMDAIE::DoublyStridedOpInterface op,
+                                   size_t sourceSplitDim = 0,
+                                   size_t targetSplitDim = 0,
+                                   int64_t splitFactor = 1,
+                                   int64_t sourceSplitStride = 1,
+                                   int64_t targetSplitStride = 1);
 
 }  // namespace mlir::iree_compiler::AMDAIE
 

From da1c62d760ce33d35a04caacf2b78fbd92f4bf03 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 14:46:05 +0000
Subject: [PATCH 14/20] Review comment v5.0

---
 .../IR/AMDAIELogicalObjFifoOpInterface.cpp       | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
index 89fb8b57a..5031d688e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
@@ -17,10 +17,10 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeConsumers(
     LogicalObjFifoOpInterface op) {
   SmallVector<mlir::CopyOpInterface> copyLikOps;
   for (Operation *userOp : op->getUsers()) {
-    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp)) {
-      if (dyn_cast_if_present<LogicalObjFifoOpInterface>(
-              copyOp.getSource().getDefiningOp()) == op)
-        copyLikOps.push_back(copyOp);
+    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
+        copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
+                      copyOp.getSource().getDefiningOp()) == op) {
+      copyLikOps.push_back(copyOp);
     }
   }
   return copyLikOps;
@@ -30,10 +30,10 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeProducers(
     LogicalObjFifoOpInterface op) {
   SmallVector<mlir::CopyOpInterface> copyLikOps;
   for (Operation *userOp : op->getUsers()) {
-    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp)) {
-      if (dyn_cast_if_present<LogicalObjFifoOpInterface>(
-              copyOp.getTarget().getDefiningOp()) == op)
-        copyLikOps.push_back(copyOp);
+    if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
+        copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
+                      copyOp.getTarget().getDefiningOp()) == op) {
+      copyLikOps.push_back(copyOp);
     }
   }
   return copyLikOps;

From 8c3f762348c9e3d756a19862d1fa412e51ad6063 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Wed, 29 Jan 2025 15:53:41 +0000
Subject: [PATCH 15/20] Review comment v6.0

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp  | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 785708563..6b9c985b9 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -139,7 +139,7 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
 ///    them are unique. Hence we'd split %lhs into 3 unique splits, instead
 ///    of 5.
 template <CopyOpOperateOn OperateOn>
-static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifos(
+static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifoUsers(
     SmallVector<CopyOpInterface> copyLikeOps) {
   DenseSet<Operation *> uniqueLof;
   for (CopyOpInterface copyOp : copyLikeOps) {
@@ -260,14 +260,14 @@ LogicalResult collectSplittingDims(
       // Calculate the new source stride to be used for splitting the DMA.
       int64_t newSourceStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      FailureOr<int64_t> maybeUniqueL2L1 =
-          fetchTotalUniqueLogicalObjFifos<CopyOpOperateOn::Target>(
+      FailureOr<int64_t> maybeNumUniqueConsumers =
+          fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Target>(
               objFifo.getCopyLikeConsumers());
-      if (failed(maybeUniqueL2L1)) {
+      if (failed(maybeNumUniqueConsumers)) {
         objFifo.emitOpError()
             << "could not retrieve total unique L2<->L1 pairs";
       }
-      int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols);
+      int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols);
       int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
       int64_t targetSize = (*targetSizes)[targetSplitDim];
       if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
@@ -330,14 +330,14 @@ LogicalResult collectSplittingDims(
       // Calculate the new target stride to be used for splitting the DMA.
       int64_t newTargetStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
-      FailureOr<int64_t> maybeUniqueL2L1 =
-          fetchTotalUniqueLogicalObjFifos<CopyOpOperateOn::Source>(
+      FailureOr<int64_t> maybeNumUniqueProducers =
+          fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Source>(
               objFifo.getCopyLikeProducers());
-      if (failed(maybeUniqueL2L1)) {
+      if (failed(maybeNumUniqueProducers)) {
         objFifo.emitOpError()
             << "could not retrieve total unique L2<->L1 pairs";
       }
-      int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols);
+      int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols);
       int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
       int64_t targetSize = (*targetSizes)[targetSplitDim];
       if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {

From 42868e6c31dbaf83039f5f02fae8339a3d2543d3 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Thu, 30 Jan 2025 06:25:35 +0000
Subject: [PATCH 16/20] Review comment v7.0

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp |  8 +--
 .../test/split_logicalobjfifos.mlir           | 51 +++++++------------
 2 files changed, 21 insertions(+), 38 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 6b9c985b9..168af5cc8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -264,8 +264,8 @@ LogicalResult collectSplittingDims(
           fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Target>(
               objFifo.getCopyLikeConsumers());
       if (failed(maybeNumUniqueConsumers)) {
-        objFifo.emitOpError()
-            << "could not retrieve total unique L2<->L1 pairs";
+        objFifo.emitOpError() << "could not retrieve the total number of "
+                                 "unique consumer objFifos";
       }
       int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols);
       int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
@@ -334,8 +334,8 @@ LogicalResult collectSplittingDims(
           fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Source>(
               objFifo.getCopyLikeProducers());
       if (failed(maybeNumUniqueProducers)) {
-        objFifo.emitOpError()
-            << "could not retrieve total unique L2<->L1 pairs";
+        objFifo.emitOpError() << "could not retrieve the total number of "
+                                 "unique producer objFifos";
       }
       int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols);
       int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index daaf5e81c..16b8332df 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -499,11 +499,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // -----
 
 // A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although
-// we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8.
-// But to keep the test case concise it demonstrates a similar splitting strategy for 1 row and 2 columns.
-// Therefore L2 LHS will be split into 1 and L2 RHS will be split into 2. This needs to happen because
-// later in the compilation stack when the tiles are being assigned, we will
-// Refer: https://github.com/nod-ai/iree-amd-aie/pull/1031#discussion_r1920237380
+// we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8,
+// i.e. the splitting will be dependent on unique producer/consumer for the respective ObjectFifos
+// being split.
+// To keep the test case concise it demonstrates a similar splitting strategy when the actual
+// compute is taking place in 1 row and 2 columns.
 //
 // CHECK-LABEL: @pack_peel_4_level_4x8_Strix
 // CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
@@ -517,8 +517,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
 // CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_1]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
 // CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) :
-// CHECK:                 amdaie.core
-// CHECK:                 amdaie.core
 // CHECK:                 amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) :
 // CHECK:             }
 // CHECK:             amdaie.dma_cpy_nd(%{{.*}}[0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) :
@@ -526,7 +524,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}>
 #translation = #iree_codegen.translation_info<pipeline = Custom>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
-  func.func @pack_peel_4_level_4x8_Strix() attributes {translation_info = #translation} {
+  func.func @pack_peel_4_level_4x8_Strix(%lhs: memref<512x512xi32>, %rhs: memref<512x4096xi32>, %out: memref<512x4096xi32>) attributes {translation_info = #translation} {
     %c2 = arith.constant 2 : index
     %c1 = arith.constant 1 : index
     %c0 = arith.constant 0 : index
@@ -536,21 +534,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
     %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32>
     %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32>
     %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32>
-    %alloc_5 = memref.alloc() : memref<512x512xi32>
-    %alloc_6 = memref.alloc() : memref<512x4096xi32>
-    %alloc_7 = memref.alloc() : memref<512x4096xi32>
-    %tile_0_1 = amdaie.tile(%c0, %c1)
-    %tile_1_1 = amdaie.tile(%c1, %c1)
-    %tile_2_1 = amdaie.tile(%c2, %c1)
-    %tile_0_0 = amdaie.tile(%c0, %c0)
-    %tile_1_0 = amdaie.tile(%c1, %c0)
-    %tile_2_0 = amdaie.tile(%c2, %c0)
-    %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>
-    %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>
-    %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>
-    %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo<memref<512x512xi32>>
-    %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
-    %lof_2_0 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_2_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
+    %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>
+    %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>
+    %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>
+    %lof_0_0 = amdaie.logicalobjectfifo.from_memref %lhs, {} : memref<512x512xi32> -> !amdaie.logicalobjectfifo<memref<512x512xi32>>
+    %lof_1_0 = amdaie.logicalobjectfifo.from_memref %rhs, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
+    %lof_2_0 = amdaie.logicalobjectfifo.from_memref %out, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
     scf.forall (%arg0, %arg1) in (2, 8) {
       %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x512xi32>>)
       %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x4096xi32>>)
@@ -559,20 +548,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
         %of1 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg2)
         %tile_1_2 = amdaie.tile(%c1, %c2)
         %tile_0_2 = amdaie.tile(%c0, %c2)
-        %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
-        %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
-        %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_2, %tile_1_2} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>
+        %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
+        %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
+        %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>
         %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
         %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of1, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
         %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>)
-        %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-        %6 = amdaie.core(%tile_0_2, in : [%5, %3], out : []) {
-          amdaie.end
-        }
-        %7 = amdaie.core(%tile_1_2, in : [%5, %4], out : []) {
-          amdaie.end
-        }
-        %8 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+        %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+        %6 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
       }
       %2 = amdaie.dma_cpy_nd(%lof_2_0[0, 0] [256, 512] [4096, 1], %lof_0_1[0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : (!amdaie.logicalobjectfifo<memref<512x4096xi32>>, !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>)
     }

From f7bcd0d65b7e0985ab33b6eae7d285c85db12b5d Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Thu, 30 Jan 2025 09:36:00 +0000
Subject: [PATCH 17/20] Review comment v8.0

---
 .../Transforms/AMDAIESplitLogicalObjFifos.cpp       | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 168af5cc8..cd6483b56 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -135,9 +135,10 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
 ///      DMA(%b, %lhs)
 ///      DMA(%c, %lhs)
 ///
-///    In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of
-///    them are unique. Hence we'd split %lhs into 3 unique splits, instead
-///    of 5.
+///    In the above snippet, assume we want to split %lhs, it has 5 DMA ops.
+///    But only 3 of them are unique: (%lhs -> %a), (%lhs -> %b), (%lhs -> %c).
+///    Therefore this function is going to return 3, which the caller is going
+///    to use as the split factor.
 template <CopyOpOperateOn OperateOn>
 static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifoUsers(
     SmallVector<CopyOpInterface> copyLikeOps) {
@@ -177,9 +178,9 @@ static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifoUsers(
 /// that has product size larger than the other side's product size after
 /// splitting because that's the number of elements that should be
 /// produced/consumed on the respective sides before splitting.
-/// Towards the end fetch the count of unique L2<->L1 for the objectFifo which
-/// will be split. This would form the split factor which would be capped by the
-/// total no. of columns OR std::gcd of source/target size.
+/// Towards the end fetch the count of unique producer (or consumers) for the
+/// objectFifo which will be split. This would form the split factor which would
+/// be capped by the total no. of columns OR std::gcd of source/target size.
 LogicalResult collectSplittingDims(
     const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
     DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,

From fe830899263c14cdb8d6fc6ad1ff8b5191c5dea7 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Thu, 30 Jan 2025 10:16:09 +0000
Subject: [PATCH 18/20] Review comment v9.0

---
 .../iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index 16b8332df..56fd218eb 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -500,8 +500,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although
 // we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8,
-// i.e. the splitting will be dependent on unique producer/consumer for the respective ObjectFifos
-// being split.
+// depending on :-
+//     GCD(unique producer/consumer for the respective ObjectFifos being split, number of columns)
 // To keep the test case concise it demonstrates a similar splitting strategy when the actual
 // compute is taking place in 1 row and 2 columns.
 //

From 164b8049bf1c12cdff4a4315aae0aa915d0fcd62 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Thu, 30 Jan 2025 10:48:18 +0000
Subject: [PATCH 19/20] Review comment v10.0

---
 .../Transforms/test/split_logicalobjfifos.mlir            | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index 56fd218eb..55d2c0618 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -498,12 +498,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // -----
 
-// A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although
-// we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8,
-// depending on :-
+// Although we have 8 columns, L2 LHS buffers need to be split into only 1, L2 RHS into 2 and
+// L2 OUT into 1.
+// This is because we decide the split factor for the L2 ObjectFifo depending on :-
 //     GCD(unique producer/consumer for the respective ObjectFifos being split, number of columns)
-// To keep the test case concise it demonstrates a similar splitting strategy when the actual
-// compute is taking place in 1 row and 2 columns.
 //
 // CHECK-LABEL: @pack_peel_4_level_4x8_Strix
 // CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index

From a91b6905f40048a85595b3eff67440a5ac3595a0 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Thu, 30 Jan 2025 16:33:55 +0530
Subject: [PATCH 20/20] Update
 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp

Co-authored-by: Jorn Tuyls <jtuyls@users.noreply.github.com>
---
 .../iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index cd6483b56..1cf40aba4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -178,7 +178,7 @@ static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifoUsers(
 /// that has product size larger than the other side's product size after
 /// splitting because that's the number of elements that should be
 /// produced/consumed on the respective sides before splitting.
-/// Towards the end fetch the count of unique producer (or consumers) for the
+/// Towards the end fetch the number of unique producers (or consumers) for the
 /// objectFifo which will be split. This would form the split factor which would
 /// be capped by the total no. of columns OR std::gcd of source/target size.
 LogicalResult collectSplittingDims(