From 9c0ca003db1db1902f0939752e16b56f7cb443ef Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Fri, 24 Jan 2025 13:41:44 +0000 Subject: [PATCH 01/20] Update npu4 Ukernel for 4x8 for pack-peel-4-level-tiling pipeline --- compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc index 38c364c6d..fe6568d8d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc @@ -289,6 +289,7 @@ extern "C" { } matmul_combos(matmul_vectorized_c_func, 16, 8, 32) +matmul_combos(matmul_vectorized_c_func, 16, 8, 64) matmul_combos(matmul_vectorized_c_func, 16, 16, 32) matmul_combos(matmul_vectorized_c_func, 32, 32, 32) matmul_combos(matmul_vectorized_c_func, 32, 32, 64) From be66068df8116f0f1e1566eb20a96555a45c66a7 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Fri, 24 Jan 2025 13:42:31 +0000 Subject: [PATCH 02/20] [WIP] Fix split-logicalobjfifos for 4x8 for pack-peel-4-level-tiling --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 74 +++++++++++++++++-- .../AMDAIELogicalObjFifoSplittingUtils.cpp | 18 +++-- .../AMDAIELogicalObjFifoSplittingUtils.h | 8 +- .../test/split_logicalobjfifos.mlir | 63 ++++++++++++++++ 4 files changed, 147 insertions(+), 16 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 89bc0fd05..e563f4059 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -348,23 +348,87 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { return signalPassFailure(); } + // Fetch and store all unique pairs of 
L2<->L1 Copy ops. This would helps us + // figure out the split factor for all LogicalObjectFifos. + DenseMap> uniqueL2L1Pair; + moduleOp->walk([&](Operation *op) -> WalkResult { + if (auto copyOp = dyn_cast(op)) { + auto source = dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + auto target = dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + if (!source || !target) { + return WalkResult::interrupt(); + } + auto sourceFromMemrefOp = + dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + auto targetFromMemrefOp = + dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + if (!sourceFromMemrefOp || !targetFromMemrefOp) { + return WalkResult::interrupt(); + } + Operation *l2DefOp = nullptr; + Operation *l1DefOp = nullptr; + if (source.getMemorySpaceAsUInt() == 1 && + target.getMemorySpaceAsUInt() == 2) { + l2DefOp = sourceFromMemrefOp.getMemref().getDefiningOp(); + l1DefOp = targetFromMemrefOp; + } else if (target.getMemorySpaceAsUInt() == 1 && + source.getMemorySpaceAsUInt() == 2) { + l2DefOp = targetFromMemrefOp.getMemref().getDefiningOp(); + l1DefOp = sourceFromMemrefOp; + } else { + return WalkResult::advance(); + } + uniqueL2L1Pair[l2DefOp].insert(l1DefOp); + return WalkResult::advance(); + } + return WalkResult::advance(); + }); + /// Split the DMA and objectFifo ops based on the calcuated splitting /// dimensions. 
+ DenseMap splitFactorOfLOF; for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) { auto stridedOp = cast(dmaOp.getOperation()); - if (failed(splitDoublyStridedOp( - rewriter, stridedOp, dmaSplitInfo.sourceSplitDim, - dmaSplitInfo.targetSplitDim, dmaSplitInfo.splitSize, - dmaSplitInfo.newSourceStride, dmaSplitInfo.newTargetStride))) { + auto dmaCpyNd = cast(dmaOp.getOperation()); + int64_t splitFactor = dmaSplitInfo.splitSize; + if (stridedOp.getSourceMemorySpaceAsUInt() == 0) { + splitFactor = + uniqueL2L1Pair + [dmaCpyNd.getTarget() + .getDefiningOp() + .getMemref() + .getDefiningOp()] + .size(); + } else if (stridedOp.getTargetMemorySpaceAsUInt() == 0) { + splitFactor = + uniqueL2L1Pair + [dmaCpyNd.getSource() + .getDefiningOp() + .getMemref() + .getDefiningOp()] + .size(); + } + splitFactor = std::gcd(dmaSplitInfo.splitSize, splitFactor); + FailureOr maybeSplitFactor = splitDoublyStridedOp( + rewriter, stridedOp, dmaSplitInfo.sourceSplitDim, + dmaSplitInfo.targetSplitDim, splitFactor, dmaSplitInfo.newSourceStride, + dmaSplitInfo.newTargetStride); + if (failed(maybeSplitFactor)) { LLVM_DEBUG(llvm::dbgs() << "Failed to perform splitting of the DMA op: " << dmaOp); return signalPassFailure(); } + splitFactorOfLOF[dmaCpyNd.getTarget().getDefiningOp()] = *maybeSplitFactor; + splitFactorOfLOF[dmaCpyNd.getSource().getDefiningOp()] = *maybeSplitFactor; } for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) { if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim, - splitInfo.splitSize, + splitFactorOfLOF[objFifo], splitInfo.splitStride))) { LLVM_DEBUG(llvm::dbgs() << "Failed to perform splitting of objectFifo op"); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp index eb92185d5..3a3b67460 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -751,13 +751,15 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, } /// Split doubly strided operations on a source and target split dimension with -/// the provided split factor. -LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, - AMDAIE::DoublyStridedOpInterface op, - size_t sourceSplitDim, size_t targetSplitDim, - std::optional maybeSplitFactor, - int64_t sourceSplitStride, - int64_t targetSplitStride) { +/// the provided split factor which might get updated. On success, return the +/// split factor to the caller, else return failure. +FailureOr splitDoublyStridedOp(IRRewriter &rewriter, + AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim, + size_t targetSplitDim, + std::optional maybeSplitFactor, + int64_t sourceSplitStride, + int64_t targetSplitStride) { if (!op->use_empty()) return op.emitOpError() << "can't be split because it has uses"; SmallVector sourceOffsets = op.getSourceMixedOffsets(); @@ -857,7 +859,7 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, targetOffsets[targetSplitDim] = newTargetOffset.value(); } rewriter.eraseOp(op); - return success(); + return splitFactor; } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h index fee4e510c..8f529feee 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h @@ -38,9 +38,11 @@ LogicalResult splitLogicalObjectFifo( int64_t splitStride = 1); /// Split doubly strided operations on a source and 
target split dimension with -/// the provided split factor. If no split factor is provided, the doubly -/// strided operation will be split on the size of the dimension being split. -LogicalResult splitDoublyStridedOp( +/// the provided split factor which might get updated. On success, return the +/// split factor to the caller, else return failure. +/// NOTE: If no split factor is provided, the doubly strided operation will be +/// split on the size of the dimension being split. +FailureOr splitDoublyStridedOp( IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, size_t sourceSplitDim = 0, size_t targetSplitDim = 0, std::optional splitFactor = std::nullopt, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index f930ad5d0..6af6a0ea8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -471,3 +471,66 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { return } } + +// ----- + +#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}> +#translation = #iree_codegen.translation_info +module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { + func.func @pack_peel_4_level_4x8_Strix() attributes {translation_info = #translation} { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_0 = memref.alloc() : memref<1x1x8x8x8x4xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x8x8x4x8xi32, 2 : i32> + %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32> + %tile_0_1 = amdaie.tile(%c0, %c1) + %lof_0_1 = 
amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32> + %tile_1_1 = amdaie.tile(%c1, %c1) + %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32> + %tile_2_1 = amdaie.tile(%c2, %c1) + %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %alloc_5 = memref.alloc() : memref<512x512xi32> + %tile_0_0 = amdaie.tile(%c0, %c0) + %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo> + %alloc_6 = memref.alloc() : memref<512x4096xi32> + %tile_1_0 = amdaie.tile(%c1, %c0) + %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + %alloc_7 = memref.alloc() : memref<512x4096xi32> + %tile_2_0 = amdaie.tile(%c2, %c0) + %lof_2_0 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_2_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg0, %arg1) in (2, 8) { + %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + scf.forall (%arg2, %arg3) in (2, 2) { + %tile_1_2 = amdaie.tile(%c1, %c2) + %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %tile_0_2 = amdaie.tile(%c0, %c2) + %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> 
!amdaie.logicalobjectfifo> + %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_2, %tile_1_2} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.core(%tile_0_2, in : [%5, %3], out : []) { + amdaie.end + } + %7 = amdaie.core(%tile_1_2, in : [%5, %4], out : []) { + amdaie.end + } + %8 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } + %2 = amdaie.dma_cpy_nd(%lof_2_0[0, 0] [256, 512] [4096, 1], %lof_0_1[0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } + memref.dealloc %alloc_4 : memref<8x8x32x64xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<16x8x64x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<16x8x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<1x1x8x8x4x8xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x1x8x8x8x4xi32, 2 : i32> + memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32> + return + } +} From 2588e1c079bbb5ca0310c74c029f7f233fd921e9 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 27 
Jan 2025 12:13:04 +0000 Subject: [PATCH 03/20] Fix lit test --- .../test/split_logicalobjfifos.mlir | 189 ++++++++++++------ 1 file changed, 123 insertions(+), 66 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index 6af6a0ea8..7708ab9f2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -42,6 +42,7 @@ module { module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -49,8 +50,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = affine.apply #map(%arg1) %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %0[%2, 0] [64, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = 
amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<2x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -93,6 +95,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> @@ -100,8 +103,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = affine.apply #map(%arg2) %3 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1], %1[0, %2] [32, 64] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] 
[1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32> @@ -151,6 +155,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_output(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_2 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> @@ -158,11 +165,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = affine.apply #map(%arg2) %3 = affine.apply #map(%arg1) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 
16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %9 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %8 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32> @@ -175,11 +185,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Test of splitting matmul lhs input objectFifo and dma operations on 4x2 AIE array. // L2 buffer size `[4, 1, 32, 32]` is expected to be split into two `[2, 1, 32, 32]` buffers. -// CHECK-label: func.func @split_L2_input_lhs_on_4x2_array -// CHECK: %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A0]], {} : -// CHECK-SAME: memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A1]], {} : -// CHECK-SAME: memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-LABEL: func.func @split_L2_input_lhs_on_4x2_array +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> +// CHECK: %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]] +// CHECK: %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (4, 2) // CHECK: %[[DMA_L3_TO_L2_A0:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: %[[OBJ_L2_A0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1] @@ -199,22 +209,26 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[DMA_L2_TO_L1_A3:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1] // CHECK-SAME: %[[OBJ_L2_A1]][1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1] -// CHECK: memref.dealloc %[[ALLOC_A0]] : 
memref<2x1x32x32xi32, 1 : i32> -// CHECK: memref.dealloc %[[ALLOC_A1]] : memref<2x1x32x32xi32, 1 : i32> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_lhs_on_4x2_array(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_2 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> scf.forall (%arg1, %arg2) in (4, 2) { %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [128, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 
0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -234,14 +248,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1]) // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : 
i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 + 4)> module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_producer_with_loop_dependency(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -249,8 +265,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { scf.forall (%arg1, %arg2) in (2, 
4) { %3 = affine.apply #map(%arg2) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -265,9 +282,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 
0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) // CHECK: } // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) @@ -276,14 +294,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_consumer_with_loop_dependency(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> scf.forall (%arg1, %arg2) in (2, 4) { %3 = affine.apply #map(%arg2) %4 = amdaie.logicalobjectfifo.from_memref %alloc, 
{} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> @@ -327,15 +347,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1]) // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 
256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 * 2)> #map1 = affine_map<(d0) -> (d0 * 2 + 1)> module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_producer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -344,8 +366,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply #map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : 
memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -363,9 +386,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) -// CHECK-DAG: 
amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) // CHECK: } // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) @@ -375,6 +399,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_consumer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -382,8 +407,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply 
#map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> @@ -396,18 +422,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-LABEL: func.func @change_split_factor_with_gcd_for_producer // CHECK-DAG: %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref 
%{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) { -// CHECK: %[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 
0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: } #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 * 2)> @@ -415,6 +438,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @change_split_factor_with_gcd_for_producer(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = 
amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -423,8 +447,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply #map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -436,25 +461,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-LABEL: @change_split_factor_with_gcd_for_consumer // CHECK-DAG: %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref 
%{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) { -// CHECK: %[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_1]][0, 0, 0, 0, 0, 
0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: } -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1]) #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 * 2)> #map1 = affine_map<(d0) -> (d0 * 2 + 1)> module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @change_split_factor_with_gcd_for_consumer(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : 
memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -462,8 +485,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply #map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32> @@ -474,6 +498,39 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // ----- +// CHECK-LABEL: @pack_peel_4_level_4x8_Strix +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[LHS_L3:.*]] = memref.alloc() : memref<512x512xi32> +// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: 
%[[LOF_LHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[LHS_L3]], {%[[TILE_0_0]]} : +// CHECK: %[[RHS_L3:.*]] = memref.alloc() : memref<512x4096xi32> +// CHECK: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: %[[LOF_RHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS_L3]], {%[[TILE_1_0]]} : +// CHECK: %[[OUT_L3:.*]] = memref.alloc() : memref<512x4096xi32> +// CHECK: %[[TILE_2_0:.*]] = amdaie.tile(%[[C2]], %[[C0]]) +// CHECK: %[[LOF_OUT_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT_L3]], {%[[TILE_2_0]]} : +// CHECK: scf.forall (%{{.*}}, %{{.*}}) in (2, 8) { +// CHECK: amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %[[LOF_LHS_L3]][0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 0] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 256] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: scf.forall (%{{.*}}, %{{.*}}) in (2, 2) { +// CHECK: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK: %[[LOF_RHS_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_1_2]]} : +// CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[LOF_RHS_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} : +// CHECK: %[[LOF_LHS_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]], %[[TILE_1_2]]} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] 
[16384, 2048, 32, 1]) : +// CHECK: amdaie.dma_cpy_nd(%[[LOF_LHS_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : +// CHECK: %[[LOF_OUT_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: amdaie.core +// CHECK: amdaie.core +// CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %[[LOF_OUT_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : +// CHECK: } +// CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L3]][0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : +// CHECK: } #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}> #translation = #iree_codegen.translation_info module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { From 4ed069eade28ca8b0fe88993e9d65e97e27e18e1 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 27 Jan 2025 12:19:27 +0000 Subject: [PATCH 04/20] Add one e2e test 512x512x256 (MxNxK) --- build_tools/ci/cpu_comparison/run.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 61807d232..253f993d8 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -1717,6 +1717,25 @@ def __init__(self): use_chess=False, ) ) + self.register( + Matmul( + 512, + 512, + 256, + "i32", + "i32", + name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling", + tile_pipeline="pack-peel-4-level-tiling", + run_on_target=["npu4"], + aie_compilation_flags=[ + "--iree-amdaie-num-rows=4", + "--iree-amdaie-num-cols=8", + "--iree-amd-aie-additional-peano-opt-flags=-O3", + "--iree-amdaie-enable-function-outlining=True", + ], + 
use_chess=True, + ) + ) for target in ["npu1_4col", "npu4"]: self.register( From be9c5ba56bf67014ccb06e1ccf9c7fb3a2d45c71 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 27 Jan 2025 12:22:36 +0000 Subject: [PATCH 05/20] Add ukernel to e2e CI --- build_tools/ci/cpu_comparison/run.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 253f993d8..a2d908d2e 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -1801,6 +1801,26 @@ def __init__(self): additional_labels=["I8UKernel"], ) ) + self.register( + Matmul( + 64, + 64, + 64, + "bf16", + "f32", + name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling_ukernel", + use_ukernel=True, + tile_pipeline="pack-peel-4-level-tiling", + run_on_target=["npu4"], + aie_compilation_flags=[ + "--iree-amdaie-num-rows=4", + "--iree-amdaie-num-cols=8", + "--iree-amd-aie-additional-peano-opt-flags=-O3", + "--iree-amdaie-enable-function-outlining=True", + ], + use_chess=True, + ) + ) # Matmul test on 2(rows)x2(cols) cores self.register( From 4b91fa31ad60acc66b8d24eee41a1e67db07b585 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Tue, 28 Jan 2025 12:01:24 +0000 Subject: [PATCH 06/20] Review comments v1.0 --- build_tools/ci/cpu_comparison/run.py | 6 -- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 75 ++++++++++++------- .../test/split_logicalobjfifos.mlir | 66 ++++++++-------- 3 files changed, 79 insertions(+), 68 deletions(-) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index a2d908d2e..d96452e67 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -1730,10 +1730,7 @@ def __init__(self): aie_compilation_flags=[ "--iree-amdaie-num-rows=4", "--iree-amdaie-num-cols=8", - "--iree-amd-aie-additional-peano-opt-flags=-O3", - "--iree-amdaie-enable-function-outlining=True", ], - use_chess=True, ) ) @@ 
-1815,10 +1812,7 @@ def __init__(self): aie_compilation_flags=[ "--iree-amdaie-num-rows=4", "--iree-amdaie-num-cols=8", - "--iree-amd-aie-additional-peano-opt-flags=-O3", - "--iree-amdaie-enable-function-outlining=True", ], - use_chess=True, ) ) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index e563f4059..088b77518 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -349,7 +349,23 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { } // Fetch and store all unique pairs of L2<->L1 Copy ops. This would helps us - // figure out the split factor for all LogicalObjectFifos. + // figure out the split factor for all LogicalObjectFifos. Basically we get to + // decide how many splits to perform for a particular L2 ObjectFifo based on + // the total unique L2<->L1 Copy ops. + // Eg: + // %lhs = LOF_on_L2 + // %a = LOF_on_L1_0 + // %b = LOF_on_L1_1 + // %c = LOF_on_L1_2 + // DMA(%a, %lhs) + // DMA(%b, %lhs) + // DMA(%c, %lhs) + // DMA(%b, %lhs) + // DMA(%c, %lhs) + // + // In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of + // them are unique. Hence we'd split %lhs into 3 unique splits, instead + // of 5. DenseMap> uniqueL2L1Pair; moduleOp->walk([&](Operation *op) -> WalkResult { if (auto copyOp = dyn_cast(op)) { @@ -371,17 +387,20 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { } Operation *l2DefOp = nullptr; Operation *l1DefOp = nullptr; - if (source.getMemorySpaceAsUInt() == 1 && - target.getMemorySpaceAsUInt() == 2) { + // L2 -> L1. 
+ if (target.getMemorySpaceAsUInt() == 2) { l2DefOp = sourceFromMemrefOp.getMemref().getDefiningOp(); l1DefOp = targetFromMemrefOp; - } else if (target.getMemorySpaceAsUInt() == 1 && - source.getMemorySpaceAsUInt() == 2) { + } else if (source.getMemorySpaceAsUInt() == 2) { + // L1 -> L2. l2DefOp = targetFromMemrefOp.getMemref().getDefiningOp(); l1DefOp = sourceFromMemrefOp; } else { return WalkResult::advance(); } + if (!l2DefOp || !l1DefOp) { + return WalkResult::interrupt(); + } uniqueL2L1Pair[l2DefOp].insert(l1DefOp); return WalkResult::advance(); } @@ -392,30 +411,31 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { /// dimensions. DenseMap splitFactorOfLOF; for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) { - auto stridedOp = - cast(dmaOp.getOperation()); auto dmaCpyNd = cast(dmaOp.getOperation()); int64_t splitFactor = dmaSplitInfo.splitSize; - if (stridedOp.getSourceMemorySpaceAsUInt() == 0) { - splitFactor = - uniqueL2L1Pair - [dmaCpyNd.getTarget() - .getDefiningOp() - .getMemref() - .getDefiningOp()] - .size(); - } else if (stridedOp.getTargetMemorySpaceAsUInt() == 0) { - splitFactor = - uniqueL2L1Pair - [dmaCpyNd.getSource() - .getDefiningOp() - .getMemref() - .getDefiningOp()] - .size(); + auto sourceDefOp = + dmaCpyNd.getSource() + .getDefiningOp(); + auto targetDefOp = + dmaCpyNd.getTarget() + .getDefiningOp(); + if (!sourceDefOp || !targetDefOp) { + LLVM_DEBUG(llvm::dbgs() + << "Expected defining op of source/target for : " << dmaOp); + return signalPassFailure(); + } + if (dmaCpyNd.getSourceMemorySpaceAsUInt() == 0) { + if (Operation *l2DefOp = targetDefOp.getMemref().getDefiningOp()) + splitFactor = uniqueL2L1Pair[l2DefOp].size(); + } else if (dmaCpyNd.getTargetMemorySpaceAsUInt() == 0) { + if (Operation *l2DefOp = sourceDefOp.getMemref().getDefiningOp()) + splitFactor = uniqueL2L1Pair[l2DefOp].size(); } + // In cases where the number of available columns < the inferred split + // factor, we'll cap the final split factor by the lower 
bound. splitFactor = std::gcd(dmaSplitInfo.splitSize, splitFactor); FailureOr maybeSplitFactor = splitDoublyStridedOp( - rewriter, stridedOp, dmaSplitInfo.sourceSplitDim, + rewriter, dmaCpyNd, dmaSplitInfo.sourceSplitDim, dmaSplitInfo.targetSplitDim, splitFactor, dmaSplitInfo.newSourceStride, dmaSplitInfo.newTargetStride); if (failed(maybeSplitFactor)) { @@ -423,8 +443,11 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { << "Failed to perform splitting of the DMA op: " << dmaOp); return signalPassFailure(); } - splitFactorOfLOF[dmaCpyNd.getTarget().getDefiningOp()] = *maybeSplitFactor; - splitFactorOfLOF[dmaCpyNd.getSource().getDefiningOp()] = *maybeSplitFactor; + // The above function might change the split factor based on divisibility + // with source/target. Therefore here we maintain the final split factor + // which we'll use later to split the LogicalObjectFifo. + splitFactorOfLOF[targetDefOp] = *maybeSplitFactor; + splitFactorOfLOF[sourceDefOp] = *maybeSplitFactor; } for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) { if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index 7708ab9f2..daaf5e81c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -498,38 +498,30 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // ----- +// A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although +// we have 8 columns, the L2 LHS buffer needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8. +// But to keep the test case concise it demonstrates a similar splitting strategy for 1 row and 2 columns.
+// Therefore L2 LHS will be split into 1 and L2 RHS will be split into 2. This needs to happen because +// later in the compilation stack when the tiles are being assigned, we will +// Refer: https://github.com/nod-ai/iree-amd-aie/pull/1031#discussion_r1920237380 +// // CHECK-LABEL: @pack_peel_4_level_4x8_Strix // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[LHS_L3:.*]] = memref.alloc() : memref<512x512xi32> -// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[LOF_LHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[LHS_L3]], {%[[TILE_0_0]]} : -// CHECK: %[[RHS_L3:.*]] = memref.alloc() : memref<512x4096xi32> -// CHECK: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK: %[[LOF_RHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS_L3]], {%[[TILE_1_0]]} : -// CHECK: %[[OUT_L3:.*]] = memref.alloc() : memref<512x4096xi32> -// CHECK: %[[TILE_2_0:.*]] = amdaie.tile(%[[C2]], %[[C0]]) -// CHECK: %[[LOF_OUT_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT_L3]], {%[[TILE_2_0]]} : // CHECK: scf.forall (%{{.*}}, %{{.*}}) in (2, 8) { -// CHECK: amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %[[LOF_LHS_L3]][0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, -// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 0] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo>, -// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %[[LOF_RHS_L3]][0, 256] [512, 256] [4096, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %{{.*}}[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 0] 
[512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 128] [512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo>, // CHECK: scf.forall (%{{.*}}, %{{.*}}) in (2, 2) { -// CHECK: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK: %[[LOF_RHS_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_1_2]]} : -// CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %[[LOF_RHS_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} : -// CHECK: %[[LOF_LHS_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]], %[[TILE_1_2]]} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : -// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : -// CHECK: amdaie.dma_cpy_nd(%[[LOF_LHS_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : -// CHECK: %[[LOF_OUT_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.*}}, {%[[TILE_0_2]]} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_1]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 
64, 1]) : // CHECK: amdaie.core // CHECK: amdaie.core -// CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %[[LOF_OUT_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : +// CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : // CHECK: } -// CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L3]][0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : // CHECK: } #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}> #translation = #iree_codegen.translation_info @@ -542,34 +534,36 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %alloc_0 = memref.alloc() : memref<1x1x8x8x8x4xi32, 2 : i32> %alloc_1 = memref.alloc() : memref<1x1x8x8x4x8xi32, 2 : i32> %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32> - %tile_0_1 = amdaie.tile(%c0, %c1) - %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32> - %tile_1_1 = amdaie.tile(%c1, %c1) - %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32> - %tile_2_1 = amdaie.tile(%c2, %c1) - %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %alloc_5 = memref.alloc() : memref<512x512xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : 
memref<512x512xi32> -> !amdaie.logicalobjectfifo> %alloc_6 = memref.alloc() : memref<512x4096xi32> - %tile_1_0 = amdaie.tile(%c1, %c0) - %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> %alloc_7 = memref.alloc() : memref<512x4096xi32> + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_2_1 = amdaie.tile(%c2, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_0 = amdaie.tile(%c1, %c0) %tile_2_0 = amdaie.tile(%c2, %c0) + %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo> + %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> %lof_2_0 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_2_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> scf.forall (%arg0, %arg1) in (2, 8) { %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) scf.forall (%arg2, %arg3) in (2, 2) { + %of0 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg2) + %of1 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg2) %tile_1_2 = amdaie.tile(%c1, %c2) - %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> 
!amdaie.logicalobjectfifo> %tile_0_2 = amdaie.tile(%c0, %c2) + %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_2, %tile_1_2} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of1, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %6 = amdaie.core(%tile_0_2, in : [%5, %3], out : []) { From 37771f855f8171daecc53686f18a39633dc0d541 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 06:57:42 +0000 Subject: [PATCH 07/20] Review comment v2.0 --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 201 
+++++++++--------- .../AMDAIELogicalObjFifoSplittingUtils.cpp | 27 +-- .../AMDAIELogicalObjFifoSplittingUtils.h | 2 +- 3 files changed, 109 insertions(+), 121 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 088b77518..298e24c6e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -121,10 +121,77 @@ FailureOr getSplitStride(ArrayRef dmaOps, return splitStride; } +/// Fetch and store all unique pairs of L2<->L1 Copy ops. This would helps us +/// figure out the split factor for all LogicalObjectFifos. Basically we get to +/// decide how many splits to perform for a particular L2 ObjectFifo based on +/// the total unique L2<->L1 Copy ops. +/// Eg: +/// %lhs = LOF_on_L2 +/// %a = LOF_on_L1_0 +/// %b = LOF_on_L1_1 +/// %c = LOF_on_L1_2 +/// DMA(%a, %lhs) +/// DMA(%b, %lhs) +/// DMA(%c, %lhs) +/// DMA(%b, %lhs) +/// DMA(%c, %lhs) +/// +/// In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of +/// them are unique. Hence we'd split %lhs into 3 unique splits, instead +/// of 5. 
+static DenseMap fetchUniqueL2L1(ModuleOp moduleOp) { + DenseMap> uniqueL2L1Pair; + moduleOp->walk([&](Operation *op) -> WalkResult { + if (auto copyOp = dyn_cast(op)) { + auto source = dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + auto target = dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + if (!source || !target) { + return WalkResult::interrupt(); + } + auto sourceFromMemrefOp = + dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + auto targetFromMemrefOp = + dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + if (!sourceFromMemrefOp || !targetFromMemrefOp) { + return WalkResult::interrupt(); + } + Operation *l2LofOp = nullptr; + Operation *l1LofOp = nullptr; + // L2 -> L1. + if (source.getMemorySpaceAsUInt() == 1 && + target.getMemorySpaceAsUInt() == 2) { + l2LofOp = sourceFromMemrefOp; + l1LofOp = targetFromMemrefOp; + } else if (source.getMemorySpaceAsUInt() == 2 && + target.getMemorySpaceAsUInt() == 1) { + // L1 -> L2. + l2LofOp = targetFromMemrefOp; + l1LofOp = sourceFromMemrefOp; + } else { + return WalkResult::advance(); + } + uniqueL2L1Pair[l2LofOp].insert(l1LofOp); + return WalkResult::advance(); + } + return WalkResult::advance(); + }); + + DenseMap uniqueL2L1Count; + for (auto &&[l2Lof, l1Lofs] : uniqueL2L1Pair) + uniqueL2L1Count[l2Lof] = l1Lofs.size(); + + return uniqueL2L1Count; +} + /// Find the logical objectFifo and DMA source/target splitting dimensions for /// each DMA and objectFifo pair. /// -/// Each pair is handled in the following way: +/// At first we find count of total unique L2<->L1 pairs for all L2 objectFifos. +/// Then each DMA and objectFifo pair is handled in the following way: /// First, compute the objectFifo splitting dimension based on the last non-unit /// shape dimension and the number of available columns. 
Afterwards, depending /// on which logical objectFifo is being split on, find the outermost dimension @@ -139,11 +206,12 @@ FailureOr getSplitStride(ArrayRef dmaOps, /// splitting because that's the number of elements that should be /// produced/consumed on the respective sides before splitting. LogicalResult collectSplittingDims( - const SmallVector &dmaObjFifoPairs, + ModuleOp &moduleOp, const SmallVector &dmaObjFifoPairs, DenseMap &dmaSplitInfoMap, DenseMap &objFifoSplitInfoMap, int64_t numCols) { + DenseMap uniqueL2L1Pair = fetchUniqueL2L1(moduleOp); for (auto [dmaOp, objFifo] : dmaObjFifoPairs) { LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n"); LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n"); @@ -218,6 +286,12 @@ LogicalResult collectSplittingDims( // Calculate the new source stride to be used for splitting the DMA. int64_t newSourceStride = splitStride != 1 ? splitDimSize / splitStride : 1; + int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols); + int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; + int64_t targetSize = (*targetSizes)[targetSplitDim]; + if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { + splitFactor = std::gcd(sourceSize, targetSize); + } LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() @@ -225,10 +299,11 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n"); - LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim, - 1, numCols}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride}; + 1, splitFactor}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, + 
splitStride}; } else if (dmaOp.getSourceObjectFifo() == objFifo) { // Find outermost dimension in the access pattern that has stride == // sizeAfterSplit and size != 1. @@ -274,6 +349,12 @@ LogicalResult collectSplittingDims( // Calculate the new target stride to be used for splitting the DMA. int64_t newTargetStride = splitStride != 1 ? splitDimSize / splitStride : 1; + int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols); + int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; + int64_t targetSize = (*targetSizes)[targetSplitDim]; + if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { + splitFactor = std::gcd(sourceSize, targetSize); + } LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() @@ -281,10 +362,11 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n"); - LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim, - newTargetStride, numCols}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride}; + newTargetStride, splitFactor}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, + splitStride}; } } return success(); @@ -343,115 +425,28 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { DenseMap dmaSplitInfoMap; DenseMap objFifoSplitInfoMap; - if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap, + if (failed(collectSplittingDims(moduleOp, dmaObjFifoPairs, dmaSplitInfoMap, objFifoSplitInfoMap, numColumns))) { return signalPassFailure(); } - // Fetch and store all unique pairs of L2<->L1 Copy ops. This would helps us - // figure out the split factor for all LogicalObjectFifos. 
Basically we get to - // decide how many splits to perform for a particular L2 ObjectFifo based on - // the total unique L2<->L1 Copy ops. - // Eg: - // %lhs = LOF_on_L2 - // %a = LOF_on_L1_0 - // %b = LOF_on_L1_1 - // %c = LOF_on_L1_2 - // DMA(%a, %lhs) - // DMA(%b, %lhs) - // DMA(%c, %lhs) - // DMA(%b, %lhs) - // DMA(%c, %lhs) - // - // In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of - // them are unique. Hence we'd split %lhs into 3 unique splits, instead - // of 5. - DenseMap> uniqueL2L1Pair; - moduleOp->walk([&](Operation *op) -> WalkResult { - if (auto copyOp = dyn_cast(op)) { - auto source = dyn_cast_if_present( - copyOp.getSource().getDefiningOp()); - auto target = dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()); - if (!source || !target) { - return WalkResult::interrupt(); - } - auto sourceFromMemrefOp = - dyn_cast_if_present( - copyOp.getSource().getDefiningOp()); - auto targetFromMemrefOp = - dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()); - if (!sourceFromMemrefOp || !targetFromMemrefOp) { - return WalkResult::interrupt(); - } - Operation *l2DefOp = nullptr; - Operation *l1DefOp = nullptr; - // L2 -> L1. - if (target.getMemorySpaceAsUInt() == 2) { - l2DefOp = sourceFromMemrefOp.getMemref().getDefiningOp(); - l1DefOp = targetFromMemrefOp; - } else if (source.getMemorySpaceAsUInt() == 2) { - // L1 -> L2. - l2DefOp = targetFromMemrefOp.getMemref().getDefiningOp(); - l1DefOp = sourceFromMemrefOp; - } else { - return WalkResult::advance(); - } - if (!l2DefOp || !l1DefOp) { - return WalkResult::interrupt(); - } - uniqueL2L1Pair[l2DefOp].insert(l1DefOp); - return WalkResult::advance(); - } - return WalkResult::advance(); - }); - /// Split the DMA and objectFifo ops based on the calcuated splitting /// dimensions. 
- DenseMap splitFactorOfLOF; for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) { - auto dmaCpyNd = cast(dmaOp.getOperation()); - int64_t splitFactor = dmaSplitInfo.splitSize; - auto sourceDefOp = - dmaCpyNd.getSource() - .getDefiningOp(); - auto targetDefOp = - dmaCpyNd.getTarget() - .getDefiningOp(); - if (!sourceDefOp || !targetDefOp) { - LLVM_DEBUG(llvm::dbgs() - << "Expected defining op of source/target for : " << dmaOp); - return signalPassFailure(); - } - if (dmaCpyNd.getSourceMemorySpaceAsUInt() == 0) { - if (Operation *l2DefOp = targetDefOp.getMemref().getDefiningOp()) - splitFactor = uniqueL2L1Pair[l2DefOp].size(); - } else if (dmaCpyNd.getTargetMemorySpaceAsUInt() == 0) { - if (Operation *l2DefOp = sourceDefOp.getMemref().getDefiningOp()) - splitFactor = uniqueL2L1Pair[l2DefOp].size(); - } - // In cases where the number of available columns < the inferred split - // factor, we'll cap the final split factor by the lower bound. - splitFactor = std::gcd(dmaSplitInfo.splitSize, splitFactor); - FailureOr maybeSplitFactor = splitDoublyStridedOp( - rewriter, dmaCpyNd, dmaSplitInfo.sourceSplitDim, - dmaSplitInfo.targetSplitDim, splitFactor, dmaSplitInfo.newSourceStride, - dmaSplitInfo.newTargetStride); - if (failed(maybeSplitFactor)) { + auto stridedOp = + cast(dmaOp.getOperation()); + if (failed(splitDoublyStridedOp( + rewriter, stridedOp, dmaSplitInfo.sourceSplitDim, + dmaSplitInfo.targetSplitDim, dmaSplitInfo.splitSize, + dmaSplitInfo.newSourceStride, dmaSplitInfo.newTargetStride))) { LLVM_DEBUG(llvm::dbgs() << "Failed to perform splitting of the DMA op: " << dmaOp); return signalPassFailure(); } - // The above function might change the split factor based on divisibility - // with source/target. Therefore here we maintain the final split factor - // which we'll use later to split the LogicalObjectFifo. 
- splitFactorOfLOF[targetDefOp] = *maybeSplitFactor; - splitFactorOfLOF[sourceDefOp] = *maybeSplitFactor; } for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) { if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim, - splitFactorOfLOF[objFifo], + splitInfo.splitSize, splitInfo.splitStride))) { LLVM_DEBUG(llvm::dbgs() << "Failed to perform splitting of objectFifo op"); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp index 3a3b67460..0ec9f0532 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -753,13 +753,12 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, /// Split doubly strided operations on a source and target split dimension with /// the provided split factor which might get updated. On success, return the /// split factor to the caller, else return failure. -FailureOr splitDoublyStridedOp(IRRewriter &rewriter, - AMDAIE::DoublyStridedOpInterface op, - size_t sourceSplitDim, - size_t targetSplitDim, - std::optional maybeSplitFactor, - int64_t sourceSplitStride, - int64_t targetSplitStride) { +LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, + AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim, size_t targetSplitDim, + std::optional maybeSplitFactor, + int64_t sourceSplitStride, + int64_t targetSplitStride) { if (!op->use_empty()) return op.emitOpError() << "can't be split because it has uses"; SmallVector sourceOffsets = op.getSourceMixedOffsets(); @@ -802,15 +801,9 @@ FailureOr splitDoublyStridedOp(IRRewriter &rewriter, } int64_t sourceSize = maybeSourceSize.value(); int64_t targetSize = maybeTargetSize.value(); - int64_t splitFactor = maybeSplitFactor.has_value() - ? 
maybeSplitFactor.value() - : std::gcd(sourceSize, targetSize); - if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { - int64_t newSplitFactor = std::gcd(sourceSize, targetSize); - LLVM_DEBUG(llvm::dbgs() << "split factor has been changed from " - << splitFactor << " to " << newSplitFactor); - splitFactor = newSplitFactor; - } + assert(maybeSplitFactor.has_value() && + "expected split factor to be sent by the caller"); + int64_t splitFactor = maybeSplitFactor.value(); int64_t newSourceSize = sourceSize / splitFactor; int64_t newTargetSize = targetSize / splitFactor; @@ -859,7 +852,7 @@ FailureOr splitDoublyStridedOp(IRRewriter &rewriter, targetOffsets[targetSplitDim] = newTargetOffset.value(); } rewriter.eraseOp(op); - return splitFactor; + return success(); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h index 8f529feee..a39686602 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h @@ -42,7 +42,7 @@ LogicalResult splitLogicalObjectFifo( /// split factor to the caller, else return failure. /// NOTE: If no split factor is provided, the doubly strided operation will be /// split on the size of the dimension being split. 
-FailureOr splitDoublyStridedOp( +LogicalResult splitDoublyStridedOp( IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, size_t sourceSplitDim = 0, size_t targetSplitDim = 0, std::optional splitFactor = std::nullopt, From fe4f363e0e1408634dff5dc5680f902d3548598e Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 07:30:07 +0000 Subject: [PATCH 08/20] use_chess Strix --- build_tools/ci/cpu_comparison/run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index d96452e67..ef33edc00 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -1813,6 +1813,7 @@ def __init__(self): "--iree-amdaie-num-rows=4", "--iree-amdaie-num-cols=8", ], + use_chess=True, ) ) From b58611738e8412ac14d3ced67ec80e6a8fffd433 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 10:59:12 +0000 Subject: [PATCH 09/20] Review comment v3.0 --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 96 +++++++------------ .../AMDAIELogicalObjFifoSplittingUtils.cpp | 8 +- .../AMDAIELogicalObjFifoSplittingUtils.h | 8 +- 3 files changed, 41 insertions(+), 71 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 298e24c6e..d1b07d772 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -121,7 +121,7 @@ FailureOr getSplitStride(ArrayRef dmaOps, return splitStride; } -/// Fetch and store all unique pairs of L2<->L1 Copy ops. This would helps us +/// Fetch total no. of unique pairs of L2<->L1 Copy ops. This would helps us /// figure out the split factor for all LogicalObjectFifos. 
Basically we get to /// decide how many splits to perform for a particular L2 ObjectFifo based on /// the total unique L2<->L1 Copy ops. @@ -137,61 +137,30 @@ FailureOr getSplitStride(ArrayRef dmaOps, /// DMA(%c, %lhs) /// /// In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of -/// them are unique. Hence we'd split %lhs into 3 unique splits, instead -/// of 5. -static DenseMap fetchUniqueL2L1(ModuleOp moduleOp) { - DenseMap> uniqueL2L1Pair; - moduleOp->walk([&](Operation *op) -> WalkResult { - if (auto copyOp = dyn_cast(op)) { - auto source = dyn_cast_if_present( - copyOp.getSource().getDefiningOp()); - auto target = dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()); - if (!source || !target) { - return WalkResult::interrupt(); - } - auto sourceFromMemrefOp = - dyn_cast_if_present( - copyOp.getSource().getDefiningOp()); - auto targetFromMemrefOp = - dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()); - if (!sourceFromMemrefOp || !targetFromMemrefOp) { - return WalkResult::interrupt(); - } - Operation *l2LofOp = nullptr; - Operation *l1LofOp = nullptr; - // L2 -> L1. - if (source.getMemorySpaceAsUInt() == 1 && - target.getMemorySpaceAsUInt() == 2) { - l2LofOp = sourceFromMemrefOp; - l1LofOp = targetFromMemrefOp; - } else if (source.getMemorySpaceAsUInt() == 2 && - target.getMemorySpaceAsUInt() == 1) { - // L1 -> L2. - l2LofOp = targetFromMemrefOp; - l1LofOp = sourceFromMemrefOp; - } else { - return WalkResult::advance(); - } - uniqueL2L1Pair[l2LofOp].insert(l1LofOp); - return WalkResult::advance(); +/// them are unique. Hence we'd split %lhs into 3 unique splits, instead of 5. 
+static FailureOr fetchTotalUniqueL2L1(SmallVector copyLikeOps, bool fetchTarget) { + DenseSet uniqueLof; + for (CopyOpInterface copyOp : copyLikeOps) { + AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr; + if (fetchTarget) { + lof = dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + } else { + lof = dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); } - return WalkResult::advance(); - }); - - DenseMap uniqueL2L1Count; - for (auto &&[l2Lof, l1Lofs] : uniqueL2L1Pair) - uniqueL2L1Count[l2Lof] = l1Lofs.size(); - - return uniqueL2L1Count; + if (!lof) { + return copyOp.emitOpError()<< "could not retrieve source/target objectFifo"; + } + uniqueLof.insert(lof); + } + return uniqueLof.size(); } /// Find the logical objectFifo and DMA source/target splitting dimensions for /// each DMA and objectFifo pair. /// -/// At first we find count of total unique L2<->L1 pairs for all L2 objectFifos. -/// Then each DMA and objectFifo pair is handled in the following way: +/// Each pair is handled in the following way: /// First, compute the objectFifo splitting dimension based on the last non-unit /// shape dimension and the number of available columns. Afterwards, depending /// on which logical objectFifo is being split on, find the outermost dimension @@ -205,13 +174,14 @@ static DenseMap fetchUniqueL2L1(ModuleOp moduleOp) { /// that has product size larger than the other side's product size after /// splitting because that's the number of elements that should be /// produced/consumed on the respective sides before splitting. +/// Towards the end fetch the count of unique L2<->L1 for the objectFifo which +/// will be split. This would form the split factor which would be capped by the +/// total no. of columns OR std::gcd of source/target size. 
LogicalResult collectSplittingDims( ModuleOp &moduleOp, const SmallVector &dmaObjFifoPairs, DenseMap &dmaSplitInfoMap, DenseMap - &objFifoSplitInfoMap, - int64_t numCols) { - DenseMap uniqueL2L1Pair = fetchUniqueL2L1(moduleOp); + &objFifoSplitInfoMap, int64_t numCols) { for (auto [dmaOp, objFifo] : dmaObjFifoPairs) { LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n"); LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n"); @@ -286,7 +256,12 @@ LogicalResult collectSplittingDims( // Calculate the new source stride to be used for splitting the DMA. int64_t newSourceStride = splitStride != 1 ? splitDimSize / splitStride : 1; - int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols); + FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true); + if (failed(maybeUniqueL2L1)) { + objFifo.emitOpError() + << "could not retrieve total unique L2<->L1 pairs"; + } + int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; int64_t targetSize = (*targetSizes)[targetSplitDim]; if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { @@ -302,8 +277,7 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim, 1, splitFactor}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, - splitStride}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride}; } else if (dmaOp.getSourceObjectFifo() == objFifo) { // Find outermost dimension in the access pattern that has stride == // sizeAfterSplit and size != 1. @@ -349,7 +323,12 @@ LogicalResult collectSplittingDims( // Calculate the new target stride to be used for splitting the DMA. int64_t newTargetStride = splitStride != 1 ? 
splitDimSize / splitStride : 1; - int64_t splitFactor = std::gcd(uniqueL2L1Pair[objFifo], numCols); + FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeProducers(), /*fetchTarget=*/false); + if (failed(maybeUniqueL2L1)) { + objFifo.emitOpError() + << "could not retrieve total unique L2<->L1 pairs"; + } + int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; int64_t targetSize = (*targetSizes)[targetSplitDim]; if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { @@ -365,8 +344,7 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim, newTargetStride, splitFactor}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, - splitStride}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride}; } } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp index 0ec9f0532..afde8fe8d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -751,12 +751,11 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, } /// Split doubly strided operations on a source and target split dimension with -/// the provided split factor which might get updated. On success, return the -/// split factor to the caller, else return failure. +/// the provided split factor. 
LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, size_t sourceSplitDim, size_t targetSplitDim, - std::optional maybeSplitFactor, + int64_t splitFactor, int64_t sourceSplitStride, int64_t targetSplitStride) { if (!op->use_empty()) @@ -801,9 +800,6 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, } int64_t sourceSize = maybeSourceSize.value(); int64_t targetSize = maybeTargetSize.value(); - assert(maybeSplitFactor.has_value() && - "expected split factor to be sent by the caller"); - int64_t splitFactor = maybeSplitFactor.value(); int64_t newSourceSize = sourceSize / splitFactor; int64_t newTargetSize = targetSize / splitFactor; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h index a39686602..a9a22c471 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h @@ -38,15 +38,11 @@ LogicalResult splitLogicalObjectFifo( int64_t splitStride = 1); /// Split doubly strided operations on a source and target split dimension with -/// the provided split factor which might get updated. On success, return the -/// split factor to the caller, else return failure. -/// NOTE: If no split factor is provided, the doubly strided operation will be -/// split on the size of the dimension being split. +/// the provided split factor. 
LogicalResult splitDoublyStridedOp( IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, size_t sourceSplitDim = 0, size_t targetSplitDim = 0, - std::optional splitFactor = std::nullopt, - int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1); + int64_t splitFactor, int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1); } // namespace mlir::iree_compiler::AMDAIE From f9d8a990d1230196cc47f1337d0f89164710f248 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 11:00:04 +0000 Subject: [PATCH 10/20] Fix getCopyLikeConsumer/Producer APIs --- .../IR/AMDAIELogicalObjFifoOpInterface.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp index 48dc97d98..89fb8b57a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp @@ -17,10 +17,10 @@ SmallVector getCopyLikeConsumers( LogicalObjFifoOpInterface op) { SmallVector copyLikOps; for (Operation *userOp : op->getUsers()) { - if (auto copyOp = dyn_cast(userOp); - dyn_cast_if_present( - copyOp.getSource().getDefiningOp()) == op) { - copyLikOps.push_back(copyOp); + if (auto copyOp = dyn_cast(userOp)) { + if (dyn_cast_if_present( + copyOp.getSource().getDefiningOp()) == op) + copyLikOps.push_back(copyOp); } } return copyLikOps; @@ -30,10 +30,10 @@ SmallVector getCopyLikeProducers( LogicalObjFifoOpInterface op) { SmallVector copyLikOps; for (Operation *userOp : op->getUsers()) { - if (auto copyOp = dyn_cast(userOp); - dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()) == op) { - copyLikOps.push_back(copyOp); + if (auto copyOp = dyn_cast(userOp)) { + if (dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()) == op) + copyLikOps.push_back(copyOp); } } return 
copyLikOps; From e6f6b780f4e877655f4270de9bb148dbdfcdf2ea Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 11:02:07 +0000 Subject: [PATCH 11/20] Pre-commit fixes --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 30 ++++++++++++------- .../AMDAIELogicalObjFifoSplittingUtils.h | 4 +-- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index d1b07d772..a78d12e08 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -137,20 +137,23 @@ FailureOr getSplitStride(ArrayRef dmaOps, /// DMA(%c, %lhs) /// /// In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of -/// them are unique. Hence we'd split %lhs into 3 unique splits, instead of 5. -static FailureOr fetchTotalUniqueL2L1(SmallVector copyLikeOps, bool fetchTarget) { - DenseSet uniqueLof; +/// them are unique. Hence we'd split %lhs into 3 unique splits, instead +/// of 5. 
+static FailureOr fetchTotalUniqueL2L1( + SmallVector copyLikeOps, bool fetchTarget) { + DenseSet uniqueLof; for (CopyOpInterface copyOp : copyLikeOps) { AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr; if (fetchTarget) { lof = dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()); + copyOp.getTarget().getDefiningOp()); } else { lof = dyn_cast_if_present( - copyOp.getSource().getDefiningOp()); + copyOp.getSource().getDefiningOp()); } if (!lof) { - return copyOp.emitOpError()<< "could not retrieve source/target objectFifo"; + return copyOp.emitOpError() + << "could not retrieve source/target objectFifo"; } uniqueLof.insert(lof); } @@ -181,7 +184,8 @@ LogicalResult collectSplittingDims( ModuleOp &moduleOp, const SmallVector &dmaObjFifoPairs, DenseMap &dmaSplitInfoMap, DenseMap - &objFifoSplitInfoMap, int64_t numCols) { + &objFifoSplitInfoMap, + int64_t numCols) { for (auto [dmaOp, objFifo] : dmaObjFifoPairs) { LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n"); LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n"); @@ -256,7 +260,8 @@ LogicalResult collectSplittingDims( // Calculate the new source stride to be used for splitting the DMA. int64_t newSourceStride = splitStride != 1 ? 
splitDimSize / splitStride : 1; - FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true); + FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1( + objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true); if (failed(maybeUniqueL2L1)) { objFifo.emitOpError() << "could not retrieve total unique L2<->L1 pairs"; @@ -277,7 +282,8 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim, 1, splitFactor}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, + splitStride}; } else if (dmaOp.getSourceObjectFifo() == objFifo) { // Find outermost dimension in the access pattern that has stride == // sizeAfterSplit and size != 1. @@ -323,7 +329,8 @@ LogicalResult collectSplittingDims( // Calculate the new target stride to be used for splitting the DMA. int64_t newTargetStride = splitStride != 1 ? 
splitDimSize / splitStride : 1; - FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1(objFifo.getCopyLikeProducers(), /*fetchTarget=*/false); + FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1( + objFifo.getCopyLikeProducers(), /*fetchTarget=*/false); if (failed(maybeUniqueL2L1)) { objFifo.emitOpError() << "could not retrieve total unique L2<->L1 pairs"; @@ -344,7 +351,8 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim, newTargetStride, splitFactor}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, splitStride}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, + splitStride}; } } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h index a9a22c471..0905795fd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h @@ -41,8 +41,8 @@ LogicalResult splitLogicalObjectFifo( /// the provided split factor. 
LogicalResult splitDoublyStridedOp( IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, - size_t sourceSplitDim = 0, size_t targetSplitDim = 0, - int64_t splitFactor, int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1); + size_t sourceSplitDim = 0, size_t targetSplitDim = 0, int64_t splitFactor, + int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1); } // namespace mlir::iree_compiler::AMDAIE From 17032714196ead5ba69dbde824f1cd846416f225 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 11:07:14 +0000 Subject: [PATCH 12/20] Remove moduleOp - not needed anymore --- .../iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index a78d12e08..00a743e77 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -181,7 +181,7 @@ static FailureOr fetchTotalUniqueL2L1( /// will be split. This would form the split factor which would be capped by the /// total no. of columns OR std::gcd of source/target size. 
LogicalResult collectSplittingDims( - ModuleOp &moduleOp, const SmallVector &dmaObjFifoPairs, + const SmallVector &dmaObjFifoPairs, DenseMap &dmaSplitInfoMap, DenseMap &objFifoSplitInfoMap, @@ -411,7 +411,7 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { DenseMap dmaSplitInfoMap; DenseMap objFifoSplitInfoMap; - if (failed(collectSplittingDims(moduleOp, dmaObjFifoPairs, dmaSplitInfoMap, + if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap, objFifoSplitInfoMap, numColumns))) { return signalPassFailure(); } From 509fb6d40379e39ee7d0ee0e4ded75450f531d41 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 14:27:04 +0000 Subject: [PATCH 13/20] Review v4.0 --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 26 ++++++++++--------- .../AMDAIELogicalObjFifoSplittingUtils.h | 11 +++++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 00a743e77..785708563 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -121,11 +121,10 @@ FailureOr getSplitStride(ArrayRef dmaOps, return splitStride; } -/// Fetch total no. of unique pairs of L2<->L1 Copy ops. This would helps us -/// figure out the split factor for all LogicalObjectFifos. Basically we get to -/// decide how many splits to perform for a particular L2 ObjectFifo based on -/// the total unique L2<->L1 Copy ops. -/// Eg: +/// Given a list of Copy Ops, fetch the total no. of unique consumer/producer +/// LogicalObjectFifos. This would help us figure out the split factor for +/// LogicalObjectFifos.
+/// An example case which necessitated this feature :- /// %lhs = LOF_on_L2 /// %a = LOF_on_L1_0 /// %b = LOF_on_L1_1 @@ -139,12 +138,13 @@ FailureOr getSplitStride(ArrayRef dmaOps, /// In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of /// them are unique. Hence we'd split %lhs into 3 unique splits, instead /// of 5. -static FailureOr fetchTotalUniqueL2L1( - SmallVector copyLikeOps, bool fetchTarget) { +template +static FailureOr fetchTotalUniqueLogicalObjFifos( + SmallVector copyLikeOps) { DenseSet uniqueLof; for (CopyOpInterface copyOp : copyLikeOps) { AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr; - if (fetchTarget) { + if constexpr (OperateOn == CopyOpOperateOn::Target) { lof = dyn_cast_if_present( copyOp.getTarget().getDefiningOp()); } else { @@ -260,8 +260,9 @@ LogicalResult collectSplittingDims( // Calculate the new source stride to be used for splitting the DMA. int64_t newSourceStride = splitStride != 1 ? splitDimSize / splitStride : 1; - FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1( - objFifo.getCopyLikeConsumers(), /*fetchTarget=*/true); + FailureOr maybeUniqueL2L1 = + fetchTotalUniqueLogicalObjFifos( + objFifo.getCopyLikeConsumers()); if (failed(maybeUniqueL2L1)) { objFifo.emitOpError() << "could not retrieve total unique L2<->L1 pairs"; @@ -329,8 +330,9 @@ LogicalResult collectSplittingDims( // Calculate the new target stride to be used for splitting the DMA. int64_t newTargetStride = splitStride != 1 ?
splitDimSize / splitStride : 1; - FailureOr maybeUniqueL2L1 = fetchTotalUniqueL2L1( - objFifo.getCopyLikeProducers(), /*fetchTarget=*/false); + FailureOr maybeUniqueL2L1 = + fetchTotalUniqueLogicalObjFifos( + objFifo.getCopyLikeProducers()); if (failed(maybeUniqueL2L1)) { objFifo.emitOpError() << "could not retrieve total unique L2<->L1 pairs"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h index 0905795fd..7a428df66 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h @@ -39,10 +39,13 @@ LogicalResult splitLogicalObjectFifo( /// Split doubly strided operations on a source and target split dimension with /// the provided split factor. -LogicalResult splitDoublyStridedOp( - IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, - size_t sourceSplitDim = 0, size_t targetSplitDim = 0, int64_t splitFactor, - int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1); +LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, + AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim = 0, + size_t targetSplitDim = 0, + int64_t splitFactor = 1, + int64_t sourceSplitStride = 1, + int64_t targetSplitStride = 1); } // namespace mlir::iree_compiler::AMDAIE From da1c62d760ce33d35a04caacf2b78fbd92f4bf03 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 14:46:05 +0000 Subject: [PATCH 14/20] Review comment v5.0 --- .../IR/AMDAIELogicalObjFifoOpInterface.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp index 
89fb8b57a..5031d688e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp @@ -17,10 +17,10 @@ SmallVector getCopyLikeConsumers( LogicalObjFifoOpInterface op) { SmallVector copyLikOps; for (Operation *userOp : op->getUsers()) { - if (auto copyOp = dyn_cast(userOp)) { - if (dyn_cast_if_present( - copyOp.getSource().getDefiningOp()) == op) - copyLikOps.push_back(copyOp); + if (auto copyOp = dyn_cast(userOp); + copyOp && dyn_cast_if_present( + copyOp.getSource().getDefiningOp()) == op) { + copyLikOps.push_back(copyOp); } } return copyLikOps; @@ -30,10 +30,10 @@ SmallVector getCopyLikeProducers( LogicalObjFifoOpInterface op) { SmallVector copyLikOps; for (Operation *userOp : op->getUsers()) { - if (auto copyOp = dyn_cast(userOp)) { - if (dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()) == op) - copyLikOps.push_back(copyOp); + if (auto copyOp = dyn_cast(userOp); + copyOp && dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()) == op) { + copyLikOps.push_back(copyOp); } } return copyLikOps; From 8c3f762348c9e3d756a19862d1fa412e51ad6063 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 29 Jan 2025 15:53:41 +0000 Subject: [PATCH 15/20] Review comment v6.0 --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 785708563..6b9c985b9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -139,7 +139,7 @@ FailureOr getSplitStride(ArrayRef dmaOps, /// them are unique. Hence we'd split %lhs into 3 unique splits, instead /// of 5. 
template -static FailureOr fetchTotalUniqueLogicalObjFifos( +static FailureOr fetchTotalUniqueLogicalObjFifoUsers( SmallVector copyLikeOps) { DenseSet uniqueLof; for (CopyOpInterface copyOp : copyLikeOps) { @@ -260,14 +260,14 @@ LogicalResult collectSplittingDims( // Calculate the new source stride to be used for splitting the DMA. int64_t newSourceStride = splitStride != 1 ? splitDimSize / splitStride : 1; - FailureOr maybeUniqueL2L1 = - fetchTotalUniqueLogicalObjFifos( + FailureOr maybeNumUniqueConsumers = + fetchTotalUniqueLogicalObjFifoUsers( objFifo.getCopyLikeConsumers()); - if (failed(maybeUniqueL2L1)) { + if (failed(maybeNumUniqueConsumers)) { objFifo.emitOpError() << "could not retrieve total unique L2<->L1 pairs"; } - int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols); + int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; int64_t targetSize = (*targetSizes)[targetSplitDim]; if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { @@ -330,14 +330,14 @@ LogicalResult collectSplittingDims( // Calculate the new target stride to be used for splitting the DMA. int64_t newTargetStride = splitStride != 1 ? 
splitDimSize / splitStride : 1; - FailureOr maybeUniqueL2L1 = - fetchTotalUniqueLogicalObjFifos( + FailureOr maybeNumUniqueProducers = + fetchTotalUniqueLogicalObjFifoUsers( objFifo.getCopyLikeProducers()); - if (failed(maybeUniqueL2L1)) { + if (failed(maybeNumUniqueProducers)) { objFifo.emitOpError() << "could not retrieve total unique L2<->L1 pairs"; } - int64_t splitFactor = std::gcd(*maybeUniqueL2L1, numCols); + int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; int64_t targetSize = (*targetSizes)[targetSplitDim]; if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { From 42868e6c31dbaf83039f5f02fae8339a3d2543d3 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 30 Jan 2025 06:25:35 +0000 Subject: [PATCH 16/20] Review comment v7.0 --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 8 +-- .../test/split_logicalobjfifos.mlir | 51 +++++++------------ 2 files changed, 21 insertions(+), 38 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 6b9c985b9..168af5cc8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -264,8 +264,8 @@ LogicalResult collectSplittingDims( fetchTotalUniqueLogicalObjFifoUsers( objFifo.getCopyLikeConsumers()); if (failed(maybeNumUniqueConsumers)) { - objFifo.emitOpError() - << "could not retrieve total unique L2<->L1 pairs"; + objFifo.emitOpError() << "could not retrieve the total number of " + "unique consumer objFifos"; } int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; @@ -334,8 +334,8 @@ LogicalResult collectSplittingDims( fetchTotalUniqueLogicalObjFifoUsers( 
objFifo.getCopyLikeProducers()); if (failed(maybeNumUniqueProducers)) { - objFifo.emitOpError() - << "could not retrieve total unique L2<->L1 pairs"; + objFifo.emitOpError() << "could not retrieve the total number of " + "unique producer objFifos"; } int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index daaf5e81c..16b8332df 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -499,11 +499,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // ----- // A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although -// we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8. -// But to keep the test case concise it demonstrates a similar splitting strategy for 1 row and 2 columns. -// Therefore L2 LHS will be split into 1 and L2 RHS will be split into 2. This needs to happen because -// later in the compilation stack when the tiles are being assigned, we will -// Refer: https://github.com/nod-ai/iree-amd-aie/pull/1031#discussion_r1920237380 +// we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8, +// i.e. the splitting will be dependent on unique producer/consumer for the respective ObjectFifos +// being split. +// To keep the test case concise it demonstrates a similar splitting strategy when the actual +// compute is taking place in 1 row and 2 columns. 
// // CHECK-LABEL: @pack_peel_4_level_4x8_Strix // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index @@ -517,8 +517,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_1]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : -// CHECK: amdaie.core -// CHECK: amdaie.core // CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : // CHECK: } // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : @@ -526,7 +524,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}> #translation = #iree_codegen.translation_info module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { - func.func @pack_peel_4_level_4x8_Strix() attributes {translation_info = #translation} { + func.func @pack_peel_4_level_4x8_Strix(%lhs: memref<512x512xi32>, %rhs: memref<512x4096xi32>, %out: memref<512x4096xi32>) attributes {translation_info = #translation} { %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index @@ -536,21 +534,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32> 
%alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32> %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32> - %alloc_5 = memref.alloc() : memref<512x512xi32> - %alloc_6 = memref.alloc() : memref<512x4096xi32> - %alloc_7 = memref.alloc() : memref<512x4096xi32> - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_1_1 = amdaie.tile(%c1, %c1) - %tile_2_1 = amdaie.tile(%c2, %c1) - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_1_0 = amdaie.tile(%c1, %c0) - %tile_2_0 = amdaie.tile(%c2, %c0) - %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo> - %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> - %lof_2_0 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_2_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_0_0 = amdaie.logicalobjectfifo.from_memref %lhs, {} : memref<512x512xi32> -> !amdaie.logicalobjectfifo> + %lof_1_0 = amdaie.logicalobjectfifo.from_memref %rhs, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + %lof_2_0 = amdaie.logicalobjectfifo.from_memref %out, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> scf.forall 
(%arg0, %arg1) in (2, 8) { %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -559,20 +548,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %of1 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg2) %tile_1_2 = amdaie.tile(%c1, %c2) %tile_0_2 = amdaie.tile(%c0, %c2) - %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_2, %tile_1_2} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of1, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 
64, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.core(%tile_0_2, in : [%5, %3], out : []) { - amdaie.end - } - %7 = amdaie.core(%tile_1_2, in : [%5, %4], out : []) { - amdaie.end - } - %8 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } %2 = amdaie.dma_cpy_nd(%lof_2_0[0, 0] [256, 512] [4096, 1], %lof_0_1[0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } From f7bcd0d65b7e0985ab33b6eae7d285c85db12b5d Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 30 Jan 2025 09:36:00 +0000 Subject: [PATCH 17/20] Review comment v8.0 --- .../Transforms/AMDAIESplitLogicalObjFifos.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 168af5cc8..cd6483b56 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -135,9 +135,10 @@ FailureOr getSplitStride(ArrayRef dmaOps, /// DMA(%b, %lhs) /// DMA(%c, %lhs) /// -/// In the above snippet although we have 5 DMA ops for L2<->L1, only 3 of -/// them are unique. 
Hence we'd split %lhs into 3 unique splits, instead -/// of 5. +/// In the above snippet, assume we want to split %lhs, it has 5 DMA ops. +/// But only 3 of them are unique : (%lhs -> %a), (%lhs -> %b) (%lhs -> %c). +/// Therefore this function is going to return 3. Which the caller is going +/// to use as split factor. template static FailureOr fetchTotalUniqueLogicalObjFifoUsers( SmallVector copyLikeOps) { @@ -177,9 +178,9 @@ static FailureOr fetchTotalUniqueLogicalObjFifoUsers( /// that has product size larger than the other side's product size after /// splitting because that's the number of elements that should be /// produced/consumed on the respective sides before splitting. -/// Towards the end fetch the count of unique L2<->L1 for the objectFifo which -/// will be split. This would form the split factor which would be capped by the -/// total no. of columns OR std::gcd of source/target size. +/// Towards the end fetch the count of unique producer (or consumers) for the +/// objectFifo which will be split. This would form the split factor which would +/// be capped by the total no. of columns OR std::gcd of source/target size. 
LogicalResult collectSplittingDims( const SmallVector &dmaObjFifoPairs, DenseMap &dmaSplitInfoMap, From fe830899263c14cdb8d6fc6ad1ff8b5191c5dea7 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 30 Jan 2025 10:16:09 +0000 Subject: [PATCH 18/20] Review comment v9.0 --- .../iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index 16b8332df..56fd218eb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -500,8 +500,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although // we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8, -// i.e. the splitting will be dependent on unique producer/consumer for the respective ObjectFifos -// being split. +// depending on :- +// GCD(unique producer/consumer for the respective ObjectFifos being split, number of columns) // To keep the test case concise it demonstrates a similar splitting strategy when the actual // compute is taking place in 1 row and 2 columns. 
// From 164b8049bf1c12cdff4a4315aae0aa915d0fcd62 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 30 Jan 2025 10:48:18 +0000 Subject: [PATCH 19/20] Review comment v10.0 --- .../Transforms/test/split_logicalobjfifos.mlir | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index 56fd218eb..55d2c0618 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -498,12 +498,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // ----- -// A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. Although -// we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8, -// depending on :- +// Although we have 8 columns, L2 LHS buffers need to be split into only 1, L2 RHS into 2 and +// L2 OUT into 1. +// This is because we decide the split factor for the L2 ObjectFifo depending on :- // GCD(unique producer/consumer for the respective ObjectFifos being split, number of columns) -// To keep the test case concise it demonstrates a similar splitting strategy when the actual -// compute is taking place in 1 row and 2 columns.
// // CHECK-LABEL: @pack_peel_4_level_4x8_Strix // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index From a91b6905f40048a85595b3eff67440a5ac3595a0 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 30 Jan 2025 16:33:55 +0530 Subject: [PATCH 20/20] Update compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp Co-authored-by: Jorn Tuyls --- .../iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index cd6483b56..1cf40aba4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -178,7 +178,7 @@ static FailureOr fetchTotalUniqueLogicalObjFifoUsers( /// that has product size larger than the other side's product size after /// splitting because that's the number of elements that should be /// produced/consumed on the respective sides before splitting. -/// Towards the end fetch the count of unique producer (or consumers) for the +/// Towards the end fetch the number of unique producers (or consumers) for the /// objectFifo which will be split. This would form the split factor which would /// be capped by the total no. of columns OR std::gcd of source/target size. LogicalResult collectSplittingDims(