From 6232f0295518c5dc23e04ffc5f1dee88994202ac Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Wed, 18 Sep 2024 10:41:00 -0700 Subject: [PATCH 1/4] Have AIECtrlPacketToDmaPass figure out the ctrl packet stream size, instead of hard coding --- .../AIEX/Transforms/AIECtrlPacketToDma.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp index 0b76309a60..ca46e66f0e 100644 --- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp +++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp @@ -76,8 +76,20 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase { auto newSeq = builder.create(loc, f.getSymNameAttr()); newSeq.getBody().push_back(new Block); - auto ctrlPktMemrefType = MemRefType::get( - SmallVector{1024}, IntegerType::get(ctx, 32), nullptr, 0); + + // Get total size of all control packets (in i32 words) + int64_t totalCtrlPktSizeInI32 = 0; + llvm::for_each(controlPacketOps, [&](AIEX::NpuControlPacketOp ctrlPktOp) { + if (ctrlPktOp.getData()) + totalCtrlPktSizeInI32 += ctrlPktOp.getData()->size(); + else if (ctrlPktOp.getLength()) + totalCtrlPktSizeInI32 += *ctrlPktOp.getLength(); + totalCtrlPktSizeInI32++; // Plus one control packet info word + }); + + auto ctrlPktMemrefType = + MemRefType::get(SmallVector{totalCtrlPktSizeInI32}, + IntegerType::get(ctx, 32), nullptr, 0); auto newBlockArg = newSeq.getBody().addArgument(ctrlPktMemrefType, loc); builder.setInsertionPointToStart(&newSeq.getBody().front()); From 83934c6c57e3675a7794784cc7d01dadc83f040b Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Wed, 18 Sep 2024 11:00:59 -0700 Subject: [PATCH 2/4] Update mlir unit tests --- test/dialect/AIEX/ctrl_pkt_to_dma.mlir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/dialect/AIEX/ctrl_pkt_to_dma.mlir b/test/dialect/AIEX/ctrl_pkt_to_dma.mlir index 0ee76178c7..02ef034af3 100644 --- 
a/test/dialect/AIEX/ctrl_pkt_to_dma.mlir +++ b/test/dialect/AIEX/ctrl_pkt_to_dma.mlir @@ -13,8 +13,8 @@ // transforms control packet ops to dma memcpy ops and sync ops. // CHECK-LABEL: aie.device(npu1_1col) { -// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref<1024xi32>) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> +// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref<2xi32>) { +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<2xi32> // CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} aie.device(npu1_1col) { @@ -29,8 +29,8 @@ aie.device(npu1_1col) { // ----- // CHECK-LABEL: aie.device(npu1_1col) { -// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref<1024xi32>) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> +// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref<2xi32>) { +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<2xi32> // CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} aie.device(npu1_1col) { From cb52dac168e9325f13bb15902952f03382555575 Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Wed, 18 Sep 2024 12:07:25 -0700 Subject: [PATCH 3/4] Change host code to allocate ctrl pkt stream memory based on stream size --- test/npu-xrt/ctrl_packet_reconfig/test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/test/npu-xrt/ctrl_packet_reconfig/test.cpp b/test/npu-xrt/ctrl_packet_reconfig/test.cpp index eac1ddd2d7..0ea1283140 100644 --- a/test/npu-xrt/ctrl_packet_reconfig/test.cpp +++ b/test/npu-xrt/ctrl_packet_reconfig/test.cpp @@ -23,7 +23,6 @@ constexpr int IN_SIZE = 64 * 64; constexpr int OUT_SIZE = 64 * 64; -constexpr int CTRL_IN_SIZE = 1024; #define IN_DATATYPE int8_t #define OUT_DATATYPE int8_t @@ -89,7 +88,7 @@ int main(int argc, const char *argv[]) { XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(OUT_DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_ctrlpkt = xrt::bo(device, CTRL_IN_SIZE * sizeof(int32_t), + auto bo_ctrlpkt = xrt::bo(device, ctrlPackets.size() * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); IN_DATATYPE *bufInA = bo_inA.map(); From 8013008a4c0a103960eb50a7b31758745c85c7b8 Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Wed, 18 Sep 2024 13:54:28 -0700 Subject: [PATCH 4/4] Using dynamic shape for MemRefType for ctrl pkt stream --- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 5 +++-- .../AIEX/Transforms/AIECtrlPacketToDma.cpp | 16 +++------------- test/dialect/AIEX/ctrl_pkt_to_dma.mlir | 12 ++++++------ 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index ac6e29a8ce..12b89aad32 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -347,8 +347,9 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { if (buffer.getElementTypeBitWidth() > addressGranularity) { return emitOpError("Maximum element bit width allowed is ") << addressGranularity << "bits. 
"; - } else if ((buffer.getNumElements() * buffer.getElementTypeBitWidth()) < - addressGranularity) { + } else if (buffer.hasStaticShape() && + (buffer.getNumElements() * buffer.getElementTypeBitWidth()) < + addressGranularity) { return emitOpError("Minimum data transfer size required is ") << addressGranularity << "bits. "; } diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp index ca46e66f0e..f56f1cee3e 100644 --- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp +++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp @@ -77,19 +77,9 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase { builder.create(loc, f.getSymNameAttr()); newSeq.getBody().push_back(new Block); - // Get total size of all control packets (in i32 words) - int64_t totalCtrlPktSizeInI32 = 0; - llvm::for_each(controlPacketOps, [&](AIEX::NpuControlPacketOp ctrlPktOp) { - if (ctrlPktOp.getData()) - totalCtrlPktSizeInI32 += ctrlPktOp.getData()->size(); - else if (ctrlPktOp.getLength()) - totalCtrlPktSizeInI32 += *ctrlPktOp.getLength(); - totalCtrlPktSizeInI32++; // Plus one control packet info word - }); - - auto ctrlPktMemrefType = - MemRefType::get(SmallVector{totalCtrlPktSizeInI32}, - IntegerType::get(ctx, 32), nullptr, 0); + // Using dynamic shape for ctrl pkt stream. + auto ctrlPktMemrefType = MemRefType::get( + ShapedType::kDynamic, IntegerType::get(ctx, 32), nullptr, 0); auto newBlockArg = newSeq.getBody().addArgument(ctrlPktMemrefType, loc); builder.setInsertionPointToStart(&newSeq.getBody().front()); diff --git a/test/dialect/AIEX/ctrl_pkt_to_dma.mlir b/test/dialect/AIEX/ctrl_pkt_to_dma.mlir index 02ef034af3..d8e8a4c44f 100644 --- a/test/dialect/AIEX/ctrl_pkt_to_dma.mlir +++ b/test/dialect/AIEX/ctrl_pkt_to_dma.mlir @@ -13,13 +13,13 @@ // transforms control packet ops to dma memcpy ops and sync ops. 
// CHECK-LABEL: aie.device(npu1_1col) { -// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref<2xi32>) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<2xi32> +// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref) { +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref // CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} aie.device(npu1_1col) { %tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} - aiex.runtime_sequence(%arg0: memref<2048xi32>) { + aiex.runtime_sequence() { aiex.control_packet {address = 126976 : ui32, data = array, opcode = 0 : i32, stream_id = 0 : i32} } aie.shim_dma_allocation @ctrlpkt_col0_mm2s_chan0(MM2S, 0, 0) @@ -29,14 +29,14 @@ aie.device(npu1_1col) { // ----- // CHECK-LABEL: aie.device(npu1_1col) { -// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref<2xi32>) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<2xi32> +// CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref) { +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref // CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} aie.device(npu1_1col) { %tile_0_0 = aie.tile(0, 0) %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} - aiex.runtime_sequence(%arg0: memref<2048xi32>) { + aiex.runtime_sequence() { aiex.control_packet {address = 2301952 : ui32, data = array, opcode = 0 : i32, stream_id = 0 : 
i32} } aie.shim_dma_allocation @ctrlpkt_col0_mm2s_chan0(MM2S, 0, 0)