diff --git a/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h b/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h index c7656d0a9..55820e722 100644 --- a/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h +++ b/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h @@ -195,6 +195,7 @@ void simpleDMAChannelAllocation(std::vector &memcpy_flows, MemTileDMAAllocator &memtile_dma_alloc, TileDMAAllocator &tile_dma_alloc); template int foundInVector(T item, std::vector vec); +template void push_back_if_unique(SmallVector &vec, T entry); int getSCFForLoopDepth(Operation *o); bool groupingMemcpysByLoop(std::vector &memcpy_flows); diff --git a/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp b/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp index 9476a1eb8..a42bdd1aa 100644 --- a/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp +++ b/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp @@ -269,32 +269,38 @@ std::pair air::getLockValuePair(AIE::AIEArch arch, if (!air_chan) return getLockValuePair(arch, buffer_memref); - // Infer semaphore lock values using air.channel - int read_counter = 0; - int write_counter = 0; + // Infer semaphore lock values using air.channel. This method enables + // ping-pong compute-communication overlap. 
+ SmallVector unique_write_buffers; + SmallVector unique_read_buffers; for (auto get : getChannelGetOpThroughSymbol(air_chan)) { if (isa(buffer_memref.getDefiningOp())) { // Shim DMA locks - write_counter = 1; + unique_write_buffers.clear(); + unique_write_buffers.push_back(buffer_memref); + break; } else if (auto core_op = get->getParentOfType()) { if (core_op.getTileOp().getResult() == buffer_memref.getDefiningOp()->getOperand(0)) { - write_counter++; + push_back_if_unique(unique_write_buffers, get.getMemref()); } } } for (auto put : getChannelPutOpThroughSymbol(air_chan)) { if (isa(buffer_memref.getDefiningOp())) { // Shim DMA locks - read_counter = 1; + unique_read_buffers.clear(); + unique_read_buffers.push_back(buffer_memref); + break; } else if (auto core_op = put->getParentOfType()) { if (core_op.getTileOp().getResult() == buffer_memref.getDefiningOp()->getOperand(0)) { - read_counter++; + push_back_if_unique(unique_read_buffers, put.getMemref()); } } } - return std::make_pair(read_counter, write_counter); + return std::make_pair(unique_read_buffers.size(), + unique_write_buffers.size()); } // allocation_info_t impl. 
@@ -1009,6 +1015,12 @@ template int air::foundInVector(T item, std::vector vec) { return index; } +template +void air::push_back_if_unique(SmallVector &vec, T entry) { + if (std::find(vec.begin(), vec.end(), entry) == vec.end()) + vec.push_back(entry); +} + int air::getSCFForLoopDepth(Operation *o) { int for_loop_depth = 0; Operation *parentFor = o->getParentOfType(); diff --git a/mlir/test/Conversion/AIRToAIE/air_channel_to_locks_ping_pong.mlir b/mlir/test/Conversion/AIRToAIE/air_channel_to_locks_ping_pong.mlir index d32a311ff..ff0fc4e5e 100644 --- a/mlir/test/Conversion/AIRToAIE/air_channel_to_locks_ping_pong.mlir +++ b/mlir/test/Conversion/AIRToAIE/air_channel_to_locks_ping_pong.mlir @@ -325,3 +325,103 @@ func.func @core_to_core_ping_pong() { } return } + +// ----- + +// Ping-pong is not possible when multiple channel accesses target the same buffer, because of the dependence between the production and the consumption of the data in that buffer. +// CHECK: aie.device +// CHECK: %[[VAL_0:.*]] = aie.tile(2, 1) +// CHECK: %[[VAL_1:.*]] = aie.tile(0, 3) +// CHECK: %[[VAL_3:.*]] = aie.lock(%[[VAL_0]], 1) {init = 1 : i32} +// CHECK: %[[VAL_4:.*]] = aie.lock(%[[VAL_0]], 0) {init = 0 : i32} +// CHECK: %[[VAL_7:.*]] = aie.lock(%[[VAL_1]], 1) {init = 1 : i32} +// CHECK: %[[VAL_8:.*]] = aie.lock(%[[VAL_1]], 0) {init = 0 : i32} +// CHECK: %[[VAL_11:.*]] = aie.buffer(%[[VAL_0]]) {sym_name = {{.*}}} : memref<1x1x64x32xi32, 1 : i32> +// CHECK: %[[VAL_12:.*]] = aie.buffer(%[[VAL_1]]) {sym_name = {{.*}}} : memref<1x1x4x8x4x8xi32, 2 : i32> + +// CHECK: aie.mem(%[[VAL_1]]) { +// CHECK: aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[VAL_7]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[VAL_12]] : memref<1x1x4x8x4x8xi32, 2 : i32>, 0, 1024) +// CHECK: aie.use_lock(%[[VAL_8]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[VAL_7]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[VAL_12]] : memref<1x1x4x8x4x8xi32, 2 : 
i32>, 0, 1024) +// CHECK: aie.use_lock(%[[VAL_8]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: aie.end +// CHECK: } + +// CHECK: aie.core(%[[VAL_1]]) { +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: // pred: ^bb0 +// CHECK: cf.br ^bb2 +// CHECK: ^bb2: // pred: ^bb1 +// CHECK: aie.use_lock(%[[VAL_8]], AcquireGreaterEqual, 1) +// CHECK: aie.use_lock(%[[VAL_7]], Release, 1) +// CHECK: cf.br ^bb3 +// CHECK: ^bb3: // pred: ^bb2 +// CHECK: cf.br ^bb4 +// CHECK: ^bb4: // pred: ^bb3 +// CHECK: scf.for %arg0 = %c1 to %c5 step %c1 { +// CHECK: aie.use_lock(%[[VAL_8]], AcquireGreaterEqual, 1) +// CHECK: aie.use_lock(%[[VAL_7]], Release, 1) +// CHECK: } +// CHECK: aie.end + +// CHECK: aie.flow(%[[VAL_0]], DMA : 0, %[[VAL_1]], DMA : 0) +// CHECK: @not_really_ping_pong + +air.channel @channel_2 [1, 1] +func.func @not_really_ping_pong() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = air.launch async (%arg0, %arg1) in (%arg2=%c1, %arg3=%c1) { + %6 = air.segment @segment_0 async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 4 : i64, y_loc = 3 : i64, y_size = 4 : i64} { + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %c5_1 = arith.constant 5 : index + %c32_21 = arith.constant 32 : index + %c64_22 = arith.constant 64 : index + %c0_23 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1_24 = arith.constant 1 : index + %async_token_31, %results_32 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) { + %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> + } + %7 = air.herd @herd_0 async tile (%arg7, %arg8) in (%arg9=%c1_24, %arg10=%c1_24) args(%arg11=%results_32) : memref<1x1x4x8x4x8xi32, 2 : i32> attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 3 : i64} { + %23 = air.channel.get async @channel_2[] (%arg11[] [] []) {id = 4 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>) + air.herd_terminator + } + %async_token_39, %results_40 = 
air.execute -> (memref<1x1x64x32xi32, 1 : i32>) { + %alloc = memref.alloc() : memref<1x1x64x32xi32, 1 : i32> + air.execute_terminator %alloc : memref<1x1x64x32xi32, 1 : i32> + } + %9 = scf.for %arg7 = %c0_23 to %c5_1 step %c1_24 iter_args(%arg8 = %async_token_39) -> (!air.async.token) { + %26 = air.channel.put async [%arg8] @channel_2[] (%results_40[%c0_23, %c0_23, %c0_23] [%c4, %c32_21, %c8] [%c8, %c32_21, %c1_24]) {id = 13 : i32} : (memref<1x1x64x32xi32, 1 : i32>) + scf.yield %26 : !air.async.token + } + %async_token_52 = air.execute { + memref.dealloc %results_40 : memref<1x1x64x32xi32, 1 : i32> + } + %17 = air.herd @herd_0 async [%7] tile (%arg7, %arg8) in (%arg9=%c1_24, %arg10=%c1_24) args(%arg11=%results_32) : memref<1x1x4x8x4x8xi32, 2 : i32> attributes {id = 4 : i32, x_loc = 0 : i64, y_loc = 3 : i64} { + %c5_51 = arith.constant 5 : index + %c1_52 = arith.constant 1 : index + scf.for %arg14 = %c1_52 to %c5_51 step %c1_52 { + %23 = air.channel.get async @channel_2[] (%arg11[] [] []) {id = 26 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>) + } + air.herd_terminator + } + %async_token_47 = air.execute { + memref.dealloc %results_32 : memref<1x1x4x8x4x8xi32, 2 : i32> + } + air.segment_terminator + } + air.launch_terminator + } + return +}