Skip to content

Commit

Permalink
Enforce core dma ping-pong to only happen where different L1 buffers …
Browse files Browse the repository at this point in the history
…were accessed by the same channel (Xilinx#546)
  • Loading branch information
erwei-xilinx authored Apr 24, 2024
1 parent 17a2124 commit c2ea563
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 8 deletions.
1 change: 1 addition & 0 deletions mlir/include/air/Conversion/AIRToAIESchedulingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ void simpleDMAChannelAllocation(std::vector<MemcpyBundleAsFlow> &memcpy_flows,
MemTileDMAAllocator &memtile_dma_alloc,
TileDMAAllocator &tile_dma_alloc);
template <typename T> int foundInVector(T item, std::vector<T> vec);
template <typename T> void push_back_if_unique(SmallVector<T> &vec, T entry);
int getSCFForLoopDepth(Operation *o);
bool groupingMemcpysByLoop(std::vector<MemcpyBundleAsFlow> &memcpy_flows);

Expand Down
28 changes: 20 additions & 8 deletions mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,32 +269,38 @@ std::pair<int64_t, int64_t> air::getLockValuePair(AIE::AIEArch arch,
if (!air_chan)
return getLockValuePair(arch, buffer_memref);

// Infer semaphore lock values using air.channel
int read_counter = 0;
int write_counter = 0;
// Infer semaphore lock values using air.channel. This method enables
// ping-pong compute-communication overlap.
SmallVector<Value> unique_write_buffers;
SmallVector<Value> unique_read_buffers;
for (auto get : getChannelGetOpThroughSymbol(air_chan)) {
if (isa<AIE::ExternalBufferOp>(buffer_memref.getDefiningOp())) {
// Shim DMA locks
write_counter = 1;
unique_write_buffers.clear();
unique_write_buffers.push_back(buffer_memref);
break;
} else if (auto core_op = get->getParentOfType<AIE::CoreOp>()) {
if (core_op.getTileOp().getResult() ==
buffer_memref.getDefiningOp()->getOperand(0)) {
write_counter++;
push_back_if_unique<Value>(unique_write_buffers, get.getMemref());
}
}
}
for (auto put : getChannelPutOpThroughSymbol(air_chan)) {
if (isa<AIE::ExternalBufferOp>(buffer_memref.getDefiningOp())) {
// Shim DMA locks
read_counter = 1;
unique_read_buffers.clear();
unique_read_buffers.push_back(buffer_memref);
break;
} else if (auto core_op = put->getParentOfType<AIE::CoreOp>()) {
if (core_op.getTileOp().getResult() ==
buffer_memref.getDefiningOp()->getOperand(0)) {
read_counter++;
push_back_if_unique<Value>(unique_read_buffers, put.getMemref());
}
}
}
return std::make_pair(read_counter, write_counter);
return std::make_pair(unique_read_buffers.size(),
unique_write_buffers.size());
}

// allocation_info_t impl.
Expand Down Expand Up @@ -1009,6 +1015,12 @@ template <typename T> int air::foundInVector(T item, std::vector<T> vec) {
return index;
}

/// Appends `entry` to the end of `vec` unless an element comparing equal to
/// it (via operator==) is already stored there, in which case `vec` is left
/// unchanged.
// NOTE(review): this template is declared in AIRToAIESchedulingUtils.h but
// defined in the .cpp; confirm every instantiation happens in this
// translation unit (or add explicit instantiations).
template <typename T>
void air::push_back_if_unique(SmallVector<T> &vec, T entry) {
  // Linear scan: bail out as soon as an equal element is found.
  for (auto it = vec.begin(), last = vec.end(); it != last; ++it) {
    if (*it == entry)
      return;
  }
  vec.push_back(entry);
}

int air::getSCFForLoopDepth(Operation *o) {
int for_loop_depth = 0;
Operation *parentFor = o->getParentOfType<scf::ForOp>();
Expand Down
100 changes: 100 additions & 0 deletions mlir/test/Conversion/AIRToAIE/air_channel_to_locks_ping_pong.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,103 @@ func.func @core_to_core_ping_pong() {
}
return
}

// -----

// Ping-pong is not possible when multiple channel accesses target the same buffer, because of the dependence between the production and consumption of data in that buffer.
// CHECK: aie.device
// CHECK: %[[VAL_0:.*]] = aie.tile(2, 1)
// CHECK: %[[VAL_1:.*]] = aie.tile(0, 3)
// CHECK: %[[VAL_3:.*]] = aie.lock(%[[VAL_0]], 1) {init = 1 : i32}
// CHECK: %[[VAL_4:.*]] = aie.lock(%[[VAL_0]], 0) {init = 0 : i32}
// CHECK: %[[VAL_7:.*]] = aie.lock(%[[VAL_1]], 1) {init = 1 : i32}
// CHECK: %[[VAL_8:.*]] = aie.lock(%[[VAL_1]], 0) {init = 0 : i32}
// CHECK: %[[VAL_11:.*]] = aie.buffer(%[[VAL_0]]) {sym_name = {{.*}}} : memref<1x1x64x32xi32, 1 : i32>
// CHECK: %[[VAL_12:.*]] = aie.buffer(%[[VAL_1]]) {sym_name = {{.*}}} : memref<1x1x4x8x4x8xi32, 2 : i32>

// CHECK: aie.mem(%[[VAL_1]]) {
// CHECK: aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1)
// CHECK: ^bb1:
// CHECK: aie.use_lock(%[[VAL_7]], AcquireGreaterEqual, 1)
// CHECK: aie.dma_bd(%[[VAL_12]] : memref<1x1x4x8x4x8xi32, 2 : i32>, 0, 1024)
// CHECK: aie.use_lock(%[[VAL_8]], Release, 1)
// CHECK: aie.next_bd ^bb2
// CHECK: ^bb2:
// CHECK: aie.use_lock(%[[VAL_7]], AcquireGreaterEqual, 1)
// CHECK: aie.dma_bd(%[[VAL_12]] : memref<1x1x4x8x4x8xi32, 2 : i32>, 0, 1024)
// CHECK: aie.use_lock(%[[VAL_8]], Release, 1)
// CHECK: aie.next_bd ^bb1
// CHECK: ^bb3:
// CHECK: aie.end
// CHECK: }

// CHECK: aie.core(%[[VAL_1]]) {
// CHECK: cf.br ^bb1
// CHECK: ^bb1: // pred: ^bb0
// CHECK: cf.br ^bb2
// CHECK: ^bb2: // pred: ^bb1
// CHECK: aie.use_lock(%[[VAL_8]], AcquireGreaterEqual, 1)
// CHECK: aie.use_lock(%[[VAL_7]], Release, 1)
// CHECK: cf.br ^bb3
// CHECK: ^bb3: // pred: ^bb2
// CHECK: cf.br ^bb4
// CHECK: ^bb4: // pred: ^bb3
// CHECK: scf.for %arg0 = %c1 to %c5 step %c1 {
// CHECK: aie.use_lock(%[[VAL_8]], AcquireGreaterEqual, 1)
// CHECK: aie.use_lock(%[[VAL_7]], Release, 1)
// CHECK: }
// CHECK: aie.end

// CHECK: aie.flow(%[[VAL_0]], DMA : 0, %[[VAL_1]], DMA : 0)
// cHECK: @not_really_ping_pong

// Single [1, 1] channel used by every put and get in the function below.
air.channel @channel_2 [1, 1]
// Test input: one put loop feeds @channel_2, while two herds placed at the
// same location (x_loc = 0, y_loc = 3) both get from it into the same buffer
// (%results_32).
func.func @not_really_ping_pong() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%0 = air.launch async (%arg0, %arg1) in (%arg2=%c1, %arg3=%c1) {
%6 = air.segment @segment_0 async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 4 : i64, y_loc = 3 : i64, y_size = 4 : i64} {
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c5_1 = arith.constant 5 : index
%c32_21 = arith.constant 32 : index
%c64_22 = arith.constant 64 : index
%c0_23 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1_24 = arith.constant 1 : index
// Destination buffer (memory space 2) shared by both herds below.
%async_token_31, %results_32 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
}
// First herd: a single get from @channel_2 into %arg11, which is bound to
// %results_32.
%7 = air.herd @herd_0 async tile (%arg7, %arg8) in (%arg9=%c1_24, %arg10=%c1_24) args(%arg11=%results_32) : memref<1x1x4x8x4x8xi32, 2 : i32> attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 3 : i64} {
%23 = air.channel.get async @channel_2[] (%arg11[] [] []) {id = 4 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
air.herd_terminator
}
// Source buffer (memory space 1) read by the put loop.
%async_token_39, %results_40 = air.execute -> (memref<1x1x64x32xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x32xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x32xi32, 1 : i32>
}
// Producer: five iterations (0 to 5, step 1), each putting from %results_40
// with explicit offsets/sizes/strides; iterations are chained through the
// loop-carried async token.
%9 = scf.for %arg7 = %c0_23 to %c5_1 step %c1_24 iter_args(%arg8 = %async_token_39) -> (!air.async.token) {
%26 = air.channel.put async [%arg8] @channel_2[] (%results_40[%c0_23, %c0_23, %c0_23] [%c4, %c32_21, %c8] [%c8, %c32_21, %c1_24]) {id = 13 : i32} : (memref<1x1x64x32xi32, 1 : i32>)
scf.yield %26 : !air.async.token
}
%async_token_52 = air.execute {
memref.dealloc %results_40 : memref<1x1x64x32xi32, 1 : i32>
}
// Second herd at the same location, dependent on the first herd's token (%7):
// four more gets (1 to 5, step 1) into the same %results_32 buffer, so the
// gets total five, matching the five puts above.
%17 = air.herd @herd_0 async [%7] tile (%arg7, %arg8) in (%arg9=%c1_24, %arg10=%c1_24) args(%arg11=%results_32) : memref<1x1x4x8x4x8xi32, 2 : i32> attributes {id = 4 : i32, x_loc = 0 : i64, y_loc = 3 : i64} {
%c5_51 = arith.constant 5 : index
%c1_52 = arith.constant 1 : index
scf.for %arg14 = %c1_52 to %c5_51 step %c1_52 {
%23 = air.channel.get async @channel_2[] (%arg11[] [] []) {id = 26 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
}
air.herd_terminator
}
%async_token_47 = air.execute {
memref.dealloc %results_32 : memref<1x1x4x8x4x8xi32, 2 : i32>
}
air.segment_terminator
}
air.launch_terminator
}
return
}

0 comments on commit c2ea563

Please sign in to comment.