Enforce core dma ping-pong to only happen where different L1 buffers were accessed by the same channel (Xilinx#546)
nod-ai · Apr 24, 2024 · c2ea563 · c2ea563
1 parent 17a2124
commit c2ea563
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 8 deletions.
diff --git a/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h b/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h
@@ -195,6 +195,7 @@ void simpleDMAChannelAllocation(std::vector<MemcpyBundleAsFlow> &memcpy_flows,
                                 MemTileDMAAllocator &memtile_dma_alloc,
                                 TileDMAAllocator &tile_dma_alloc);
 template <typename T> int foundInVector(T item, std::vector<T> vec);
+/// Appends `entry` to `vec` only when no element comparing equal to `entry` is
+/// already present in `vec`.
+template <typename T> void push_back_if_unique(SmallVector<T> &vec, T entry);
 int getSCFForLoopDepth(Operation *o);
 bool groupingMemcpysByLoop(std::vector<MemcpyBundleAsFlow> &memcpy_flows);
 

diff --git a/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp b/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp
@@ -269,32 +269,38 @@ std::pair<int64_t, int64_t> air::getLockValuePair(AIE::AIEArch arch,
   if (!air_chan)
     return getLockValuePair(arch, buffer_memref);
 
-  // Infer semaphore lock values using air.channel
-  int read_counter = 0;
-  int write_counter = 0;
+  // Infer semaphore lock values using air.channel. This method enables
+  // ping-pong compute-communication overlap.
+  SmallVector<Value> unique_write_buffers;
+  SmallVector<Value> unique_read_buffers;
   for (auto get : getChannelGetOpThroughSymbol(air_chan)) {
     if (isa<AIE::ExternalBufferOp>(buffer_memref.getDefiningOp())) {
       // Shim DMA locks
-      write_counter = 1;
+      unique_write_buffers.clear();
+      unique_write_buffers.push_back(buffer_memref);
+      break;
     } else if (auto core_op = get->getParentOfType<AIE::CoreOp>()) {
       if (core_op.getTileOp().getResult() ==
           buffer_memref.getDefiningOp()->getOperand(0)) {
-        write_counter++;
+        push_back_if_unique<Value>(unique_write_buffers, get.getMemref());
       }
     }
   }
   for (auto put : getChannelPutOpThroughSymbol(air_chan)) {
     if (isa<AIE::ExternalBufferOp>(buffer_memref.getDefiningOp())) {
       // Shim DMA locks
-      read_counter = 1;
+      unique_read_buffers.clear();
+      unique_read_buffers.push_back(buffer_memref);
+      break;
     } else if (auto core_op = put->getParentOfType<AIE::CoreOp>()) {
       if (core_op.getTileOp().getResult() ==
           buffer_memref.getDefiningOp()->getOperand(0)) {
-        read_counter++;
+        push_back_if_unique<Value>(unique_read_buffers, put.getMemref());
       }
     }
   }
-  return std::make_pair(read_counter, write_counter);
+  return std::make_pair(unique_read_buffers.size(),
+                        unique_write_buffers.size());
 }
 
 // allocation_info_t impl.
@@ -1009,6 +1015,12 @@ template <typename T> int air::foundInVector(T item, std::vector<T> vec) {
   return index;
 }
 
+/// Appends `entry` to `vec` unless an element comparing equal to it is already
+/// stored there, in which case `vec` is left unchanged.
+template <typename T>
+void air::push_back_if_unique(SmallVector<T> &vec, T entry) {
+  const T &needle = entry;
+  for (auto it = vec.begin(), last = vec.end(); it != last; ++it) {
+    if (*it == needle)
+      return; // Already present: nothing to add.
+  }
+  vec.push_back(entry);
+}
+
 int air::getSCFForLoopDepth(Operation *o) {
   int for_loop_depth = 0;
   Operation *parentFor = o->getParentOfType<scf::ForOp>();

diff --git a/mlir/test/Conversion/AIRToAIE/air_channel_to_locks_ping_pong.mlir b/mlir/test/Conversion/AIRToAIE/air_channel_to_locks_ping_pong.mlir
@@ -325,3 +325,103 @@ func.func @core_to_core_ping_pong() {
   }
   return
 }
+
+// -----
+
+// Ping-pong is not possible with multiple channel accesses to the same buffer, due to the dependence arising from the production and consumption of data in that buffer.
+// CHECK: aie.device
+// CHECK:         %[[VAL_0:.*]] = aie.tile(2, 1)
+// CHECK:         %[[VAL_1:.*]] = aie.tile(0, 3)
+// CHECK:         %[[VAL_3:.*]] = aie.lock(%[[VAL_0]], 1) {init = 1 : i32}
+// CHECK:         %[[VAL_4:.*]] = aie.lock(%[[VAL_0]], 0) {init = 0 : i32}
+// CHECK:         %[[VAL_7:.*]] = aie.lock(%[[VAL_1]], 1) {init = 1 : i32}
+// CHECK:         %[[VAL_8:.*]] = aie.lock(%[[VAL_1]], 0) {init = 0 : i32}
+// CHECK:         %[[VAL_11:.*]] = aie.buffer(%[[VAL_0]]) {sym_name = {{.*}}} : memref<1x1x64x32xi32, 1 : i32>
+// CHECK:         %[[VAL_12:.*]] = aie.buffer(%[[VAL_1]]) {sym_name = {{.*}}} : memref<1x1x4x8x4x8xi32, 2 : i32>
+
+// CHECK:    aie.mem(%[[VAL_1]])  {
+// CHECK:           aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1)
+// CHECK:         ^bb1:
+// CHECK:           aie.use_lock(%[[VAL_7]], AcquireGreaterEqual, 1)
+// CHECK:           aie.dma_bd(%[[VAL_12]] : memref<1x1x4x8x4x8xi32, 2 : i32>, 0, 1024)
+// CHECK:           aie.use_lock(%[[VAL_8]], Release, 1)
+// CHECK:           aie.next_bd ^bb2
+// CHECK:         ^bb2:
+// CHECK:           aie.use_lock(%[[VAL_7]], AcquireGreaterEqual, 1)
+// CHECK:           aie.dma_bd(%[[VAL_12]] : memref<1x1x4x8x4x8xi32, 2 : i32>, 0, 1024)
+// CHECK:           aie.use_lock(%[[VAL_8]], Release, 1)
+// CHECK:           aie.next_bd ^bb1
+// CHECK:         ^bb3:
+// CHECK:           aie.end
+// CHECK:         }
+
+// CHECK:    aie.core(%[[VAL_1]])  {
+// CHECK:         cf.br ^bb1
+// CHECK:       ^bb1:  // pred: ^bb0
+// CHECK:         cf.br ^bb2
+// CHECK:       ^bb2:  // pred: ^bb1
+// CHECK:         aie.use_lock(%[[VAL_8]], AcquireGreaterEqual, 1)
+// CHECK:         aie.use_lock(%[[VAL_7]], Release, 1)
+// CHECK:         cf.br ^bb3
+// CHECK:       ^bb3:  // pred: ^bb2
+// CHECK:         cf.br ^bb4
+// CHECK:       ^bb4:  // pred: ^bb3
+// CHECK:         scf.for %arg0 = %c1 to %c5 step %c1 {
+// CHECK:           aie.use_lock(%[[VAL_8]], AcquireGreaterEqual, 1)
+// CHECK:           aie.use_lock(%[[VAL_7]], Release, 1)
+// CHECK:         }
+// CHECK:         aie.end
+
+// CHECK:         aie.flow(%[[VAL_0]], DMA : 0, %[[VAL_1]], DMA : 0)
+// cHECK: @not_really_ping_pong
+
+// Test input: every get on @channel_2 below targets the same herd argument
+// (%arg11, bound to %results_32 in both herds).
+air.channel @channel_2 [1, 1]
+func.func @not_really_ping_pong() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = air.launch async (%arg0, %arg1) in (%arg2=%c1, %arg3=%c1) {
+    %6 = air.segment @segment_0 async  attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 4 : i64, y_loc = 3 : i64, y_size = 4 : i64} {
+      %c8 = arith.constant 8 : index
+      %c4 = arith.constant 4 : index
+      %c5_1 = arith.constant 5 : index
+      %c32_21 = arith.constant 32 : index
+      %c64_22 = arith.constant 64 : index
+      %c0_23 = arith.constant 0 : index
+      %c2 = arith.constant 2 : index
+      %c1_24 = arith.constant 1 : index
+      // Destination buffer (memory space 2); handed to both herds as %arg11.
+      %async_token_31, %results_32 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
+        %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+        air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
+      }
+      // First herd: a single get from @channel_2 into %arg11.
+      %7 = air.herd @herd_0 async  tile (%arg7, %arg8) in (%arg9=%c1_24, %arg10=%c1_24) args(%arg11=%results_32) : memref<1x1x4x8x4x8xi32, 2 : i32> attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 3 : i64} {
+        %23 = air.channel.get async  @channel_2[] (%arg11[] [] []) {id = 4 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
+        air.herd_terminator
+      }
+      // Source buffer (memory space 1) for the puts below.
+      %async_token_39, %results_40 = air.execute -> (memref<1x1x64x32xi32, 1 : i32>) {
+        %alloc = memref.alloc() : memref<1x1x64x32xi32, 1 : i32>
+        air.execute_terminator %alloc : memref<1x1x64x32xi32, 1 : i32>
+      }
+      // Five puts (iterations 0..4) from %results_40 onto @channel_2, serialized
+      // through the loop-carried async token.
+      %9 = scf.for %arg7 = %c0_23 to %c5_1 step %c1_24 iter_args(%arg8 = %async_token_39) -> (!air.async.token) {
+        %26 = air.channel.put async [%arg8] @channel_2[] (%results_40[%c0_23, %c0_23, %c0_23] [%c4, %c32_21, %c8] [%c8, %c32_21, %c1_24]) {id = 13 : i32} : (memref<1x1x64x32xi32, 1 : i32>)
+        scf.yield %26 : !air.async.token
+      }
+      %async_token_52 = air.execute  {
+        memref.dealloc %results_40 : memref<1x1x64x32xi32, 1 : i32>
+      }
+      // Second herd at the same location (x_loc = 0, y_loc = 3), dependent on the
+      // first: four more gets (iterations 1..4) into the same %arg11 buffer.
+      %17 = air.herd @herd_0 async [%7]  tile (%arg7, %arg8) in (%arg9=%c1_24, %arg10=%c1_24) args(%arg11=%results_32) : memref<1x1x4x8x4x8xi32, 2 : i32> attributes {id = 4 : i32, x_loc = 0 : i64, y_loc = 3 : i64} {
+        %c5_51 = arith.constant 5 : index
+        %c1_52 = arith.constant 1 : index
+        scf.for %arg14 = %c1_52 to %c5_51 step %c1_52 {
+          %23 = air.channel.get async  @channel_2[] (%arg11[] [] []) {id = 26 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
+        }
+        air.herd_terminator
+      }
+      %async_token_47 = air.execute {
+        memref.dealloc %results_32 : memref<1x1x4x8x4x8xi32, 2 : i32>
+      }
+      air.segment_terminator
+    }
+    air.launch_terminator
+  }
+  return
+}