From 7b39046948bc29c86471c4719312db464c204a0e Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Mon, 26 Feb 2024 14:48:37 -0800 Subject: [PATCH] Extend -air-specialize-channel-wrap-and-stride for tile/memtile DMA repeat count (#456) * Fixup an issue with folding for loop nests into channel ops with empty stride lists * Fixup issue with stride value calculation; test * Add support for tile / memtile dma repeat count feature * Clang format --- .../air/Conversion/AIRToAIESchedulingUtils.h | 2 + mlir/lib/Conversion/AIRToAIEPass.cpp | 19 +++++- .../Conversion/AIRToAIESchedulingUtils.cpp | 19 ++++++ .../Transform/AIRDependencyScheduleOpt.cpp | 15 +++++ .../AIRToAIE/air_shimcpy_to_aie2.mlir | 65 +++++++++++++++++++ .../specialize-channel-wrap-and-stride.mlir | 15 +++++ 6 files changed, 133 insertions(+), 2 deletions(-) diff --git a/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h b/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h index e9ea0051c..d0bc863b3 100644 --- a/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h +++ b/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h @@ -47,6 +47,8 @@ bool areIdenticalVectors(std::vector &a, std::vector &b); int64_t get1DOffset(SmallVector memcpy_offsets, SmallVector memcpy_strides, int byte_count_per_elem); +int getRepeatCount(Operation *memcpy_op); + std::vector getWrapsAndStrides(SmallVector memcpy_sizes, SmallVector memcpy_strides, MLIRContext *ctx); diff --git a/mlir/lib/Conversion/AIRToAIEPass.cpp b/mlir/lib/Conversion/AIRToAIEPass.cpp index c1525769b..b6f896fbc 100644 --- a/mlir/lib/Conversion/AIRToAIEPass.cpp +++ b/mlir/lib/Conversion/AIRToAIEPass.cpp @@ -2358,19 +2358,24 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase { generateDmaBd(loc, dir, locks, x, y, arch, bd, memcpyOp, bufferOp); } + + int repeat_count = 1; + if (p.second.size() == 1) + repeat_count = air::getRepeatCount(p.second[0]); + if (!channel_head) { channel_head = start_bb; end_bb = new Block(); mem.getBody().push_back(end_bb); auto b = OpBuilder::atBlockBegin(channel_head); - b.create(loc, dir, chan, /*repeat*/ 1, first_bd, + b.create(loc, dir, chan, repeat_count, first_bd, end_bb); b.setInsertionPointToEnd(end_bb); b.create(loc); } else { auto b = OpBuilder::atBlockBegin(start_bb); b.create( - loc, dir, chan, /*repeat*/ 1, first_bd, + loc, dir, chan, repeat_count, first_bd, channel_head->getTerminator()->getSuccessor(1)); channel_head->getTerminator()->setSuccessor(start_bb, 1); } @@ -2414,6 +2419,16 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase { ? ndcpy.getDstStrides() : ndcpy.getSrcStrides(); + // Skip over repeat pattern at highest dimension; repeat pattern handled at + // AIE::DMAStartOp. + if (!strides.empty() && !sizes.empty() && !offsets.empty()) + if (auto const_highest_stride = getConstantIntValue(strides[0])) + if (*const_highest_stride == 0) { + strides.erase(strides.begin()); + sizes.erase(sizes.begin()); + offsets.erase(offsets.begin()); + } + int64_t len = getMemcpySizesAsInt(memref, sizes); int64_t offset = get1DOffset(offsets, strides, getElementSizeInBytes(memref.getType())); diff --git a/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp b/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp index ce50cf872..b54dcf14c 100644 --- a/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp +++ b/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp @@ -177,6 +177,25 @@ int64_t air::get1DOffset(SmallVector memcpy_offsets, return one_d_offset * byte_count_per_elem; } +// Get the repeat_count value from an air::ChannelPut/GetOp. +int air::getRepeatCount(Operation *memcpy_op) { + auto chan_op = dyn_cast(memcpy_op); + if (!chan_op) + return 1; + if (chan_op.getStrides().empty() || chan_op.getSizes().empty()) + return 1; + if (getConstantIntValue(chan_op.getStrides()[0]) && + getConstantIntValue(chan_op.getSizes()[0])) { + auto const_highest_stride = getConstantIntValue(chan_op.getStrides()[0]); + auto const_highest_size = getConstantIntValue(chan_op.getSizes()[0]); + if (*const_highest_stride == 0) { + // Highest dimension data access pattern is repeat. + return *const_highest_size; + } + } + return 1; +} + std::vector air::getWrapsAndStrides(SmallVector memcpy_sizes, SmallVector memcpy_strides, MLIRContext *ctx) { diff --git a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp index c8540bc14..ad8591f49 100644 --- a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp +++ b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp @@ -1698,6 +1698,21 @@ struct AIRSpecializeChannelWrapAndStrideInScfFor SmallVector offsets = channel_ops[0].getOffsets(); SmallVector wraps = channel_ops[0].getSizes(); SmallVector strides = channel_ops[0].getStrides(); + // If empty offsets/sizes/strides, then populate the lists with default + // values. + if (offsets.empty() && wraps.empty() && strides.empty()) { + auto memref_shape = getTensorShape(channel_ops[0].getMemref().getType()); + int current_stride = + getTensorVolume(channel_ops[0].getMemref().getType()); + for (unsigned i = 0; i < memref_shape.size(); i++) { + offsets.push_back(rewriter.create(loc, 0)); + wraps.push_back( + rewriter.create(loc, memref_shape[i])); + current_stride /= memref_shape[i]; + strides.push_back( + rewriter.create(loc, current_stride)); + } + } for (auto o : for_loops) { // Check for perfect loop nest containing only air.channel ops if (!hasNElements(o.getBody(), 1)) diff --git a/mlir/test/Conversion/AIRToAIE/air_shimcpy_to_aie2.mlir b/mlir/test/Conversion/AIRToAIE/air_shimcpy_to_aie2.mlir index a326d645e..e10210d68 100644 --- a/mlir/test/Conversion/AIRToAIE/air_shimcpy_to_aie2.mlir +++ b/mlir/test/Conversion/AIRToAIE/air_shimcpy_to_aie2.mlir @@ -635,3 +635,68 @@ func.func @func9(%arg0: memref<128xf32>, %arg1: memref<128xf32>) { return } +// ----- + +// Tile / memtile DMA repeat count support. +// CHECK: aie.device(xcve2802) +// CHECK: %[[tileDMA_0_4:.*]] = aie.mem +// CHECK: aie.dma_start(S2MM, 0, ^bb1, ^bb2, repeat_count = 32) +// CHECK: aie.dma_bd({{.*}} : memref<32x256xi32, 2>, 0, 8192) +// CHECK: %[[tileDMA_0_3:.*]] = aie.mem +// CHECK: aie.dma_start(S2MM, 0, ^bb1, ^bb2, repeat_count = 32) +// CHECK: aie.dma_bd({{.*}} : memref<32x256xi32, 2>, 0, 8192) +// CHECK: %[[memTileDMA_2_1:.*]] = aie.memtile_dma +// CHECK: aie.dma_start(MM2S, 0, ^bb1, ^bb3, repeat_count = 32) +// CHECK: aie.dma_bd({{.*}} : memref<32x256xi32, 1>, 0, 8192) +// CHECK: aie.dma_start(MM2S, 1, ^bb4, ^bb2, repeat_count = 32) +// CHECK: aie.dma_bd({{.*}} : memref<32x256xi32, 1>, 0, 8192) + +#map = affine_map<()[s0] -> (s0 * 32)> +air.channel @channel_1 [2, 1] +func.func @func10(%arg0: memref<128xf32>, %arg1: memref<128xf32>) { + %c2 = arith.constant 2 : index + %0 = air.launch async (%arg2) in (%arg3=%c2) attributes {id = 1 : i32} { + %1 = air.segment @segment_0 async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 1 : i64, y_loc = 3 : i64, y_size = 2 : i64} { + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2_0 = arith.constant 2 : index + %async_token, %results = air.execute -> (memref<32x256xi32, 1>) { + %alloc = memref.alloc() : memref<32x256xi32, 1> + air.execute_terminator %alloc : memref<32x256xi32, 1> + } + %2 = scf.parallel (%arg4) = (%c0) to (%c2_0) step (%c1) init (%async_token) -> !air.async.token { + %4 = air.channel.put async [%async_token] @channel_1[%arg4, %c0] (%results[%c0, %c0, %c0] [%c32, %c32, %c256] [%c0, %c256, %c1]) {id = 4 : i32} : (memref<32x256xi32, 1>) + scf.reduce(%4 : !air.async.token) { + ^bb0(%arg5: !air.async.token, %arg6: !air.async.token): + %5 = air.wait_all async [%arg5, %arg6] + scf.reduce.return %5 : !air.async.token + } + } + %3 = air.herd @herd_0 async [%async_token] tile (%arg4, %arg5) in (%arg6=%c1, %arg7=%c2_0) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 3 : i64} { + %c0_2 = arith.constant 0 : index + %c1_4 = arith.constant 1 : index + %c32_3 = arith.constant 32 : index + %c256_5 = arith.constant 256 : index + %4 = air.wait_all async + %async_token_3, %results_4 = air.execute -> (memref<32x256xi32, 2>) { + %alloc = memref.alloc() : memref<32x256xi32, 2> + air.execute_terminator %alloc : memref<32x256xi32, 2> + } + %5 = air.channel.get async [%4, %async_token_3] @channel_1[%arg5, %c0_2] (%results_4[%c0_2, %c0_2, %c0_2] [%c32_3, %c32_3, %c256_5] [%c0_2, %c256_5, %c1_4]) {id = 6 : i32} : (memref<32x256xi32, 2>) + %async_token_5 = air.execute [%5] { + memref.dealloc %results_4 : memref<32x256xi32, 2> + } + air.herd_terminator + } + %async_token_1 = air.execute [%3] { + memref.dealloc %results : memref<32x256xi32, 1> + } + air.segment_terminator + } + air.launch_terminator + } + return +} + diff --git a/mlir/test/Transform/AIRDependencyScheduleOpt/specialize-channel-wrap-and-stride.mlir b/mlir/test/Transform/AIRDependencyScheduleOpt/specialize-channel-wrap-and-stride.mlir index d0f3c12c4..5de9b5fc6 100644 --- a/mlir/test/Transform/AIRDependencyScheduleOpt/specialize-channel-wrap-and-stride.mlir +++ b/mlir/test/Transform/AIRDependencyScheduleOpt/specialize-channel-wrap-and-stride.mlir @@ -194,4 +194,19 @@ module { %2 = air.wait_all async [%0, %1] return %alloc : memref<128xf32> } + + // CHECK-LABEL: test5 + // CHECK: put async @channel_17[] (%arg0[%c0, %c0, %c0] [%c8, %c32, %c32] [%c0, %c32, %c1]) : (memref<32x32xf32>) + + func.func @test5(%arg0: memref<32x32xf32>) -> memref<32x32xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %alloc = memref.alloc() : memref<32x32xf32> + scf.for %arg2 = %c0 to %c8 step %c1 { + %0 = affine.apply #map()[%arg2] + %1 = air.channel.put async @channel_17[] (%arg0[] [] []) : (memref<32x32xf32>) + } + return %alloc : memref<32x32xf32> + } }