diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index 21670b6827..04f0a0aa96 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -1689,7 +1689,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] // via_shared_mem==1 means use consumer tile's memory module OptionalAttr:$via_shared_mem, // memtile_repeat==0 means "do it once" and don't repeat - OptionalAttr:$memtile_repeat + OptionalAttr:$memtile_repeat, + OptionalAttr:$padDimensions ); let assemblyFormat = [{ @@ -1728,7 +1729,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] OpBuilder<(ins "mlir::StringAttr":$sym_name, "mlir::Value":$producerTile, "mlir::ValueRange":$consumerTiles, "mlir::Attribute":$elemNumber, "mlir::Type":$elem_type, CArg<"llvm::ArrayRef", "{}">:$dimensionsToStream, - CArg<"llvm::ArrayRef", "{}">:$dimensionsFromStreamPerConsumer), [{ + CArg<"llvm::ArrayRef", "{}">:$dimensionsFromStreamPerConsumer, + CArg<"llvm::ArrayRef", "{}">:$padDimensions), [{ odsState.addOperands(producerTile); odsState.addOperands(consumerTiles); odsState.addAttribute(getSymNameAttrName(odsState.name), sym_name); diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index ce04f131e0..6e7d9968df 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -570,7 +570,13 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ OptionalAttr:$packet, FlatSymbolRefAttr:$metadata, I64Attr:$id, - DefaultValuedOptionalAttr:$issue_token + DefaultValuedOptionalAttr:$issue_token, + DefaultValuedOptionalAttr:$d0_zero_before, + DefaultValuedOptionalAttr:$d1_zero_before, + DefaultValuedOptionalAttr:$d2_zero_before, + DefaultValuedOptionalAttr:$d0_zero_after, + DefaultValuedOptionalAttr:$d1_zero_after, + DefaultValuedOptionalAttr:$d2_zero_after ); let assemblyFormat = [{ @@ -840,7 +846,13 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> { I32Attr:$lock_rel_id, I32Attr:$lock_acq_enable, I32Attr:$lock_acq_val, - I32Attr:$lock_acq_id + I32Attr:$lock_acq_id, + I32Attr:$d0_zero_before, + I32Attr:$d1_zero_before, + I32Attr:$d2_zero_before, + I32Attr:$d0_zero_after, + I32Attr:$d1_zero_after, + I32Attr:$d2_zero_after ); let results = (outs ); let assemblyFormat = [{ attr-dict }]; diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 6b77f498b1..535912a9ff 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -459,14 +459,19 @@ struct AIEObjectFifoStatefulTransformPass void createBd(OpBuilder &builder, LockOp acqLock, int acqMode, LockAction acqLockAction, LockOp relLock, int relMode, MyOp buff, int offset, int len, Block *succ, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { if (acqLock) builder.create(builder.getUnknownLoc(), acqLock, acqLockAction, acqMode); - if (!dims.getValue().empty()) + + if (!dims.getValue().empty() && !padDimensions.getValue().empty()) { + builder.create(builder.getUnknownLoc(), buff, offset, len, dims, + padDimensions); + } else if (!dims.getValue().empty()) { builder.create(builder.getUnknownLoc(), buff, offset, len, dims); - else + } else { builder.create(builder.getUnknownLoc(), buff, offset, len); + } if (acqLock) builder.create(builder.getUnknownLoc(), relLock, LockAction::Release, relMode); @@ -480,7 +485,8 @@ struct AIEObjectFifoStatefulTransformPass void createBdBlock(OpBuilder &builder, ObjectFifoCreateOp op, int lockMode, int acqNum, int relNum, MyOp buff, int offset, int len, DMAChannelDir channelDir, size_t blockIndex, Block *succ, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, + BDPadLayoutArrayAttr padDimensions) { LockOp acqLock; LockOp relLock; int acqMode = 1; @@ -505,20 +511,25 @@ struct AIEObjectFifoStatefulTransformPass } } createBd(builder, acqLock, acqMode, acqLockAction, relLock, relMode, buff, - offset, len, succ, dims); + offset, len, succ, dims, padDimensions); } /// Function that either calls createAIETileDMA(), createShimDMA() or /// createMemTileDMA() based on op tile row value. void createDMA(DeviceOp &device, OpBuilder &builder, ObjectFifoCreateOp op, DMAChannelDir channelDir, int channelIndex, int lockMode, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr pad_dims) { if (op.getProducerTileOp().isShimTile()) { createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); - } else if (op.getProducerTileOp().isMemTile()) { + } else if (op.getProducerTileOp().isMemTile() && + channelDir == DMAChannelDir::MM2S) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, - dims); + dims, pad_dims); + } else if (op.getProducerTileOp().isMemTile() && + channelDir == DMAChannelDir::S2MM) { + createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, + dims, nullptr); } else { createAIETileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); @@ -602,7 +613,7 @@ struct AIEObjectFifoStatefulTransformPass builder.setInsertionPointToStart(curr); createBdBlock(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], /*offset*/ 0, - len, channelDir, blockIndex, succ, dims); + len, channelDir, blockIndex, succ, dims, nullptr); curr = succ; blockIndex++; } @@ -678,7 +689,7 @@ struct AIEObjectFifoStatefulTransformPass createBdBlock(builder, op, lockMode, acqNum, relNum, externalBuffersPerFifo[op][blockIndex], /*offset*/ 0, len, channelDir, blockIndex, - succ, dims); + succ, dims, nullptr); curr = succ; blockIndex++; } @@ -689,7 +700,8 @@ struct AIEObjectFifoStatefulTransformPass void createMemTileDMA(DeviceOp &device, OpBuilder &builder, ObjectFifoCreateOp op, DMAChannelDir channelDir, int channelIndex, int lockMode, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, + BDPadLayoutArrayAttr padDimensions) { size_t numBlocks = op.size(); if (numBlocks == 0) return; @@ -710,6 +722,7 @@ struct AIEObjectFifoStatefulTransformPass dims.getValue().drop_front(1)); } } + if (op.getMemtileRepeat().has_value()) repeatCount = op.getMemtileRepeat().value(); @@ -839,7 +852,8 @@ struct AIEObjectFifoStatefulTransformPass offset = extraOffset; createBdBlock(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], offset, - lenOut, channelDir, blockIndex, succ, dims); + lenOut, channelDir, blockIndex, succ, dims, + padDimensions); curr = succ; blockIndex++; } @@ -1303,7 +1317,6 @@ struct AIEObjectFifoStatefulTransformPass auto consumerWireType = WireBundle::DMA; std::set objectFifoTiles; // track cores to check for loops during unrolling - //===------------------------------------------------------------------===// // Split objectFifos into a consumer end and producer end if needed //===------------------------------------------------------------------===// @@ -1446,7 +1459,8 @@ struct AIEObjectFifoStatefulTransformPass DMAChannel producerChan = dmaAnalysis.getMasterDMAChannel(producer.getProducerTile()); createDMA(device, builder, producer, producerChan.direction, - producerChan.channel, 0, producer.getDimensionsToStreamAttr()); + producerChan.channel, 0, producer.getDimensionsToStreamAttr(), + producer.getPadDimensionsAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); @@ -1464,7 +1478,8 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr consumerDims = consumer.getDimensionsFromStreamPerConsumer()[0]; createDMA(device, builder, consumer, consumerChan.direction, - consumerChan.channel, 1, consumerDims); + consumerChan.channel, 1, consumerDims, + consumer.getPadDimensionsAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp index f56f1cee3e..c064ad6702 100644 --- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp +++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp @@ -129,7 +129,7 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase { SmallVector{}, SmallVector{}, SmallVector{}, ArrayRef(staticOffsets), ArrayRef(staticSizes), ArrayRef(staticStrides), - controllerIdPkt, metadata, 0, true); + controllerIdPkt, metadata, 0, true, 0, 0, 0, 0, 0, 0); auto shimRow = builder.getI32IntegerAttr(0); auto shimCol = builder.getI32IntegerAttr(col); diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 8c889553da..ac5a2fa822 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -216,7 +216,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { } LogicalResult rewriteSingleBD(OpBuilder &builder, Block &block, - AIE::TileOp &tile) { + AIE::TileOp &tile, + AIE::DMAChannelDir channelDir) { AIE::DMABDOp bd_op = getBdForBlock(block); const auto &target_model = AIE::getTargetModel(bd_op); MemRefType buffer_type = bd_op.getBuffer().getType(); @@ -237,12 +238,21 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { << len << " bytes falls below minimum hardware transfer unit of " << (addr_granularity / 8) << " bytes."; } - // Process strides/wraps std::optional> dims = bd_op.getDimensions(); llvm::SmallVector sizes = llvm::SmallVector(4, 0); llvm::SmallVector strides = llvm::SmallVector(4, 0); + // Padding + std::optional> padDims = + bd_op.getPadDimensions(); + llvm::SmallVector padBefore = + llvm::SmallVector(4, 0); + llvm::SmallVector padAfter = + llvm::SmallVector(4, 0); + std::fill(padBefore.begin(), padBefore.end(), 0); + std::fill(padAfter.begin(), padAfter.end(), 0); + if (dims && dims->size() > 0) { llvm::SmallVector input_sizes = llvm::SmallVector(4, 1); @@ -260,6 +270,23 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { input_sizes[i] = (*dims)[j].getSize(); input_strides[i] = (*dims)[j].getStride(); } + + if (target_model.isMemTile(tile.getCol(), tile.getRow()) && + channelDir == AIE::DMAChannelDir::MM2S) { + if (padDims && (padDims->size() > dims->size())) + return bd_op->emitOpError() + << "Mismatch number of dimensions between padding(s)" + << " and wrap(s) and stride(s)."; + else if (padDims) + for (size_t i = 0; i < padDims->size(); i++) { + int j = padDims->size() - i - 1; + padBefore[i] = (*padDims)[j].getConstPadBefore(); + padAfter[i] = (*padDims)[j].getConstPadAfter(); + } + } else if (padDims) { + return bd_op->emitOpError() + << "supports padding only for MM2S direction on MemTiles."; + } getHardwareStridesWraps(target_model, buffer_type, input_sizes, input_strides, sizes, strides); if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(), @@ -290,8 +317,16 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { "transfer length, as this is the BD repeat count."; return failure(); } + } else { + if (padDims && target_model.isMemTile(tile.getCol(), tile.getRow()) && + channelDir == AIE::DMAChannelDir::MM2S) { + return bd_op->emitOpError() + << "Padding requires n-d data layouts expressed as " + << "wrap(s) and stride(s)."; + } else if (padDims) { + return bd_op->emitOpError() << "Padding is supported only on MemTiles."; + } } - // find next BD ID, if any uint32_t use_next_bd = 0; uint32_t next_bd_id = 0; @@ -316,7 +351,10 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { /*valid_bd=*/1, /* TODO: Locks */ /*lock_rel_val=*/0, /*lock_rel_id=*/0, /*lock_acq_enable=*/0, - /*lock_acq_val=*/0, /*lock_ackq_id=*/0); + /*lock_acq_val=*/0, /*lock_ackq_id=*/0, /*d0_zero_before=*/padBefore[0], + /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2], + /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1], + /*d2_zero_after=*/padAfter[2]); return setAddressForSingleBD(builder, bd_op, tile); } @@ -392,13 +430,15 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { return failure(); } + auto channelDir = op.getDirection(); + // Lower all BDs for (auto it = body.begin(); it != body.end(); ++it) { Block &block = *it; if (shouldSkipBlock(block)) { continue; } - if (failed(rewriteSingleBD(builder, block, tile))) { + if (failed(rewriteSingleBD(builder, block, tile, channelDir))) { return failure(); } } diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index ca6f7f1a69..54717f2935 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -338,6 +338,12 @@ struct DmaToNpuPattern : OpConversionPattern { auto lock_acq_enable = zero; auto lock_acq_val = zero; auto lock_acq_id = zero; + auto d0_zero_before = zero; + auto d1_zero_before = zero; + auto d2_zero_before = zero; + auto d0_zero_after = zero; + auto d1_zero_after = zero; + auto d2_zero_after = zero; auto issue_token = BoolAttr::get(ctx, false); auto repeat_count = zero; @@ -357,6 +363,9 @@ struct DmaToNpuPattern : OpConversionPattern { // column column = IntegerAttr::get(i32ty, col); + // row + row = IntegerAttr::get(i32ty, 0); + // arg_idx AIEX::RuntimeSequenceOp seq_op = op->getParentOfType(); @@ -447,6 +456,24 @@ struct DmaToNpuPattern : OpConversionPattern { // lock_acq_id + // d0_zero_before + d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore()); + + // d1_zero_before + d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); + + // d2_zero_before + d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); + + // d0_zero_after + d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); + + // d1_zero_after + d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + + // d2_zero_after + d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + // Set the issue_token issue_token = BoolAttr::get(ctx, op.getIssueToken()); // Earlier, all S2MM channels were implicitly assumed to issue a token. @@ -454,12 +481,18 @@ struct DmaToNpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); + // TODO: Need to add a check to only allow zero padding on MM2S channel of + // MemTile As of now, run time MemTile DMA configuration is supported only + // from BD level, not at NpuDmaMemcpyNdOp. + rewriter.create( op->getLoc(), column, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, d0_stride, d1_size, d1_stride, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, - lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id); + lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, + d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, + d1_zero_after, d2_zero_after); uint64_t addr = getBufferDescriptorAddressRegisterAddress( targetModel, op.getId(), col, 0); @@ -581,6 +614,12 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { words[7] |= (op.getLockAcqEnable() & 0x1) << 12; words[7] |= (op.getLockAcqVal() & 0xef) << 5; words[7] |= op.getLockAcqId() & 0xf; + + if (op.getD0ZeroBefore() || op.getD1ZeroBefore() || + op.getD2ZeroBefore() || op.getD0ZeroAfter() || op.getD1ZeroAfter() || + op.getD2ZeroAfter()) { + op->emitError("Zero padding is only available on MemTile"); + } } else if (tm.isMemTile(op.getColumn(), op.getRow())) { bd_addr = (op.getColumn() << tm.getColumnShift()) | (op.getRow() << tm.getRowShift()) | (0xA0000 + bd_id * 0x20); @@ -592,6 +631,7 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { words[0] |= op.getBufferLength() & 0x1ffff; // DMA_BDX_1 + words[1] |= (op.getD0ZeroBefore() & 0x3F) << 26; words[1] |= (op.getNextBd() & 0x3f) << 20; words[1] |= (op.getUseNextBd() & 0x1) << 19; words[1] |= op.getBufferOffset() & 0x7ffff; @@ -602,15 +642,20 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { // DMA_BDX_3 // TODO: Secure Access + words[3] |= (op.getD1ZeroBefore() & 0x1F) << 27; words[3] |= (op.getD1Size() & 0x3ff) << 17; words[3] |= op.getD1Stride() & 0x1ffff; // DMA_BDX_4 // TODO: D2Size + words[4] |= (op.getD2ZeroBefore() & 0xF) << 27; words[4] |= op.getD2Stride() & 0x1ffff; // DMA_BDX_5 // ToDO: D3Stride + words[5] |= (op.getD2ZeroAfter() & 0xF) << 28; + words[5] |= (op.getD1ZeroAfter() & 0x1F) << 23; + words[5] |= (op.getD0ZeroAfter() & 0x3F) << 17; // DMA_BDX_6 words[6] |= (op.getIterationCurrent() & 0x3f) << 23; diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp index 9c11596119..3e227310cf 100644 --- a/programming_examples/basic/passthrough_dmas/test.cpp +++ b/programming_examples/basic/passthrough_dmas/test.cpp @@ -192,4 +192,4 @@ int main(int argc, const char *argv[]) { std::cout << std::endl << "fail." << std::endl << std::endl; return 1; } -} +} \ No newline at end of file diff --git a/python/dialects/aie.py b/python/dialects/aie.py index 4d672b45be..717f430f92 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -107,6 +107,12 @@ def bd_dim_layout(size, stride): return Attribute.parse(f"#aie.bd_dim_layout<{size=}, {stride=}>") +def bd_pad_layout(const_pad_before, const_pad_after): + return Attribute.parse( + f"#aie.bd_pad_layout<{const_pad_before=}, {const_pad_after=}>" + ) + + @register_attribute_builder("BDDimLayoutArrayAttr") def bd_dim_layout_array_attr_builder(tups: List[Attribute | Tuple[int]], context=None): if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups): @@ -125,6 +131,17 @@ def bd_dim_layout_array_array_attr_builder(tup_arrs: List[List[tuple]], context= ) +@register_attribute_builder("BDPadLayoutArrayAttr") +def bd_pad_layout_array_attr_builder( + tups: List[Union[Attribute, Tuple[int]]], context=None +): + if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups): + tups = list(map(lambda t: bd_pad_layout(*t), tups)) + return Attribute.parse( + f'#aie', context=context + ) + + @register_attribute_builder("AIEI1Attr") def _i1Attr(x, context): return IntegerAttr.get(IntegerType.get_signless(1, context=context), x) @@ -378,6 +395,7 @@ def __init__( dimensionsFromStreamPerConsumer=None, via_DMA=None, plio=None, + padDimensions=None, disable_synchronization=None, ): self.datatype = try_convert_np_type_to_mlir_type(datatype) @@ -398,6 +416,7 @@ def __init__( dimensionsFromStreamPerConsumer=dimensionsFromStreamPerConsumer, via_DMA=via_DMA, plio=plio, + padDimensions=padDimensions, disable_synchronization=disable_synchronization, ) diff --git a/python/utils/trace.py b/python/utils/trace.py index 668455881e..07a7a3dcb8 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -527,9 +527,15 @@ def configure_shimtile_tracing_aie2( column=int(shim.col), d0_size=0, d0_stride=0, + d0_zero_after=0, + d0_zero_before=0, d1_size=0, d1_stride=0, + d1_zero_after=0, + d1_zero_before=0, d2_stride=0, + d2_zero_after=0, + d2_zero_before=0, iteration_current=0, iteration_size=0, iteration_stride=0, diff --git a/test/Targets/NPU/npu_blockwrite_instgen.mlir b/test/Targets/NPU/npu_blockwrite_instgen.mlir index 4ba0b41342..f38a24d5d0 100644 --- a/test/Targets/NPU/npu_blockwrite_instgen.mlir +++ b/test/Targets/NPU/npu_blockwrite_instgen.mlir @@ -46,9 +46,15 @@ module { row = 1 : i32, d0_stride = 5 : i32, d0_size = 6 : i32, + d0_zero_after = 0 : i32, + d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 8 : i32, + d1_zero_after = 0 : i32, + d1_zero_before = 0 : i32, d2_stride = 9 : i32, + d2_zero_after = 0 : i32, + d2_zero_before = 0 : i32, ddr_id = 10 : i32, iteration_current = 11 : i32, iteration_stride = 12 : i32, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir new file mode 100644 index 0000000000..8ff16ccaf1 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir @@ -0,0 +1,25 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_1, S2MM, 0) { + // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir new file mode 100644 index 0000000000..d0291b038f --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir @@ -0,0 +1,26 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir new file mode 100644 index 0000000000..466c73b929 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir @@ -0,0 +1,26 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir new file mode 100644 index 0000000000..45f95e0056 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir @@ -0,0 +1,27 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + // expected-error@+1 {{Mismatch number of dimensions between padding(s) and wrap(s) and stride(s).}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], [, ]) + {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir new file mode 100644 index 0000000000..3e58b8a5af --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir @@ -0,0 +1,26 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + // expected-error@+1 {{Padding is supported only on MemTiles.}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir index 2ad275b804..61601b91b3 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir @@ -17,13 +17,13 @@ module { %tile_2_0 = aie.tile(2, 0) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task(%tile_2_0, S2MM, 1) { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} @@ -40,5 +40,4 @@ module { aiex.dma_await_task(%t2) } } -} - +} \ No newline at end of file diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir index b57cbc81bd..286ad32f52 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir @@ -16,11 +16,11 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} - // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32} - // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118852 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 0 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir index 191f1511ee..798201879e 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir @@ -16,7 +16,7 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir index 22df05bca5..143feb9e1b 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir @@ -19,7 +19,7 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { aie.dma_bd(%buf : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir index 5a6519a4ee..e026d9e829 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir @@ -16,14 +16,14 @@ module { aie.shim_dma_allocation @alloc1 (S2MM, 1, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task_for @alloc0 { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task_for @alloc1 { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} aie.end diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir new file mode 100644 index 0000000000..a6b9fa6bbb --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir @@ -0,0 +1,28 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 2 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir new file mode 100644 index 0000000000..6baa6be36a --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir @@ -0,0 +1,28 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_stride = 0 : i32, d2_zero_after = 1 : i32, d2_zero_before = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], [, , ]) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir index 383f6ac567..5be345197e 100644 --- a/test/dialect/AIEX/bad_npu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -15,7 +15,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -26,7 +26,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -37,7 +37,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -48,7 +48,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } \ No newline at end of file diff --git a/test/objectFifo-stateful-transform/memtile_padding_test.mlir b/test/objectFifo-stateful-transform/memtile_padding_test.mlir new file mode 100644 index 0000000000..0fe3b75af7 --- /dev/null +++ b/test/objectFifo-stateful-transform/memtile_padding_test.mlir @@ -0,0 +1,182 @@ +//===- memtile_padding_test.mlir --------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s + // CHECK: %tile_0_0 = aie.tile(0, 0) + // CHECK: %tile_0_1 = aie.tile(0, 1) + // CHECK: %tile_0_2 = aie.tile(0, 2) + // CHECK: %objFifo_out0_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "objFifo_out0_cons_prod_lock"} + // CHECK: %objFifo_out0_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_cons_lock"} + // CHECK: %objFifo_out1_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_out1_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_out1_cons_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out1_cons_prod_lock"} + // CHECK: %objFifo_out1_cons_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_cons_lock"} + // CHECK: %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"} + // CHECK: %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"} + // CHECK: %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + // CHECK: %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + // CHECK: %objFifo_in1_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_in1_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_in1_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in1_prod_lock"} + // CHECK: %objFifo_in1_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_lock"} + // CHECK: %objFifo_in0_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "objFifo_in0_prod_lock"} + // CHECK: %objFifo_in0_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_lock"} + // CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + // CHECK: aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + // CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + // CHECK: aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) + // CHECK: %core_0_2 = aie.core(%tile_0_2) { + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: %c0 = arith.constant 0 : index + // CHECK: %c1 = arith.constant 1 : index + // CHECK: %c64 = arith.constant 64 : index + // CHECK: %c12_i8 = arith.constant 12 : i8 + // CHECK: scf.for %arg0 = %c0 to %c64 step %c1 { + // CHECK: scf.for %arg1 = %c0 to %c64 step %c1 { + // CHECK: %0 = memref.load %objFifo_in1_cons_buff_0[%arg0, %arg1] : memref<64x64xi8> + // CHECK: %1 = arith.addi %0, %c12_i8 : i8 + // CHECK: memref.store %1, %objFifo_in1_cons_buff_0[%arg0, %arg1] : memref<64x64xi8> + // CHECK: } + // CHECK: } + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + // CHECK: aie.end + // CHECK: } + // CHECK: aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + // CHECK: aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64x64xi8> + // CHECK: aiex.npu.dma_wait {symbol = @objFifo_out0} + // CHECK: } + // CHECK: %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + // CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + // CHECK: ^bb1: + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb2 + // CHECK: ^bb2: + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb1 + // CHECK: ^bb3: + // CHECK: %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + // CHECK: ^bb4: + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb5 + // CHECK: ^bb5: + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb4 + // CHECK: ^bb6: + // CHECK: %2 = aie.dma_start(S2MM, 1, ^bb7, ^bb9) + // CHECK: ^bb7: + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb8 + // CHECK: ^bb8: + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb7 + // CHECK: ^bb9: + // CHECK: %3 = aie.dma_start(MM2S, 1, ^bb10, ^bb12) + // CHECK: ^bb10: + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096, [, ], [, ]) + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb11 + // CHECK: ^bb11: + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096, [, ], [, ]) + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb10 + // CHECK: ^bb12: + // CHECK: aie.end + // CHECK: } + // CHECK: aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + // CHECK: %mem_0_2 = aie.mem(%tile_0_2) { + // CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + // CHECK: ^bb1: + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb2 + // CHECK: ^bb2: + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb1 + // CHECK: ^bb3: + // CHECK: %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + // CHECK: ^bb4: + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb5 + // CHECK: ^bb5: + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb4 + // CHECK: ^bb6: + // CHECK: aie.end + // CHECK: } + +module { + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + aie.objectfifo @objFifo_in0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_in1(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ([] []) + aie.objectfifo @objFifo_out1(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_out0(%tile_0_1 dimensionsToStream [, ], {%tile_0_0}, 2 : i32) {padDimensions = #aie, ]>} : !aie.objectfifo> + aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ([] []) + %core_0_2 = aie.core(%tile_0_2) { + %subview = aie.objectfifo.acquire @objFifo_in1 (Consume, 1) : !aie.objectfifosubview> + %subview1 = aie.objectfifo.acquire @objFifo_out1 (Produce, 1) : !aie.objectfifosubview> + %elem = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<64x64xi8> + %elem1 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<64x64xi8> + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c12_i8 = arith.constant 12 : i8 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %elem[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %elem1[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.objectfifo.release @objFifo_in1 (Consume, 1) + aie.objectfifo.release @objFifo_out1 (Produce, 1) + aie.end + } + + aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + } +} \ No newline at end of file diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index c3a02a201a..8039dda9ea 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -13,7 +13,7 @@ # CHECK: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} # CHECK: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} # CHECK: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} # CHECK: aiex.npu.address_patch {addr = 118884 : ui32, arg_idx = 2 : i32, arg_plus = 1024 : i32} # CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32} diff --git a/test/python/zero_pad.py b/test/python/zero_pad.py new file mode 100644 index 0000000000..8257a18271 --- /dev/null +++ b/test/python/zero_pad.py @@ -0,0 +1,71 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# RUN: %python %s | FileCheck %s +# CHECK: aie.objectfifo @out(%tile_0_1 dimensionsToStream [, ], {%tile_0_0}, 1 : i32) {padDimensions = #aie, ]>} : !aie.objectfifo> +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ + +N = 56 +dev = AIEDevice.npu1_1col +col = 0 + +if len(sys.argv) > 1: + N = int(sys.argv[1]) + +if len(sys.argv) > 2: + if sys.argv[2] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[2] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[2])) + +if len(sys.argv) > 3: + col = int(sys.argv[3]) + + +def my_passthrough(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(25, T.i32()) + memRef_ty2 = T.memref(56, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + MemTile = tile(col, 1) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_ty) + of_out = object_fifo( + "out", + MemTile, + ShimTile, + 1, + memRef_ty2, + dimensionsToStream=[(5, 5), (5, 5)], + padDimensions=[(2, 0), (3, 0)], + ) + object_fifo_link(of_in, of_out) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd( + metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True + ) + npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N]) + dma_wait(of_in, of_out) + + print(ctx.module) + + +my_passthrough()