diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 3b07d4512..7a45f9454 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -142,6 +142,8 @@ def generate_aie_vmfb( f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}", f"--iree-hal-dump-executable-files-to={config.output_dir}", "--iree-scheduling-optimize-bindings=false", + "--iree-hal-memoization=false", + "--iree-hal-indirect-command-buffers=false", f"--mlir-disable-threading", "--mlir-elide-resource-strings-if-larger=10", ] diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index fb4b01a77..50649e8fd 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -405,6 +405,8 @@ function run_matmul_test() { --iree-amd-aie-enable-chess=${use_chess} \ --iree-amdaie-enable-packet-flow=${enable_packet_flow} \ --iree-hal-dump-executable-files-to=$PWD \ + --iree-hal-memoization=false \ + --iree-hal-indirect-command-buffers=false \ --mlir-elide-resource-strings-if-larger=10 \ --iree-amd-aie-show-invoked-commands" @@ -416,6 +418,8 @@ function run_matmul_test() { set +e echo "**** Generating matmul .vmfb file for ${name} ****" + ${IREE_COMPILE_EXE} "${matmul_ir}" \ + ${compilation_flags} --compile-to=vm -o "${matmul_vmfb}.vm" ${IREE_COMPILE_EXE} "${matmul_ir}" \ ${compilation_flags} -o "${matmul_vmfb}" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td index 109715633..9b38f3bfa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td @@ -19,8 +19,12 @@ def AMDAIE_ConnectionType: I32EnumAttr<"ConnectionType", ] > { let cppNamespace = "mlir::iree_compiler::AMDAIE"; + let genSpecializedAttr = 0; } +def AMDAIE_ConnectionTypeAttr + : EnumAttr; + def AMDAIE_CopyOpOperateOn: I32EnumAttr<"CopyOpOperateOn", 
"Enables templated functions that operate on either source or target of " "copy/dma operations", diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index d10f44458..6d6d99aee 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -445,13 +445,14 @@ void CircularDmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, void ConnectionOp::build(mlir::OpBuilder &b, mlir::OperationState &result, Value target, Value source) { - build(b, result, target, {}, source, {}, nullptr); + build(b, result, target, {}, source, {}, nullptr, nullptr); } void ConnectionOp::build(mlir::OpBuilder &b, mlir::OperationState &result, Value target, ValueRange targetChannels, Value source, ValueRange sourceChannels) { - build(b, result, target, targetChannels, source, sourceChannels, nullptr); + build(b, result, target, targetChannels, source, sourceChannels, nullptr, + nullptr); } FailureOr @@ -469,6 +470,22 @@ ConnectionOp::getNpuCircularDmaCpyNdUser() { return npuDmaUsers[0]; } +std::optional ConnectionOp::getFlowOp() { + return dyn_cast_if_present(getFlow().getDefiningOp()); +} + +//===----------------------------------------------------------------------===// +// AMDAIE_FlowOp +//===----------------------------------------------------------------------===// + +LogicalResult FlowOp::verify() { + if (getSources().size() > 1 && getTargets().size() > 1) { + return emitOpError() + << "multiple source and multiple targets is unsupported"; + } + return success(); +} + //===----------------------------------------------------------------------===// // AMDAIE_LockOp //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index f14449ccd..324424ef0 
100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -91,6 +91,42 @@ def AMDAIE_EndOp: AMDAIE_Op<"end", [Terminator]> { let assemblyFormat = [{ attr-dict }]; } +def AMDAIE_FlowOp: AMDAIE_Op<"flow", [AttrSizedOperandSegments]>, + Results<(outs Index)> { + let summary = "The data connection between a set of source and target " + "channels."; + let description = [{ + This operation represents a connection between source and target channels. + This is used to describe a logical data routing configuration between + channels, to be solved by the router for actual stream switch + configurations that implements it. The multiple sources and targets can + describe different connection patterns: + - Single source and multiple targets describes a data broadcasting pattern. + - Multiple sources and single target describes a data merge pattern. + - Multiple sources and multiple targets is not supported. + + Example: + + ```mlir + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_1_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1, %channel_2}) + {is_packet_flow = true, packet_id = 0 : ui8} + ``` + }]; + + let arguments = ( + ins Variadic:$sources, + Variadic:$targets, + BoolAttr:$is_packet_flow, + OptionalAttr:$packet_id + ); + + let assemblyFormat = [{ `(` `{` $sources `}` `->` `{` $targets `}` `)` attr-dict }]; + let hasVerifier = 1; +} + def AMDAIE_TileOp: AMDAIE_Op<"tile", [ Pure, DeclareOpInterfaceMethods @@ -319,20 +355,23 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ ```mlir %tile = amdaie.tile(%c0, %c0) - %channel = amdaie.channel(%tile, 0) + %channel = amdaie.channel(%tile, 0, port_type = DMA) ``` }]; let arguments = ( ins Index:$tile, - ConfinedAttr]>:$value + ConfinedAttr]>:$value, + StrmSwPortTypeAttr:$port_type ); let extraClassDeclaration = [{ TileOp 
getTileOp(); }]; - let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }]; + let assemblyFormat = [{ + `(` $tile `,` $value `,` `port_type` `=` $port_type `)` attr-dict + }]; } //===----------------------------------------------------------------------===// @@ -733,7 +772,8 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection", Variadic:$target_channels, AnyAMDAIELogicalObjectFifoType:$source, Variadic:$source_channels, - OptionalAttr:$connection_type + OptionalAttr:$connection_type, + Optional:$flow ); let results = (outs Index:$result); @@ -745,8 +785,8 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection", `,` $source ( ` ` `{` $source_channels^ `}` )? - ( `,` `connection_type` `=` $connection_type^ )? - `)` + ( `,` `flow` `=` $flow^ )? + `)` attr-dict `:` `(` type($target) `,` type($source) `)` }]; @@ -758,6 +798,7 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection", ]; let extraClassDeclaration = [{ + std::optional getFlowOp(); Value getSourceMemref() { return getSource(); } Value getTargetMemref() { return getTarget(); } Type getSourceType() { return getSource().getType(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 47054c91f..edefe21ab 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -148,6 +148,29 @@ func.func @dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo {%[[CHANNEL_1]]}) {is_packet_flow = false} +// CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 1 : ui8} +func.func @flow() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) 
{is_packet_flow = false} + %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true, packet_id = 1 : ui8} + return +} + +// ----- + // CHECK-LABEL: func.func @lock // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index ec8c43482..59da53759 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 
l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ 
max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp index 40e9c9a4b..d0440336b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp @@ -45,20 +45,20 @@ LogicalResult assignChannels(AMDAIE::WorkgroupOp workgroupOp) { for (Value tile : sourceLogicalObjFifo.getTiles()) { uint8_t channel = generator.getProducerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); sourceChannels.push_back(channelOp.getResult()); } SmallVector targetChannels; for (Value tile : targetLogicalObjFifo.getTiles()) { uint8_t channel = generator.getConsumerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); targetChannels.push_back(channelOp.getResult()); } rewriter.replaceOpWithNewOp( connectionOp, connectionOp.getTarget(), targetChannels, connectionOp.getSource(), sourceChannels, - connectionOp.getConnectionTypeAttr()); + connectionOp.getConnectionTypeAttr(), /*flow*/ nullptr); } return success(); } diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp index 7c131ec61..e475e7ca2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp @@ -43,7 +43,7 @@ void AMDAIEAssignConnectionTypesPass::runOnOperation() { rewriter.replaceOpWithNewOp( connectionOp, connectionOp.getTarget(), connectionOp.getTargetChannels(), connectionOp.getSource(), - connectionOp.getSourceChannels(), connectionTypeAttr); + connectionOp.getSourceChannels(), connectionTypeAttr, /*flow*/ nullptr); return WalkResult::advance(); }); if (res.wasInterrupted()) return signalPassFailure(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignPacketIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignPacketIds.cpp new file mode 100644 index 000000000..72086f0f3 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignPacketIds.cpp @@ -0,0 +1,67 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" + +#define DEBUG_TYPE "iree-amdaie-assign-packet-ids" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +class AMDAIEAssignPacketIdsPass + : public impl::AMDAIEAssignPacketIdsBase { + public: + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEAssignPacketIdsPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to assign packet IDs " + "within the resource constraints"; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + auto ui8ty = + IntegerType::get(rewriter.getContext(), 8, IntegerType::Unsigned); + int pktFlowIndex{0}; + WalkResult res = parentOp->walk([&](AMDAIE::FlowOp flowOp) { + bool isPacketFlow = flowOp.getIsPacketFlow(); // Only these take an ID. + if (isPacketFlow && pktFlowIndex > deviceModel.getPacketIdMaxIdx()) { + flowOp.emitOpError() << "ran out of packet IDs to assign"; + return WalkResult::interrupt(); + } + rewriter.setInsertionPoint(flowOp); + IntegerAttr pktIdAttr =
+ isPacketFlow ? IntegerAttr::get(ui8ty, pktFlowIndex++) : nullptr; + rewriter.replaceOpWithNewOp( + flowOp, flowOp.getSources(), flowOp.getTargets(), + flowOp.getIsPacketFlow(), pktIdAttr); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEAssignPacketIdsPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConnectionToFlow.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConnectionToFlow.cpp new file mode 100644 index 000000000..4d0915e5c --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConnectionToFlow.cpp @@ -0,0 +1,57 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" + +#define DEBUG_TYPE "iree-amdaie-connection-to-flow" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +class AMDAIEConnectionToFlowPass + : public impl::AMDAIEConnectionToFlowBase { + public: + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEConnectionToFlowPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + // TODO(jornt): currently, don't delete connections as they are still + // needed for lowering to AIE dialect dma_bds. This will be changed in the + // future. 
+ WalkResult res = parentOp->walk([&](AMDAIE::ConnectionOp connectionOp) { + rewriter.setInsertionPoint(connectionOp); + std::optional connectionType = + connectionOp.getConnectionType(); + bool isPacketFlow = connectionType && connectionType.value() == + AMDAIE::ConnectionType::Packet; + auto flowOp = rewriter.create( + rewriter.getUnknownLoc(), connectionOp.getSourceChannels(), + connectionOp.getTargetChannels(), isPacketFlow, /*packetId*/ nullptr); + rewriter.replaceOpWithNewOp( + connectionOp, connectionOp.getTarget(), + connectionOp.getTargetChannels(), connectionOp.getSource(), + connectionOp.getSourceChannels(), connectionOp.getConnectionTypeAttr(), + flowOp); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEConnectionToFlowPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp index 0c2a23ff5..1120b3d1c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp @@ -27,8 +27,7 @@ namespace { /// Converts `scf.forall` into nested `scf.for` and then coalesce the `scf.for` /// loops. 
-LogicalResult coreForallToFor(RewriterBase &rewriter, - AMDAIE::CoreOp coreOp) { +LogicalResult coreForallToFor(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) { WalkResult res = coreOp->walk([&](scf::ForallOp forallOp) { SmallVector forOpResults; if (failed(scf::forallToForLoop(rewriter, forallOp, &forOpResults))) { @@ -55,12 +54,12 @@ class AMDAIEConvertCoreForallToForPass AMDAIEConvertCoreForallToForPass> { public: void getDependentDialects(DialectRegistry &registry) const override { - registry.insert(); + registry.insert(); } AMDAIEConvertCoreForallToForPass() = default; AMDAIEConvertCoreForallToForPass( - const AMDAIEConvertCoreForallToForPass &pass){}; + const AMDAIEConvertCoreForallToForPass &pass) {}; void runOnOperation() override; }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index db1cefb14..4d361392d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -100,7 +100,7 @@ void AIEDeviceBuilder::createDMA( AIE::BDDimLayoutArrayAttr dims, size_t acqNum, size_t relNum, int64_t len, int64_t offset, const SmallVector &bufferOps, const std::pair &locks, - AIE::PacketFlowOp pktFlowOp) { + std::optional pktId) { OpBuilder::InsertionGuard g(rewriter); Block &endBlock = memOp->getRegion(0).getBlocks().back(); assert(!endBlock.getOps().empty() && @@ -123,10 +123,10 @@ void AIEDeviceBuilder::createDMA( acqNum); // Insert a packet op for MM2S DMAs if part of a packet flow. Only do this // for MM2S DMA ports as only those can insert packet headers. 
- if (channelDir == AIE::DMAChannelDir::MM2S && pktFlowOp) { + if (channelDir == AIE::DMAChannelDir::MM2S && pktId) { rewriter.create(rewriter.getUnknownLoc(), /*pkt_type*/ 0, - /*pkt_id*/ pktFlowOp.getID()); + /*pkt_id*/ pktId.value()); } if (!dims.getValue().empty()) { rewriter.create(rewriter.getUnknownLoc(), buff, offset, len, @@ -155,32 +155,29 @@ void AIEDeviceBuilder::createDMA( } SmallVector AIEDeviceBuilder::createFlowOps( - AMDAIE::ConnectionOp connectionOp, - ArrayRef producerChannels, + AMDAIE::FlowOp flowOp, ArrayRef producerChannels, ArrayRef consumerChannels) { LLVM_DEBUG(llvm::dbgs() << "-- createFlowOps\n"); OpBuilder::InsertionGuard g(rewriter); SmallVector flowOps; for (AMDAIE::ChannelOp producerChannel : producerChannels) { Value aieProducerTile = mapper.lookup(producerChannel.getTile()); - std::optional connectionType = - connectionOp.getConnectionType(); - if (connectionType && - connectionType.value() == AMDAIE::ConnectionType::Packet) { + std::optional pktId = flowOp.getPacketId(); + if (pktId) { OpBuilder::InsertionGuard gg(rewriter); AIE::PacketFlowOp pktFlow = rewriter.create( - rewriter.getUnknownLoc(), pktFlowIndex++, nullptr, nullptr); + rewriter.getUnknownLoc(), pktId.value(), nullptr, nullptr); Region &r_pktFlow = pktFlow.getPorts(); Block *b_pktFlow = rewriter.createBlock(&r_pktFlow); rewriter.setInsertionPointToStart(b_pktFlow); rewriter.create( - rewriter.getUnknownLoc(), aieProducerTile, AIE::WireBundle::DMA, - producerChannel.getValue()); + rewriter.getUnknownLoc(), aieProducerTile, + producerChannel.getPortType(), producerChannel.getValue()); for (AMDAIE::ChannelOp consumerChannel : consumerChannels) { Value aieConsumerTile = mapper.lookup(consumerChannel.getTile()); rewriter.create( - rewriter.getUnknownLoc(), aieConsumerTile, AIE::WireBundle::DMA, - consumerChannel.getValue()); + rewriter.getUnknownLoc(), aieConsumerTile, + consumerChannel.getPortType(), consumerChannel.getValue()); } 
rewriter.create(rewriter.getUnknownLoc()); flowOps.push_back(pktFlow.getOperation()); @@ -436,12 +433,11 @@ LogicalResult AIEDeviceBuilder::npuDmaCpyNdOpToAIE( memOps = connectionToSourceTargetMemOps[connectionOp].first; // Set the packet info attribute for MM2S DMAs, operating on a packet flow // connection. - SmallVector flowOps = connectionToFlowOps.at(connectionOp); - if (flowOps.size() == 1 && isa(flowOps[0])) { - auto flowOp = cast(flowOps[0]); - pktInfoAttr = - AIE::PacketInfoAttr::get(rewriter.getContext(), - /*pkt_type*/ 0, /*pkt_id*/ flowOp.getID()); + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (maybeFlowOp && maybeFlowOp->getPacketId()) { + pktInfoAttr = AIE::PacketInfoAttr::get( + rewriter.getContext(), + /*pkt_type*/ 0, /*pkt_id*/ maybeFlowOp->getPacketId().value()); } } else if (dmaOp.getTarget()) { offsets = dmaOp.getTargetOffsets(); @@ -642,17 +638,10 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( } consumerChannels.push_back(channelOp); } - // Insert flow ops. - rewriter.setInsertionPointToEnd(deviceBlock); - SmallVector flowOps = - createFlowOps(connectionOp, producerChannels, consumerChannels); - connectionToFlowOps[connectionOp] = flowOps; - // If the connection has been converted into a single packet flow op, retrieve - // it for creating the DMA ops down below. - AIE::PacketFlowOp pktFlowOp; - if (flowOps.size() == 1 && isa(flowOps[0])) { - pktFlowOp = cast(flowOps[0]); - } + + std::optional maybeFlowOp = connectionOp.getFlowOp(); + std::optional packetId = + maybeFlowOp ? 
maybeFlowOp->getPacketId() : std::nullopt; FailureOr maybeNpuDmaUserOp = connectionOp.getNpuCircularDmaCpyNdUser(); @@ -743,7 +732,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); createDMA(memOp, AIE::DMAChannelDir::MM2S, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, - lockPair, pktFlowOp); + lockPair, packetId); } } @@ -831,7 +820,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); createDMA(memOp, AIE::DMAChannelDir::S2MM, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, - lockPair, pktFlowOp); + lockPair, packetId); } } @@ -842,6 +831,38 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( return success(); } +/// Convert the `amdaie.flow` ops into `aie.flow` ops. +LogicalResult AIEDeviceBuilder::flowToAIE(AMDAIE::FlowOp flowOp, + Block *deviceBlock) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::FlowOp]\n"); + // Resolve the flow's source/target operands to their `amdaie.channel` ops. + SmallVector producerChannels; + SmallVector consumerChannels; + for (Value producerChannel : flowOp.getSources()) { + auto channelOp = + dyn_cast_if_present(producerChannel.getDefiningOp()); + if (!channelOp) { + return flowOp.emitOpError() + << "found non-`amdaie.channel` source channel"; + } + producerChannels.push_back(channelOp); + } + for (Value consumerChannel : flowOp.getTargets()) { + auto channelOp = + dyn_cast_if_present(consumerChannel.getDefiningOp()); + if (!channelOp) { + return flowOp.emitOpError() + << "found non-`amdaie.channel` target channel"; + } + consumerChannels.push_back(channelOp); + } + // Insert flow ops. 
+ rewriter.setInsertionPointToEnd(deviceBlock); + SmallVector flowOps = + createFlowOps(flowOp, producerChannels, consumerChannels); + return success(); +} + LogicalResult AIEDeviceBuilder::lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock, int &lockIndex) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LockOp]\n"); @@ -985,6 +1006,12 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE( } return WalkResult::skip(); }) + .Case([&](auto flowOp) { + if (failed(flowToAIE(flowOp, deviceBlock))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }) .Case([&](auto lockOp) { if (failed(lockToAIE(lockOp, deviceBlock, lockId))) { return WalkResult::interrupt(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h index d46f5f232..88ec017cd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h @@ -59,6 +59,7 @@ class AIEDeviceBuilder { int &bufferId); LogicalResult connectionToAIE(AMDAIE::ConnectionOp connectionOp, Block *deviceBlock, int &connectionIndex); + LogicalResult flowToAIE(AMDAIE::FlowOp flowOp, Block *deviceBlock); LogicalResult lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock, int &lockIndex); LogicalResult logicalObjFifoFromBuffersToAIE( @@ -83,12 +84,11 @@ class AIEDeviceBuilder { size_t acqNum, size_t relNum, int64_t len, int64_t offset, const SmallVector &bufferOps, const std::pair &locks, - AIE::PacketFlowOp pktFlowOp); + std::optional pktId); /// Utility to create flow ops from connection ops. 
SmallVector createFlowOps( - AMDAIE::ConnectionOp connectionOp, - ArrayRef producerChannels, + AMDAIE::FlowOp flowOp, ArrayRef producerChannels, ArrayRef consumerChannels); /// Utility to create `aie.shim_dma_allocation` ops and corresponding global @@ -122,9 +122,6 @@ class AIEDeviceBuilder { IRMapping mapper; /// Dedicated mapper for the HAL bindings. IRMapping bindingsMapper; - /// Index used to create unique packet flows. Expected to be incremented after - /// a new packet flow op is created. - int pktFlowIndex{0}; /// Map from tile values to AIE memory op (`aie.mem` or `aie.memtile_dma`). /// This is used to look up and add new DMA patterns to those memory ops. DenseMap tileToMemOpMap; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index 727047874..1a583444d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -343,13 +343,15 @@ void AMDAIETileAndFusePass::runOnOperation() { if (isTilingReductionDimension(consumerOp, tileSizesVal)) { tileAndFuseOptions.setFusionControlFn( [&](tensor::ExtractSliceOp sliceOp, OpResult originalProducer, - bool isDestinationOperand) -> std::tuple { - return {false, false}; + bool isDestinationOperand) + -> std::optional { + return std::nullopt; }); } else { tileAndFuseOptions.setFusionControlFn( [&](tensor::ExtractSliceOp sliceOp, OpResult originalProducer, - bool isDestinationOperand) -> std::tuple { + bool isDestinationOperand) + -> std::optional { bool fusableOp = TypeSwitch(originalProducer.getOwner()) // List ops that shouldnt be fused. 
@@ -360,7 +362,8 @@ void AMDAIETileAndFusePass::runOnOperation() { return op->getDialect() == context->getLoadedDialect(); }); - return {fusableOp, false}; + if (!fusableOp) return std::nullopt; + return scf::SCFTileAndFuseOptions::ControlFnResult{false}; }); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 758149c62..1c91b4d90 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -51,10 +51,12 @@ iree_cc_library( "AMDAIEAssignConnectionTypes.cpp" "AMDAIEAssignLogicalObjectFifoDepth.cpp" "AMDAIEAssignNpuDmaBdIds.cpp" + "AMDAIEAssignPacketIds.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeNpuDmaCpyNd.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" "AMDAIECombineStridedOps.cpp" + "AMDAIEConnectionToFlow.cpp" "AMDAIEConvertToDma.cpp" "AMDAIEControlCodeLoopUnroll.cpp" "AMDAIEConvertCoreForallToFor.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index be7094123..5deb1c72d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -26,12 +26,14 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEASSIGNCHANNELS #define GEN_PASS_DEF_AMDAIEASSIGNLOGICALOBJECTFIFODEPTH #define GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS +#define GEN_PASS_DEF_AMDAIEASSIGNPACKETIDS #define GEN_PASS_DEF_AMDAIEBRIDGETOAIR #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP #define GEN_PASS_DEF_AMDAIECANONICALIZENPUDMACPYND #define GEN_PASS_DEF_AMDAIECLEANUP #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS +#define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL 
#define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR #define GEN_PASS_DEF_AMDAIECREATEAIEWORKGROUP diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index dfd7a02cb..b327a72cb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -612,6 +612,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEObjFifoBufferizationPass()); + passManager.addPass(createAMDAIEConnectionToFlowPass()); + passManager.addPass(createAMDAIEAssignPacketIdsPass()); addAMDAIEToAIEPasses(passManager); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 6e4fc07fd..79618316d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -86,6 +86,9 @@ std::unique_ptr createAMDAIEAssignLogicalObjectFifoDepthPass( /// Create a pass to assign BD ids to `amdaie.npu.dma_cpy_nd` operations. std::unique_ptr createAMDAIEAssignNpuDmaBdIdsPass(); +/// Create a pass to assign packet ids to `amdaie.flow` operations. +std::unique_ptr createAMDAIEAssignPacketIdsPass(); + /// Create a pass to do some rewrites that help bridging the path to AIR/AIE /// lowering. std::unique_ptr createAMDAIEBridgeToAIRPass(); @@ -102,6 +105,9 @@ std::unique_ptr createAMDAIECanonicalizeNpuDmaCpyNdPass(); std::unique_ptr createAMDAIECanonicalizeDoublyStridedOpPass( AMDAIECanonicalizeDoublyStridedOpOptions options = {}); +/// Create pass to create `amdaie.flow` ops for connections. +std::unique_ptr createAMDAIEConnectionToFlowPass(); + /// Pass to unroll the loops within the control code regions. 
std::unique_ptr createAMDAIEControlCodeLoopUnrollPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index f10c9f845..b80eac738 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -61,6 +61,12 @@ def AMDAIEAssignNpuDmaBdIds : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignNpuDmaBdIdsPass()"; } +def AMDAIEAssignPacketIds : + Pass<"iree-amdaie-assign-packet-ids", ""> { + let summary = "Assign packet ids to `amdaie.flow` operations."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignPacketIdsPass()"; +} + def AMDAIEBridgeToAIR : Pass<"iree-amdaie-bridge-to-air", ""> { let summary = "Perform transformations that allow hooking into AIR/AIE lowering"; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEBridgeToAIRPass()"; @@ -133,6 +139,12 @@ def AMDAIECombineStridedOps : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECombineStridedOpsPass()"; } +def AMDAIEConnectionToFlow : + Pass<"iree-amdaie-connection-to-flow", ""> { + let summary = "Create flow ops for connections."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConnectionToFlowPass()"; +} + def AMDAIEControlCodeLoopUnroll : Pass<"iree-amdaie-controlcode-loop-unroll", ""> { let summary = "Unroll the loops in the control code regions."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index f4004a9a5..c083125bd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -15,11 +15,13 @@ iree_lit_test_suite( "assign_connection_types.mlir" "assign_logical_objectfifo_depth.mlir" "assign_npu_dma_bd_ids.mlir" + 
"assign_packet_ids.mlir" "bridge_to_air.mlir" "bufferize_to_allocation.mlir" "canonicalize_doubly_strided_op.mlir" "canonicalize_npu_dma_cpy_nd.mlir" "combine_strided_ops.mlir" + "connection_to_flow.mlir" "controlcode_loop_unrolling.mlir" "convert_core_forall_to_for.mlir" "create_aie_workgroup.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir index 80855c059..92b0f691b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir @@ -6,14 +6,14 @@ // CHECK: amdaie.workgroup // CHECK: %[[tile_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[tile_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0) -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0) +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0, port_type = DMA) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL_0]]}) -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1) -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1) +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1, port_type = DMA) +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1, port_type = DMA) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_3]]}, %{{.+}} {%[[CHANNEL_2]]}) -// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2) -// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2) +// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2, port_type = DMA) +// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2, port_type = DMA) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_4]]}) module { func.func 
@assign_channels(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir index fba568fd1..48a470fdd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir @@ -7,9 +7,9 @@ // CHECK: %[[OBJ0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // CHECK: %[[OBJ1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[OBJ2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG2]] -// CHECK: amdaie.connection(%[[OBJ1]], %[[OBJ0]], connection_type = Circuit) -// CHECK: amdaie.connection(%[[OBJ0]], %[[OBJ1]], connection_type = Circuit) -// CHECK: amdaie.connection(%[[OBJ2]], %[[OBJ1]], connection_type = Circuit) +// CHECK: amdaie.connection(%[[OBJ1]], %[[OBJ0]]) {connection_type = #amdaie} +// CHECK: amdaie.connection(%[[OBJ0]], %[[OBJ1]]) {connection_type = #amdaie} +// CHECK: amdaie.connection(%[[OBJ2]], %[[OBJ1]]) {connection_type = #amdaie} // PACKET-LABEL: @assign_connection_types // PACKET-SAME: %[[ARG0:.+]]: memref<8x16xi32>, %[[ARG1:.+]]: memref<1x1x8x16xi32, 1>, %[[ARG2:.+]]: memref<1x1x8x16xi32, 2> @@ -17,9 +17,9 @@ // PACKET: %[[OBJ0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // PACKET: %[[OBJ1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // PACKET: %[[OBJ2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG2]] -// PACKET: amdaie.connection(%[[OBJ1]], %[[OBJ0]], connection_type = Packet) -// PACKET: amdaie.connection(%[[OBJ0]], %[[OBJ1]], connection_type = Packet) -// PACKET: amdaie.connection(%[[OBJ2]], %[[OBJ1]], connection_type = Packet) +// PACKET: amdaie.connection(%[[OBJ1]], %[[OBJ0]]) {connection_type = #amdaie} +// PACKET: amdaie.connection(%[[OBJ0]], %[[OBJ1]]) {connection_type = #amdaie} 
+// PACKET: amdaie.connection(%[[OBJ2]], %[[OBJ1]]) {connection_type = #amdaie} module { func.func @assign_connection_types(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir new file mode 100644 index 000000000..15e196d7c --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir @@ -0,0 +1,116 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-assign-packet-ids)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @assign_packet_ids(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @assign_packet_ids +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) +// CHECK: 
%[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 1, port_type = DMA) +// CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} +// CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 0 : ui8} +// CHECK: amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_3]]}) {is_packet_flow = true, packet_id = 1 : ui8} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_packet_ids(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %2 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = true} + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_packet_ids(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel = 
amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %5 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %6 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %7 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %8 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %9 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %10 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %11 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %12 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %13 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %14 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %15 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %16 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %17 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %18 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %19 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %20 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %21 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %22 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %23 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %24 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %25 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %26 = amdaie.flow({%channel} 
-> {%channel_1}) {is_packet_flow = true} + %27 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %28 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %29 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %30 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %31 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + // expected-error @+1 {{ran out of packet IDs to assign}} + %32 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + amdaie.controlcode { + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir new file mode 100644 index 000000000..74691c7ba --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir @@ -0,0 +1,44 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-connection-to-flow)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: @connection_to_flow +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) +// CHECK: %[[FLOW_0:.+]] = amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL]]}, flow = %[[FLOW_0]]) +// CHECK: %[[FLOW_1:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL]]}) 
{is_packet_flow = true} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_1]]) +// CHECK: %[[FLOW_2:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL_2]]}) {is_packet_flow = false} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_2]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_2]]) +module { + func.func @connection_to_flow(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_2} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.connection(%0 {%channel}, %1 {%channel_1}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.connection(%2 {%channel_2}, %1 {%channel_1}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir index 39be67374..8ca613178 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir @@ -1,15 +1,14 @@ // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-convert-core-forall-to-for,canonicalize)" --split-input-file %s | FileCheck %s // CHECK-LABEL: @test_single -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK-DAG: amdaie.core +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK-DAG: %[[REM:.+]] = arith.remsi %[[ARG0]], %[[C2]] : index -// CHECK-DAG: %[[DIV:.+]] = arith.divsi %[[ARG0]], %[[C2]] : index -// CHECK-DAG: func.call @callee(%[[DIV]], %[[REM]]) : (index, index) -> () +// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C2]], %[[C2]]) : index, index +// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> () module @test_single { func.func private @callee(%i: index, %j: index) %c0 = arith.constant 0 : index @@ -28,20 +27,18 @@ module @test_single { // ----- // CHECK-LABEL: @test_multi -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: amdaie.core +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] { 
-// CHECK-DAG: %[[REM:.+]] = arith.remsi %[[ARG0]], %[[C2]] : index -// CHECK-DAG: %[[DIV:.+]] = arith.divsi %[[ARG0]], %[[C2]] : index -// CHECK-DAG: func.call @callee(%[[DIV]], %[[REM]]) : (index, index) -> () +// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C2]], %[[C2]]) : index, index +// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> () // CHECK-DAG: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]] { -// CHECK-DAG: %[[REM1:.+]] = arith.remsi %[[ARG1]], %[[C4]] : index -// CHECK-DAG: %[[DIV1:.+]] = arith.divsi %[[ARG1]], %[[C4]] : index -// CHECK-DAG: func.call @callee(%[[DIV1]], %[[REM1]]) : (index, index) -> () +// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C4]], %[[C4]]) : index, index +// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> () module @test_multi { func.func private @callee(%i: index, %j: index) %c0 = arith.constant 0 : index @@ -63,19 +60,17 @@ module @test_multi { // ----- // CHECK-LABEL: @test_nested -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: amdaie.core +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C16]] step %[[C1]] { -// CHECK-DAG: %[[REM0:.+]] = arith.remsi %[[ARG0]], %[[C4]] : index -// CHECK-DAG: %[[DIV0:.+]] = arith.divsi %[[ARG0]], %[[C4]] : index +// CHECK-DAG: %[[D1:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C4]], %[[C4]]) : index, index // CHECK-DAG: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK-DAG: 
%[[REM1:.+]] = arith.remsi %[[ARG1]], %[[C2]] : index -// CHECK-DAG: %[[DIV1:.+]] = arith.divsi %[[ARG1]], %[[C2]] : index -// CHECK-DAG: func.call @callee(%[[DIV0]], %[[REM0]], %[[DIV1]], %[[REM1]]) : (index, index, index, index) -> () +// CHECK-DAG: %[[D2:.+]]:2 = affine.delinearize_index %[[ARG1]] into (%[[C2]], %[[C2]]) : index, index +// CHECK-DAG: func.call @callee(%[[D1]]#0, %[[D1]]#1, %[[D2]]#0, %[[D2]]#1) : (index, index, index, index) -> () module @test_nested { func.func private @callee(%i: index, %j: index, %k: index, %l: index) %c0 = arith.constant 0 : index diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir index cadeb4c5f..35ce94bf7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir @@ -62,12 +62,12 @@ module { // CHECK: %[[SECOND_LOOP:.*]]:2 = scf.for %[[IV0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]]) // CHECK: { // CHECK: %[[MATMUL:.*]] = linalg.generic +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[YIELD_MATMUL:.*]] = tensor.insert_slice %[[MATMUL]] into %[[ITER_ARG_1]] 
// CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_UNPACK]] @@ -159,12 +159,12 @@ module { // CHECK: arith.addi // CHECK: } // CHECK: %[[YIELD_MATMUL:.*]] = tensor.insert_slice %[[MATMUL]] into %[[ITER_ARG_1]] +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[YIELD_ELEM:.*]] = tensor.insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]] // CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_ELEM]], %[[YIELD_UNPACK]] diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir index fc84bee94..72b51ee59 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir @@ -59,15 +59,15 @@ module { // CHECK: %[[SECOND_LOOP:.*]]:2 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]]) // CHECK: { // CHECK: %[[MATMUL:.*]] = linalg.generic +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply 
#[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: } // CHECK: } // CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : @@ -153,16 +153,16 @@ module { // CHECK: { // CHECK: arith.addi // CHECK: } +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], 
%[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] -// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: } // CHECK: } // CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 49e52c6e1..c8341cade 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -202,11 +202,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = 
amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %3 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 1024] [32, 32] [64, 1]) + %4 = amdaie.npu.circular_dma_cpy_nd %3([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -281,11 +282,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { - %3 = amdaie.npu.circular_dma_cpy_nd %2([0, 0] [32, 32] [128, 1], [0, 1024] [32, 32] [64, 1]) + %4 = amdaie.npu.circular_dma_cpy_nd %3([0, 0] [32, 32] [128, 1], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -370,17 +372,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = 
amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_1, 1) - %channel_3 = amdaie.channel(%tile_0_2, 1) - %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %5 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %6 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1]) - %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1]) + %8 = amdaie.npu.circular_dma_cpy_nd %3([] [] [], [0, 0] [32, 32] [64, 1]) + %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -521,17 +525,19 @@ 
module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> - %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> - %channel_2 = amdaie.channel(%tile_0_1, 1) - %channel_3 = amdaie.channel(%tile_0_2, 1) - %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> + %5 = 
amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) amdaie.controlcode { - %6 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1]) - %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1]) + %8 = amdaie.npu.circular_dma_cpy_nd %3([] [] [], [0, 0] [32, 32] [64, 1]) + %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -602,15 +608,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %channel_2 = amdaie.channel(%tile_0_1, 0) - %channel_3 = amdaie.channel(%tile_0_2, 0) - %4 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + 
%4 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %5 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %6 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}, flow = %5) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %5 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [0, 1024] [32, 32] [64, 1]) - %6 = amdaie.npu.circular_dma_cpy_nd %4([] [] [], [0, 1024] [32, 32] [64, 1]) + %7 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [0, 1024] [32, 32] [64, 1]) + %8 = amdaie.npu.circular_dma_cpy_nd %6([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -640,15 +648,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_1 = amdaie.lock(%tile_0_1(1), 0) %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %4 = amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { - %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] []) - %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %5 = 
amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) + %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a source BD ID op to lower to the AIE dialect}} - %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo> + %7 = amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -679,13 +688,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_1 = amdaie.lock(%tile_0_1(1), 0) %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %4 = amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] []) - %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + %5 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) + %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 0, 0, 32] [2, 1, 2, 
32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -732,28 +742,30 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_1(1), 0) %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1) - %channel_3 = amdaie.channel(%tile_0_1, 1) - %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %9 = amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { 
- %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1]) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo> - %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12, MM2S) - %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13, MM2S) + %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) + %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) + %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14, MM2S) + %15 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15, MM2S) scf.forall (%arg0, %arg1) in (2, 1) { - %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14, S2MM) - %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15, S2MM) + %16 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + 
amdaie.npu.dma_wait(%16, S2MM) + %17 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%17, S2MM) } amdaie.end } @@ -797,27 +809,29 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_1(1), 0) %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1) - %channel_3 = amdaie.channel(%tile_0_1, 1) - %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %9 = 
amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1]) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo> - %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo> - %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12, MM2S) - %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13, MM2S) - %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14, S2MM) - %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15, S2MM) + %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) + %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) + %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo> + %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14, MM2S) + %15 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15, MM2S) + %16 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 
0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%16, S2MM) + %17 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%17, S2MM) amdaie.end } } @@ -1101,14 +1115,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_3}, {%lock}, {%lock_4}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6, %buffer_9, %buffer_10}, {%lock_7, %lock_11}, {%lock_8, %lock_12}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile, 0) - %channel_13 = amdaie.channel(%tile_0, 0) - %4 = amdaie.connection(%2 {%channel_13}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %channel_14 = amdaie.channel(%tile_0, 1) - %channel_15 = amdaie.channel(%tile_1, 0) - %channel_16 = amdaie.channel(%tile_2, 0) - %5 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) - %6 = amdaie.core(%tile_1, in : [%5], out : []) { + %channel = amdaie.channel(%tile, 0, port_type = DMA) + %channel_13 = amdaie.channel(%tile_0, 0, port_type = DMA) + %4 = amdaie.flow({%channel} -> {%channel_13}) {is_packet_flow = false} + %5 = amdaie.connection(%2 {%channel_13}, %1 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %channel_14 = amdaie.channel(%tile_0, 1, port_type = DMA) + %channel_15 = amdaie.channel(%tile_1, 0, port_type = DMA) + %channel_16 = amdaie.channel(%tile_2, 0, port_type = DMA) + %6 = 
amdaie.flow({%channel_14} -> {%channel_15, %channel_16}) {is_packet_flow = false} + %7 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}, flow = %6) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %8 = amdaie.core(%tile_1, in : [%7], out : []) { amdaie.use_lock(%lock_8, AcquireGreaterOrEqual(1)) %reinterpret_cast = memref.reinterpret_cast %buffer_5 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>) @@ -1119,7 +1135,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.use_lock(%lock_7, AcquireGreaterOrEqual(1)) amdaie.end } - %7 = amdaie.core(%tile_2, in : [%5], out : []) { + %9 = amdaie.core(%tile_2, in : [%7], out : []) { amdaie.use_lock(%lock_12, AcquireGreaterOrEqual(1)) %reinterpret_cast = memref.reinterpret_cast %buffer_9 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>) @@ -1131,11 +1147,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.end } amdaie.controlcode { - %8 = amdaie.npu.circular_dma_cpy_nd %4([0, 0] [64, 64] [32, 1], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [64, 64] [32, 1]) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%11, MM2S) + %10 = amdaie.npu.circular_dma_cpy_nd %5([0, 0] [64, 64] [32, 1], [] [] []) + %11 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [64, 64] [32, 1]) + %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %13 = 
amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13, MM2S) amdaie.end } } diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index a53b0342a..2d23678d4 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -427,6 +427,10 @@ uint32_t AMDAIEDeviceModel::getColumnShift() const { uint32_t AMDAIEDeviceModel::getRowShift() const { return configPtr.RowShift; } +uint8_t AMDAIEDeviceModel::getPacketIdMaxIdx() const { + return deviceConfig.packetIdMaxIdx; +} + uint8_t AMDAIEDeviceModel::getStreamSwitchArbiterMax(uint8_t col, uint8_t row) const { assert(isCoreTile(col, row) || isMemTile(col, row) || isShimTile(col, row)); @@ -475,6 +479,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { switch (device) { case AMDAIEDevice::xcvc1902: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIE1_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE1_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE1_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIE1_SS_ARBITER_MAX; @@ -498,6 +503,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2302: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIEML_SS_ARBITER_MAX; @@ -520,6 +526,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2802: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = 
XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIEML_SS_ARBITER_MAX; @@ -546,6 +553,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { case AMDAIEDevice::npu1_3col: case AMDAIEDevice::npu1_4col: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIE2IPU_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE2IPU_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE2IPU_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIE2IPU_SS_ARBITER_MAX; @@ -591,6 +599,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::npu4: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIE_STRIXB0_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE_STRIXB0_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE_STRIXB0_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIE_STRIXB0_SS_ARBITER_MAX; diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index 4a21df60a..de8c855b7 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -222,6 +222,7 @@ struct AMDAIEDeviceModel { /// aie-rt for whatever reason. Make sure the parameters can't be retrieved in /// another way before adding new fields to this struct. struct AMDAIEDeviceConfig { + uint8_t packetIdMaxIdx{0}; /// Currently, the max arbiter/msel is hidden inside aie-rt. 
uint8_t streamSwitchCoreArbiterMax{0}; uint8_t streamSwitchCoreMSelMax{0}; @@ -317,6 +318,8 @@ struct AMDAIEDeviceModel { uint32_t getColumnShift() const; uint32_t getRowShift() const; + uint8_t getPacketIdMaxIdx() const; + uint8_t getStreamSwitchArbiterMax(uint8_t col, uint8_t row) const; uint8_t getStreamSwitchMSelMax(uint8_t col, uint8_t row) const; diff --git a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c index 8e5185d7a..515491ef0 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c +++ b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c @@ -5,6 +5,7 @@ // SPDX-License-Identifier: # Apache-2.0 WITH LLVM-exception #include "xaiengine/xaiegbl_defs.h" +#include "xaiengine/xaiegbl.h" #undef s8 #undef u8 #undef u16 @@ -24,6 +25,7 @@ const uint64_t XAIE1_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIE1_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIE1_NUM_ROWS = XAIE_NUM_ROWS; const int XAIE1_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIE1_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIE1_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; const int XAIE1_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; const int XAIE1_TILE_DMA_NUM_CH = XAIE_TILE_DMA_NUM_CH; @@ -81,6 +83,7 @@ const uint64_t XAIEML_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIEML_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIEML_NUM_ROWS = XAIE_NUM_ROWS; const int XAIEML_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIEML_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIEML_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; const int XAIEML_MEM_TILE_NUM_LOCKS = XAIE_MEM_TILE_NUM_LOCKS; const int XAIEML_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; @@ -140,6 +143,7 @@ const uint64_t XAIE2IPU_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIE2IPU_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIE2IPU_NUM_ROWS = XAIE_NUM_ROWS; const int XAIE2IPU_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIE2IPU_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIE2IPU_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; 
const int XAIE2IPU_MEM_TILE_NUM_LOCKS = XAIE_MEM_TILE_NUM_LOCKS; const int XAIE2IPU_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; @@ -198,6 +202,7 @@ const uint64_t XAIE_STRIXB0_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIE_STRIXB0_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIE_STRIXB0_NUM_ROWS = XAIE_NUM_ROWS; const int XAIE_STRIXB0_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIE_STRIXB0_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIE_STRIXB0_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; const int XAIE_STRIXB0_MEM_TILE_NUM_LOCKS = XAIE_MEM_TILE_NUM_LOCKS; const int XAIE_STRIXB0_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; diff --git a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h index 5d9bf834d..d5db882f6 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h +++ b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h @@ -13,6 +13,7 @@ extern uint8_t XAIE1_AIE_TILE_NUM_ROWS; extern uint8_t XAIE1_AIE_TILE_ROW_START; extern uint8_t XAIE1_COL_SHIFT; +extern uint8_t XAIE1_PACKET_ID_MAX; extern uint8_t XAIE1_MEM_TILE_DMA_NUM_CH; extern uint8_t XAIE1_MEM_TILE_NUM_LOCKS; extern uint8_t XAIE1_MEM_TILE_NUM_ROWS; @@ -45,6 +46,7 @@ extern uint8_t XAIEML_MEM_TILE_ROW_START; extern uint8_t XAIEML_NUM_COLS; extern uint8_t XAIEML_NUM_NOC_INTR_OFFSET; extern uint8_t XAIEML_NUM_ROWS; +extern uint8_t XAIEML_PACKET_ID_MAX; extern uint8_t XAIEML_ROW_SHIFT; extern uint8_t XAIEML_SHIM_DMA_NUM_CH; extern uint8_t XAIEML_SHIM_NUM_LOCKS; @@ -70,6 +72,7 @@ extern uint8_t XAIE2IPU_MEM_TILE_ROW_START; extern uint8_t XAIE2IPU_NUM_COLS; extern uint8_t XAIE2IPU_NUM_NOC_INTR_OFFSET; extern uint8_t XAIE2IPU_NUM_ROWS; +extern uint8_t XAIE2IPU_PACKET_ID_MAX; extern uint8_t XAIE2IPU_ROW_SHIFT; extern uint8_t XAIE2IPU_SHIM_DMA_NUM_CH; extern uint8_t XAIE2IPU_SHIM_NUM_LOCKS; @@ -100,6 +103,7 @@ extern uint8_t XAIE_STRIXB0_MEM_TILE_ROW_START; extern uint8_t XAIE_STRIXB0_NUM_COLS; extern uint8_t XAIE_STRIXB0_NUM_NOC_INTR_OFFSET; extern uint8_t 
XAIE_STRIXB0_NUM_ROWS; +extern uint8_t XAIE_STRIXB0_PACKET_ID_MAX; extern uint8_t XAIE_STRIXB0_ROW_SHIFT; extern uint8_t XAIE_STRIXB0_SHIM_DMA_NUM_CH; extern uint8_t XAIE_STRIXB0_SHIM_NUM_LOCKS; diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc index 770527e93..268614046 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc @@ -340,8 +340,9 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch( std::vector bos; // TODO(max): do we need multiple descriptor sets ever for AIE? uint32_t set = 0; - iree_hal_xrt_direct_command_buffer_push_descriptor_set( - base_command_buffer, set, bindings.count, bindings.values); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_direct_command_buffer_push_descriptor_set( + base_command_buffer, set, bindings.count, bindings.values)); for (iree_host_size_t j = 0; j < bindings.count; ++j) { xrt::bo arg_buffer = xrt::bo(*command_buffer->descriptor_sets[set].bindings[j], diff --git a/tests/conftest.py b/tests/conftest.py index 3a2a5f76b..3bc6d4daa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ from iree.compiler import ir from iree.compiler._mlir_libs import get_dialect_registry -from iree.compiler.api import Session, Output, Source +from iree.compiler.api import Session, Output, Source, _initializeGlobalCL from iree.compiler.extras import types as T from iree.runtime import VmModule from iree.runtime import get_driver, Config, SystemContext @@ -49,8 +49,16 @@ def pytest_addoption(parser): parser.addoption("--iree-aie-debug", action="store_true") +@pytest.fixture(scope="session") +def global_cl_args(request): + _initializeGlobalCL( + "--iree-hal-memoization=false", + "--iree-hal-indirect-command-buffers=false", + ) + + @pytest.fixture -def iree_session(request, pytestconfig) -> Session: +def iree_session(request, pytestconfig, 
global_cl_args) -> Session: s = Session() s.context.append_dialect_registry(get_dialect_registry()) s.context.load_all_available_dialects() diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 598bfdcab..665ea2ca5 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -147,8 +147,5 @@ def test_matmul( arg0 = np.random.randint(-1, 3, (M, K), dtype=lhs_rhs_type) arg1 = np.random.randint(-1, 3, (K, N), dtype=lhs_rhs_type) with invokable_module(session, module, device) as module: - for i in range(num_repeat_runs): - results = module[matmul_name](arg0, arg1).to_host() - assert np.array_equal( - results, (arg0.astype(acc_type) @ arg1.astype(acc_type)) - ) + results = module[matmul_name](arg0, arg1).to_host() + assert np.array_equal(results, (arg0.astype(acc_type) @ arg1.astype(acc_type))) diff --git a/third_party/iree b/third_party/iree index d55785dc6..20a7638c1 160000 --- a/third_party/iree +++ b/third_party/iree @@ -1 +1 @@ -Subproject commit d55785dc6affb9c0ffe787e0dc0e8f6f32a73da2 +Subproject commit 20a7638c1584c98e1b2442a011c546f5d471631d