From fe0a3fe32523cd14a6278ad49b794dcfea8d5ed0 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 2 Oct 2024 05:00:28 +0530 Subject: [PATCH 1/2] IREE Bump to 18th Sep, 2024 (#788) --- build_tools/ci/cpu_comparison/run.py | 2 + build_tools/ci/run_matmul_test.sh | 4 ++ .../AMDAIEConvertCoreForallToFor.cpp | 7 ++- .../Transforms/AMDAIETileAndFuse.cpp | 11 ++-- .../test/convert_core_forall_to_for.mlir | 51 +++++++++---------- .../test/fuse_consumer_into_loop_scf_for.mlir | 8 +-- .../fuse_consumer_into_loop_scf_forall.mlir | 14 ++--- .../driver/xrt/direct_command_buffer.cc | 5 +- tests/conftest.py | 12 ++++- tests/test_matmul.py | 7 +-- third_party/iree | 2 +- 11 files changed, 66 insertions(+), 57 deletions(-) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 3b07d4512..7a45f9454 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -142,6 +142,8 @@ def generate_aie_vmfb( f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}", f"--iree-hal-dump-executable-files-to={config.output_dir}", "--iree-scheduling-optimize-bindings=false", + "--iree-hal-memoization=false", + "--iree-hal-indirect-command-buffers=false", f"--mlir-disable-threading", "--mlir-elide-resource-strings-if-larger=10", ] diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index fb4b01a77..50649e8fd 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -405,6 +405,8 @@ function run_matmul_test() { --iree-amd-aie-enable-chess=${use_chess} \ --iree-amdaie-enable-packet-flow=${enable_packet_flow} \ --iree-hal-dump-executable-files-to=$PWD \ + --iree-hal-memoization=false \ + --iree-hal-indirect-command-buffers=false \ --mlir-elide-resource-strings-if-larger=10 \ --iree-amd-aie-show-invoked-commands" @@ -416,6 +418,8 @@ function run_matmul_test() { set +e echo "**** Generating matmul .vmfb file for ${name} ****" + ${IREE_COMPILE_EXE} "${matmul_ir}" \ + 
${compilation_flags} --compile-to=vm -o "${matmul_vmfb}.vm" ${IREE_COMPILE_EXE} "${matmul_ir}" \ ${compilation_flags} -o "${matmul_vmfb}" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp index 0c2a23ff5..1120b3d1c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp @@ -27,8 +27,7 @@ namespace { /// Converts `scf.forall` into nested `scf.for` and then coalesce the `scf.for` /// loops. -LogicalResult coreForallToFor(RewriterBase &rewriter, - AMDAIE::CoreOp coreOp) { +LogicalResult coreForallToFor(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) { WalkResult res = coreOp->walk([&](scf::ForallOp forallOp) { SmallVector forOpResults; if (failed(scf::forallToForLoop(rewriter, forallOp, &forOpResults))) { @@ -55,12 +54,12 @@ class AMDAIEConvertCoreForallToForPass AMDAIEConvertCoreForallToForPass> { public: void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } AMDAIEConvertCoreForallToForPass() = default; AMDAIEConvertCoreForallToForPass( - const AMDAIEConvertCoreForallToForPass &pass){}; + const AMDAIEConvertCoreForallToForPass &pass) {}; void runOnOperation() override; }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index 727047874..1a583444d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -343,13 +343,15 @@ void AMDAIETileAndFusePass::runOnOperation() { if (isTilingReductionDimension(consumerOp, tileSizesVal)) { tileAndFuseOptions.setFusionControlFn( [&](tensor::ExtractSliceOp 
sliceOp, OpResult originalProducer, - bool isDestinationOperand) -> std::tuple { - return {false, false}; + bool isDestinationOperand) + -> std::optional { + return std::nullopt; }); } else { tileAndFuseOptions.setFusionControlFn( [&](tensor::ExtractSliceOp sliceOp, OpResult originalProducer, - bool isDestinationOperand) -> std::tuple { + bool isDestinationOperand) + -> std::optional { bool fusableOp = TypeSwitch(originalProducer.getOwner()) // List ops that shouldnt be fused. @@ -360,7 +362,8 @@ void AMDAIETileAndFusePass::runOnOperation() { return op->getDialect() == context->getLoadedDialect(); }); - return {fusableOp, false}; + if (!fusableOp) return std::nullopt; + return scf::SCFTileAndFuseOptions::ControlFnResult{false}; }); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir index 39be67374..8ca613178 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_core_forall_to_for.mlir @@ -1,15 +1,14 @@ // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-convert-core-forall-to-for,canonicalize)" --split-input-file %s | FileCheck %s // CHECK-LABEL: @test_single -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK-DAG: amdaie.core +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK-DAG: %[[REM:.+]] = arith.remsi %[[ARG0]], %[[C2]] : index -// CHECK-DAG: %[[DIV:.+]] = arith.divsi 
%[[ARG0]], %[[C2]] : index -// CHECK-DAG: func.call @callee(%[[DIV]], %[[REM]]) : (index, index) -> () +// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C2]], %[[C2]]) : index, index +// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> () module @test_single { func.func private @callee(%i: index, %j: index) %c0 = arith.constant 0 : index @@ -28,20 +27,18 @@ module @test_single { // ----- // CHECK-LABEL: @test_multi -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: amdaie.core +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK-DAG: %[[REM:.+]] = arith.remsi %[[ARG0]], %[[C2]] : index -// CHECK-DAG: %[[DIV:.+]] = arith.divsi %[[ARG0]], %[[C2]] : index -// CHECK-DAG: func.call @callee(%[[DIV]], %[[REM]]) : (index, index) -> () +// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C2]], %[[C2]]) : index, index +// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> () // CHECK-DAG: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]] { -// CHECK-DAG: %[[REM1:.+]] = arith.remsi %[[ARG1]], %[[C4]] : index -// CHECK-DAG: %[[DIV1:.+]] = arith.divsi %[[ARG1]], %[[C4]] : index -// CHECK-DAG: func.call @callee(%[[DIV1]], %[[REM1]]) : (index, index) -> () +// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C4]], %[[C4]]) : index, index +// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> () module @test_multi { func.func private @callee(%i: 
index, %j: index) %c0 = arith.constant 0 : index @@ -63,19 +60,17 @@ module @test_multi { // ----- // CHECK-LABEL: @test_nested -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: amdaie.core +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C16]] step %[[C1]] { -// CHECK-DAG: %[[REM0:.+]] = arith.remsi %[[ARG0]], %[[C4]] : index -// CHECK-DAG: %[[DIV0:.+]] = arith.divsi %[[ARG0]], %[[C4]] : index +// CHECK-DAG: %[[D1:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C4]], %[[C4]]) : index, index // CHECK-DAG: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK-DAG: %[[REM1:.+]] = arith.remsi %[[ARG1]], %[[C2]] : index -// CHECK-DAG: %[[DIV1:.+]] = arith.divsi %[[ARG1]], %[[C2]] : index -// CHECK-DAG: func.call @callee(%[[DIV0]], %[[REM0]], %[[DIV1]], %[[REM1]]) : (index, index, index, index) -> () +// CHECK-DAG: %[[D2:.+]]:2 = affine.delinearize_index %[[ARG1]] into (%[[C2]], %[[C2]]) : index, index +// CHECK-DAG: func.call @callee(%[[D1]]#0, %[[D1]]#1, %[[D2]]#0, %[[D2]]#1) : (index, index, index, index) -> () module @test_nested { func.func private @callee(%i: index, %j: index, %k: index, %l: index) %c0 = arith.constant 0 : index diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir index cadeb4c5f..35ce94bf7 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir @@ -62,12 +62,12 @@ module { // CHECK: %[[SECOND_LOOP:.*]]:2 = scf.for %[[IV0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]]) // CHECK: { // CHECK: %[[MATMUL:.*]] = linalg.generic +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[YIELD_MATMUL:.*]] = tensor.insert_slice %[[MATMUL]] into %[[ITER_ARG_1]] // CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_UNPACK]] @@ -159,12 +159,12 @@ module { // CHECK: arith.addi // CHECK: } // CHECK: %[[YIELD_MATMUL:.*]] = tensor.insert_slice %[[MATMUL]] into %[[ITER_ARG_1]] +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply 
#[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]]) // CHECK: %[[YIELD_ELEM:.*]] = tensor.insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]] // CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_ELEM]], %[[YIELD_UNPACK]] diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir index fc84bee94..72b51ee59 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir @@ -59,15 +59,15 @@ module { // CHECK: %[[SECOND_LOOP:.*]]:2 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]]) // CHECK: { // CHECK: %[[MATMUL:.*]] = linalg.generic +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] 
[1, 1, 1, 1, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: } // CHECK: } // CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : @@ -153,16 +153,16 @@ module { // CHECK: { // CHECK: arith.addi // CHECK: } +// CHECK: affine.apply +// CHECK: affine.apply // CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) // CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] -// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]]) -// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]]) // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] -// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1] // CHECK: } // CHECK: } // CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : diff --git 
a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc index 770527e93..268614046 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc @@ -340,8 +340,9 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch( std::vector bos; // TODO(max): do we need multiple descriptor sets ever for AIE? uint32_t set = 0; - iree_hal_xrt_direct_command_buffer_push_descriptor_set( - base_command_buffer, set, bindings.count, bindings.values); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_direct_command_buffer_push_descriptor_set( + base_command_buffer, set, bindings.count, bindings.values)); for (iree_host_size_t j = 0; j < bindings.count; ++j) { xrt::bo arg_buffer = xrt::bo(*command_buffer->descriptor_sets[set].bindings[j], diff --git a/tests/conftest.py b/tests/conftest.py index 3a2a5f76b..3bc6d4daa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ from iree.compiler import ir from iree.compiler._mlir_libs import get_dialect_registry -from iree.compiler.api import Session, Output, Source +from iree.compiler.api import Session, Output, Source, _initializeGlobalCL from iree.compiler.extras import types as T from iree.runtime import VmModule from iree.runtime import get_driver, Config, SystemContext @@ -49,8 +49,16 @@ def pytest_addoption(parser): parser.addoption("--iree-aie-debug", action="store_true") +@pytest.fixture(scope="session") +def global_cl_args(request): + _initializeGlobalCL( + "--iree-hal-memoization=false", + "--iree-hal-indirect-command-buffers=false", + ) + + @pytest.fixture -def iree_session(request, pytestconfig) -> Session: +def iree_session(request, pytestconfig, global_cl_args) -> Session: s = Session() s.context.append_dialect_registry(get_dialect_registry()) s.context.load_all_available_dialects() diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 
598bfdcab..665ea2ca5 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -147,8 +147,5 @@ def test_matmul( arg0 = np.random.randint(-1, 3, (M, K), dtype=lhs_rhs_type) arg1 = np.random.randint(-1, 3, (K, N), dtype=lhs_rhs_type) with invokable_module(session, module, device) as module: - for i in range(num_repeat_runs): - results = module[matmul_name](arg0, arg1).to_host() - assert np.array_equal( - results, (arg0.astype(acc_type) @ arg1.astype(acc_type)) - ) + results = module[matmul_name](arg0, arg1).to_host() + assert np.array_equal(results, (arg0.astype(acc_type) @ arg1.astype(acc_type))) diff --git a/third_party/iree b/third_party/iree index d55785dc6..20a7638c1 160000 --- a/third_party/iree +++ b/third_party/iree @@ -1 +1 @@ -Subproject commit d55785dc6affb9c0ffe787e0dc0e8f6f32a73da2 +Subproject commit 20a7638c1584c98e1b2442a011c546f5d471631d From b7f8fc4a39bb7202dc8e1cbddad028791eb3bb64 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Wed, 2 Oct 2024 18:21:56 +0200 Subject: [PATCH 2/2] Add `amdaie.flow` for packet flow lowering (#815) This PR introduces the `amdaie.flow` operation (for consistency with `aie.flow`). This operation represents connections between ports/channels on tiles in the array and is lowered to `aie.flow` for routing. The idea is to lower `amdaie.connection` to `amdaie.flow` + DMA operations; however, this PR goes halfway by creating `amdaie.flow` operations, while still keeping a reference inside `amdaie.connection` as the latter is still needed inside `LowerToAIE` for lowering to the AIE dialect DMA operations. This will be cleaned up in the future, but for now, this enables me to start adding routes for control packets in conjunction with data routes for https://github.com/nod-ai/iree-amd-aie/issues/714. 
--- .../AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td | 4 + .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp | 21 +- .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td | 53 ++++- .../iree-amd-aie/IR/test/roundtrip.mlir | 23 ++ .../samples/matmul_pack_peel_objectfifo.mlir | 2 +- .../Transforms/AMDAIEAssignChannels.cpp | 6 +- .../AMDAIEAssignConnectionTypes.cpp | 2 +- .../Transforms/AMDAIEAssignPacketIds.cpp | 67 ++++++ .../Transforms/AMDAIEConnectionToFlow.cpp | 57 +++++ .../Transforms/AMDAIELowerToAIE.cpp | 93 +++++--- .../Transforms/AMDAIELowerToAIE.h | 9 +- .../iree-amd-aie/Transforms/CMakeLists.txt | 2 + .../iree-amd-aie/Transforms/PassDetail.h | 2 + .../iree-amd-aie/Transforms/Passes.cpp | 2 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 6 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 12 + .../Transforms/test/CMakeLists.txt | 2 + .../Transforms/test/assign_channels.mlir | 12 +- .../test/assign_connection_types.mlir | 12 +- .../Transforms/test/assign_packet_ids.mlir | 116 +++++++++ .../Transforms/test/connection_to_flow.mlir | 44 ++++ .../Transforms/test/lower_to_aie.mlir | 220 ++++++++++-------- .../aie_runtime/iree_aie_runtime.cc | 9 + .../aie_runtime/iree_aie_runtime.h | 3 + .../src/iree-amd-aie/aie_runtime/xaie_hwcfg.c | 5 + .../src/iree-amd-aie/aie_runtime/xaie_hwcfg.h | 4 + 26 files changed, 622 insertions(+), 166 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignPacketIds.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConnectionToFlow.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td index 109715633..9b38f3bfa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td @@ -19,8 +19,12 @@ def AMDAIE_ConnectionType: I32EnumAttr<"ConnectionType", ] > { let cppNamespace = "mlir::iree_compiler::AMDAIE"; + let genSpecializedAttr = 0; } +def AMDAIE_ConnectionTypeAttr + : EnumAttr; + def AMDAIE_CopyOpOperateOn: I32EnumAttr<"CopyOpOperateOn", "Enables templated functions that operate on either source or target of " "copy/dma operations", diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index d10f44458..6d6d99aee 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -445,13 +445,14 @@ void CircularDmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, void ConnectionOp::build(mlir::OpBuilder &b, mlir::OperationState &result, Value target, Value source) { - build(b, result, target, {}, source, {}, nullptr); + build(b, result, target, {}, source, {}, nullptr, nullptr); } void ConnectionOp::build(mlir::OpBuilder &b, mlir::OperationState &result, Value target, ValueRange targetChannels, Value source, ValueRange sourceChannels) { - build(b, result, target, targetChannels, source, sourceChannels, nullptr); + build(b, result, target, targetChannels, source, sourceChannels, nullptr, + nullptr); } FailureOr @@ -469,6 +470,22 @@ ConnectionOp::getNpuCircularDmaCpyNdUser() { return npuDmaUsers[0]; } +std::optional ConnectionOp::getFlowOp() { + return dyn_cast_if_present(getFlow().getDefiningOp()); +} + +//===----------------------------------------------------------------------===// +// AMDAIE_FlowOp +//===----------------------------------------------------------------------===// + +LogicalResult FlowOp::verify() { + if (getSources().size() > 1 && getTargets().size() > 1) { + return emitOpError() + << "multiple source and multiple targets is unsupported"; + } + return success(); +} + 
//===----------------------------------------------------------------------===// // AMDAIE_LockOp //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index f14449ccd..324424ef0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -91,6 +91,42 @@ def AMDAIE_EndOp: AMDAIE_Op<"end", [Terminator]> { let assemblyFormat = [{ attr-dict }]; } +def AMDAIE_FlowOp: AMDAIE_Op<"flow", [AttrSizedOperandSegments]>, + Results<(outs Index)> { + let summary = "The data connection between a set of source and target " + "channels."; + let description = [{ + This operation represents a connection between source and target channels. + This is used to describe a logical data routing configuration between + channels, to be solved by the router for actual stream switch + configurations that implement it. The multiple sources and targets can + describe different connection patterns: + - Single source and multiple targets describes a data broadcasting pattern. + - Multiple sources and single target describes a data merge pattern. + - Multiple sources and multiple targets is not supported. 
+ + Example: + + ```mlir + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_1_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1, %channel_2}) + {is_packet_flow = true, packet_id = 0 : ui8} + ``` + }]; + + let arguments = ( + ins Variadic:$sources, + Variadic:$targets, + BoolAttr:$is_packet_flow, + OptionalAttr:$packet_id + ); + + let assemblyFormat = [{ `(` `{` $sources `}` `->` `{` $targets `}` `)` attr-dict }]; + let hasVerifier = 1; +} + def AMDAIE_TileOp: AMDAIE_Op<"tile", [ Pure, DeclareOpInterfaceMethods @@ -319,20 +355,23 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ ```mlir %tile = amdaie.tile(%c0, %c0) - %channel = amdaie.channel(%tile, 0) + %channel = amdaie.channel(%tile, 0, port_type = DMA) ``` }]; let arguments = ( ins Index:$tile, - ConfinedAttr]>:$value + ConfinedAttr]>:$value, + StrmSwPortTypeAttr:$port_type ); let extraClassDeclaration = [{ TileOp getTileOp(); }]; - let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }]; + let assemblyFormat = [{ + `(` $tile `,` $value `,` `port_type` `=` $port_type `)` attr-dict + }]; } //===----------------------------------------------------------------------===// @@ -733,7 +772,8 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection", Variadic:$target_channels, AnyAMDAIELogicalObjectFifoType:$source, Variadic:$source_channels, - OptionalAttr:$connection_type + OptionalAttr:$connection_type, + Optional:$flow ); let results = (outs Index:$result); @@ -745,8 +785,8 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection", `,` $source ( ` ` `{` $source_channels^ `}` )? - ( `,` `connection_type` `=` $connection_type^ )? - `)` + ( `,` `flow` `=` $flow^ )? 
+ `)` attr-dict `:` `(` type($target) `,` type($source) `)` }]; @@ -758,6 +798,7 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection", ]; let extraClassDeclaration = [{ + std::optional getFlowOp(); Value getSourceMemref() { return getSource(); } Value getTargetMemref() { return getTarget(); } Type getSourceType() { return getSource().getType(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 47054c91f..edefe21ab 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -148,6 +148,29 @@ func.func @dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo {%[[CHANNEL_1]]}) {is_packet_flow = false} +// CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 1 : ui8} +func.func @flow() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true, packet_id = 1 : ui8} + return +} + +// ----- + // CHECK-LABEL: func.func @lock // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index ec8c43482..59da53759 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into 
Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 
region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp index 40e9c9a4b..d0440336b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp @@ -45,20 +45,20 @@ LogicalResult 
assignChannels(AMDAIE::WorkgroupOp workgroupOp) { for (Value tile : sourceLogicalObjFifo.getTiles()) { uint8_t channel = generator.getProducerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); sourceChannels.push_back(channelOp.getResult()); } SmallVector targetChannels; for (Value tile : targetLogicalObjFifo.getTiles()) { uint8_t channel = generator.getConsumerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); targetChannels.push_back(channelOp.getResult()); } rewriter.replaceOpWithNewOp( connectionOp, connectionOp.getTarget(), targetChannels, connectionOp.getSource(), sourceChannels, - connectionOp.getConnectionTypeAttr()); + connectionOp.getConnectionTypeAttr(), /*flow*/ nullptr); } return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp index 7c131ec61..e475e7ca2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignConnectionTypes.cpp @@ -43,7 +43,7 @@ void AMDAIEAssignConnectionTypesPass::runOnOperation() { rewriter.replaceOpWithNewOp( connectionOp, connectionOp.getTarget(), connectionOp.getTargetChannels(), connectionOp.getSource(), - connectionOp.getSourceChannels(), connectionTypeAttr); + connectionOp.getSourceChannels(), connectionTypeAttr, /*flow*/ nullptr); return WalkResult::advance(); }); if (res.wasInterrupted()) return signalPassFailure(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignPacketIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignPacketIds.cpp new file mode 100644 index 000000000..72086f0f3 
--- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignPacketIds.cpp @@ -0,0 +1,71 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" + +#define DEBUG_TYPE "iree-amdaie-assign-packet-ids" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +class AMDAIEAssignPacketIdsPass + : public impl::AMDAIEAssignPacketIdsBase { + public: + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +/// Assign consecutive packet ids, from 0, to packet `amdaie.flow` ops. +void AMDAIEAssignPacketIdsPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to assign packet IDs " + "within the resource constraints"; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + auto ui8ty = + IntegerType::get(rewriter.getContext(), 8, IntegerType::Unsigned); + int pktFlowIndex{0}; + WalkResult res = parentOp->walk([&](AMDAIE::FlowOp flowOp) { + // Only packet flows consume an ID, so a circuit flow must never trip the + // exhaustion check. + if (flowOp.getIsPacketFlow() && + pktFlowIndex > deviceModel.getPacketIdMaxIdx()) { + flowOp.emitOpError() << "ran out of packet IDs to assign"; + return WalkResult::interrupt(); + } + rewriter.setInsertionPoint(flowOp); + IntegerAttr pktIdAttr = flowOp.getIsPacketFlow() + ?
IntegerAttr::get(ui8ty, pktFlowIndex++) + : nullptr; + rewriter.replaceOpWithNewOp( + flowOp, flowOp.getSources(), flowOp.getTargets(), + flowOp.getIsPacketFlow(), pktIdAttr); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEAssignPacketIdsPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConnectionToFlow.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConnectionToFlow.cpp new file mode 100644 index 000000000..4d0915e5c --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConnectionToFlow.cpp @@ -0,0 +1,57 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" + +#define DEBUG_TYPE "iree-amdaie-connection-to-flow" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +class AMDAIEConnectionToFlowPass + : public impl::AMDAIEConnectionToFlowBase { + public: + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEConnectionToFlowPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + // TODO(jornt): currently, don't delete connections as they are still + // needed for lowering to AIE dialect dma_bds. This will be changed in the + // future.
+ WalkResult res = parentOp->walk([&](AMDAIE::ConnectionOp connectionOp) { + rewriter.setInsertionPoint(connectionOp); + std::optional connectionType = + connectionOp.getConnectionType(); + bool isPacketFlow = connectionType && connectionType.value() == + AMDAIE::ConnectionType::Packet; + auto flowOp = rewriter.create( + rewriter.getUnknownLoc(), connectionOp.getSourceChannels(), + connectionOp.getTargetChannels(), isPacketFlow, /*packetId*/ nullptr); + rewriter.replaceOpWithNewOp( + connectionOp, connectionOp.getTarget(), + connectionOp.getTargetChannels(), connectionOp.getSource(), + connectionOp.getSourceChannels(), connectionOp.getConnectionTypeAttr(), + flowOp); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEConnectionToFlowPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index db1cefb14..4d361392d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -100,7 +100,7 @@ void AIEDeviceBuilder::createDMA( AIE::BDDimLayoutArrayAttr dims, size_t acqNum, size_t relNum, int64_t len, int64_t offset, const SmallVector &bufferOps, const std::pair &locks, - AIE::PacketFlowOp pktFlowOp) { + std::optional pktId) { OpBuilder::InsertionGuard g(rewriter); Block &endBlock = memOp->getRegion(0).getBlocks().back(); assert(!endBlock.getOps().empty() && @@ -123,10 +123,10 @@ void AIEDeviceBuilder::createDMA( acqNum); // Insert a packet op for MM2S DMAs if part of a packet flow. Only do this // for MM2S DMA ports as only those can insert packet headers. 
- if (channelDir == AIE::DMAChannelDir::MM2S && pktFlowOp) { + if (channelDir == AIE::DMAChannelDir::MM2S && pktId) { rewriter.create(rewriter.getUnknownLoc(), /*pkt_type*/ 0, - /*pkt_id*/ pktFlowOp.getID()); + /*pkt_id*/ pktId.value()); } if (!dims.getValue().empty()) { rewriter.create(rewriter.getUnknownLoc(), buff, offset, len, @@ -155,32 +155,29 @@ void AIEDeviceBuilder::createDMA( } SmallVector AIEDeviceBuilder::createFlowOps( - AMDAIE::ConnectionOp connectionOp, - ArrayRef producerChannels, + AMDAIE::FlowOp flowOp, ArrayRef producerChannels, ArrayRef consumerChannels) { LLVM_DEBUG(llvm::dbgs() << "-- createFlowOps\n"); OpBuilder::InsertionGuard g(rewriter); SmallVector flowOps; for (AMDAIE::ChannelOp producerChannel : producerChannels) { Value aieProducerTile = mapper.lookup(producerChannel.getTile()); - std::optional connectionType = - connectionOp.getConnectionType(); - if (connectionType && - connectionType.value() == AMDAIE::ConnectionType::Packet) { + std::optional pktId = flowOp.getPacketId(); + if (pktId) { OpBuilder::InsertionGuard gg(rewriter); AIE::PacketFlowOp pktFlow = rewriter.create( - rewriter.getUnknownLoc(), pktFlowIndex++, nullptr, nullptr); + rewriter.getUnknownLoc(), pktId.value(), nullptr, nullptr); Region &r_pktFlow = pktFlow.getPorts(); Block *b_pktFlow = rewriter.createBlock(&r_pktFlow); rewriter.setInsertionPointToStart(b_pktFlow); rewriter.create( - rewriter.getUnknownLoc(), aieProducerTile, AIE::WireBundle::DMA, - producerChannel.getValue()); + rewriter.getUnknownLoc(), aieProducerTile, + producerChannel.getPortType(), producerChannel.getValue()); for (AMDAIE::ChannelOp consumerChannel : consumerChannels) { Value aieConsumerTile = mapper.lookup(consumerChannel.getTile()); rewriter.create( - rewriter.getUnknownLoc(), aieConsumerTile, AIE::WireBundle::DMA, - consumerChannel.getValue()); + rewriter.getUnknownLoc(), aieConsumerTile, + consumerChannel.getPortType(), consumerChannel.getValue()); } 
rewriter.create(rewriter.getUnknownLoc()); flowOps.push_back(pktFlow.getOperation()); @@ -436,12 +433,11 @@ LogicalResult AIEDeviceBuilder::npuDmaCpyNdOpToAIE( memOps = connectionToSourceTargetMemOps[connectionOp].first; // Set the packet info attribute for MM2S DMAs, operating on a packet flow // connection. - SmallVector flowOps = connectionToFlowOps.at(connectionOp); - if (flowOps.size() == 1 && isa(flowOps[0])) { - auto flowOp = cast(flowOps[0]); - pktInfoAttr = - AIE::PacketInfoAttr::get(rewriter.getContext(), - /*pkt_type*/ 0, /*pkt_id*/ flowOp.getID()); + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (maybeFlowOp && maybeFlowOp->getPacketId()) { + pktInfoAttr = AIE::PacketInfoAttr::get( + rewriter.getContext(), + /*pkt_type*/ 0, /*pkt_id*/ maybeFlowOp->getPacketId().value()); } } else if (dmaOp.getTarget()) { offsets = dmaOp.getTargetOffsets(); @@ -642,17 +638,10 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( } consumerChannels.push_back(channelOp); } - // Insert flow ops. - rewriter.setInsertionPointToEnd(deviceBlock); - SmallVector flowOps = - createFlowOps(connectionOp, producerChannels, consumerChannels); - connectionToFlowOps[connectionOp] = flowOps; - // If the connection has been converted into a single packet flow op, retrieve - // it for creating the DMA ops down below. - AIE::PacketFlowOp pktFlowOp; - if (flowOps.size() == 1 && isa(flowOps[0])) { - pktFlowOp = cast(flowOps[0]); - } + + std::optional maybeFlowOp = connectionOp.getFlowOp(); + std::optional packetId = + maybeFlowOp ? 
maybeFlowOp->getPacketId() : std::nullopt; FailureOr maybeNpuDmaUserOp = connectionOp.getNpuCircularDmaCpyNdUser(); @@ -743,7 +732,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); createDMA(memOp, AIE::DMAChannelDir::MM2S, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, - lockPair, pktFlowOp); + lockPair, packetId); } } @@ -831,7 +820,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); createDMA(memOp, AIE::DMAChannelDir::S2MM, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, - lockPair, pktFlowOp); + lockPair, packetId); } } @@ -842,6 +831,38 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( return success(); } +/// Convert an `amdaie.flow` op into AIE flow ops at the end of `deviceBlock`. +/// Fails if a source or target is not defined by an `amdaie.channel` op. +LogicalResult AIEDeviceBuilder::flowToAIE(AMDAIE::FlowOp flowOp, + Block *deviceBlock) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::FlowOp]\n"); + rewriter.setInsertionPointToEnd(deviceBlock); + SmallVector producerChannels; + SmallVector consumerChannels; + for (Value producerChannel : flowOp.getSources()) { + auto channelOp = + dyn_cast_if_present(producerChannel.getDefiningOp()); + if (!channelOp) { + return flowOp.emitOpError() + << "found non-`amdaie.channel` source channel"; + } + producerChannels.push_back(channelOp); + } + for (Value consumerChannel : flowOp.getTargets()) { + auto channelOp = + dyn_cast_if_present(consumerChannel.getDefiningOp()); + if (!channelOp) { + return flowOp.emitOpError() + << "found non-`amdaie.channel` target channel"; + } + consumerChannels.push_back(channelOp); + } + // Insert flow ops. The returned ops are not tracked: packet ids are read
+ // from the `amdaie.flow` op itself. + createFlowOps(flowOp, producerChannels, consumerChannels); + return success(); +} + LogicalResult AIEDeviceBuilder::lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock, int &lockIndex) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LockOp]\n"); @@ -985,6 +1006,12 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE( } return WalkResult::skip(); }) + .Case([&](auto flowOp) { + if (failed(flowToAIE(flowOp, deviceBlock))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }) .Case([&](auto lockOp) { if (failed(lockToAIE(lockOp, deviceBlock, lockId))) { return WalkResult::interrupt(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h index d46f5f232..88ec017cd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h @@ -59,6 +59,7 @@ class AIEDeviceBuilder { int &bufferId); LogicalResult connectionToAIE(AMDAIE::ConnectionOp connectionOp, Block *deviceBlock, int &connectionIndex); + LogicalResult flowToAIE(AMDAIE::FlowOp flowOp, Block *deviceBlock); LogicalResult lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock, int &lockIndex); LogicalResult logicalObjFifoFromBuffersToAIE( @@ -83,12 +84,11 @@ class AIEDeviceBuilder { size_t acqNum, size_t relNum, int64_t len, int64_t offset, const SmallVector &bufferOps, const std::pair &locks, - AIE::PacketFlowOp pktFlowOp); + std::optional pktId); /// Utility to create flow ops from connection ops.
SmallVector createFlowOps( - AMDAIE::ConnectionOp connectionOp, - ArrayRef producerChannels, + AMDAIE::FlowOp flowOp, ArrayRef producerChannels, ArrayRef consumerChannels); /// Utility to create `aie.shim_dma_allocation` ops and corresponding global @@ -122,9 +122,6 @@ class AIEDeviceBuilder { IRMapping mapper; /// Dedicated mapper for the HAL bindings. IRMapping bindingsMapper; - /// Index used to create unique packet flows. Expected to be incremented after - /// a new packet flow op is created. - int pktFlowIndex{0}; /// Map from tile values to AIE memory op (`aie.mem` or `aie.memtile_dma`). /// This is used to look up and add new DMA patterns to those memory ops. DenseMap tileToMemOpMap; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 758149c62..1c91b4d90 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -51,10 +51,12 @@ iree_cc_library( "AMDAIEAssignConnectionTypes.cpp" "AMDAIEAssignLogicalObjectFifoDepth.cpp" "AMDAIEAssignNpuDmaBdIds.cpp" + "AMDAIEAssignPacketIds.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeNpuDmaCpyNd.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" "AMDAIECombineStridedOps.cpp" + "AMDAIEConnectionToFlow.cpp" "AMDAIEConvertToDma.cpp" "AMDAIEControlCodeLoopUnroll.cpp" "AMDAIEConvertCoreForallToFor.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index be7094123..5deb1c72d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -26,12 +26,14 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEASSIGNCHANNELS #define GEN_PASS_DEF_AMDAIEASSIGNLOGICALOBJECTFIFODEPTH #define 
GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS +#define GEN_PASS_DEF_AMDAIEASSIGNPACKETIDS #define GEN_PASS_DEF_AMDAIEBRIDGETOAIR #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP #define GEN_PASS_DEF_AMDAIECANONICALIZENPUDMACPYND #define GEN_PASS_DEF_AMDAIECLEANUP #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS +#define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR #define GEN_PASS_DEF_AMDAIECREATEAIEWORKGROUP diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index dfd7a02cb..b327a72cb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -612,6 +612,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEObjFifoBufferizationPass()); + passManager.addPass(createAMDAIEConnectionToFlowPass()); + passManager.addPass(createAMDAIEAssignPacketIdsPass()); addAMDAIEToAIEPasses(passManager); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 1fbc24ca1..491a0181d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -86,6 +86,9 @@ std::unique_ptr createAMDAIEAssignLogicalObjectFifoDepthPass( /// Create a pass to assign BD ids to `amdaie.npu.dma_cpy_nd` operations. std::unique_ptr createAMDAIEAssignNpuDmaBdIdsPass(); +/// Create a pass to assign packet ids to `amdaie.flow` operations. +std::unique_ptr createAMDAIEAssignPacketIdsPass(); + /// Create a pass to do some rewrites that help bridging the path to AIR/AIE /// lowering. 
std::unique_ptr createAMDAIEBridgeToAIRPass(); @@ -102,6 +105,9 @@ std::unique_ptr createAMDAIECanonicalizeNpuDmaCpyNdPass(); std::unique_ptr createAMDAIECanonicalizeDoublyStridedOpPass( AMDAIECanonicalizeDoublyStridedOpOptions options = {}); +/// Create pass to create `amdaie.flow` ops for connections. +std::unique_ptr createAMDAIEConnectionToFlowPass(); + /// Pass to unroll the loops within the control code regions. std::unique_ptr createAMDAIEControlCodeLoopUnrollPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 575decddb..c692627da 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -61,6 +61,12 @@ def AMDAIEAssignNpuDmaBdIds : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignNpuDmaBdIdsPass()"; } +def AMDAIEAssignPacketIds : + Pass<"iree-amdaie-assign-packet-ids", ""> { + let summary = "Assign packet ids to `amdaie.flow` operations."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignPacketIdsPass()"; +} + def AMDAIEBridgeToAIR : Pass<"iree-amdaie-bridge-to-air", ""> { let summary = "Perform transformations that allow hooking into AIR/AIE lowering"; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEBridgeToAIRPass()"; @@ -133,6 +139,12 @@ def AMDAIECombineStridedOps : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECombineStridedOpsPass()"; } +def AMDAIEConnectionToFlow : + Pass<"iree-amdaie-connection-to-flow", ""> { + let summary = "Create flow ops for connections."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConnectionToFlowPass()"; +} + def AMDAIEControlCodeLoopUnroll : Pass<"iree-amdaie-controlcode-loop-unroll", ""> { let summary = "Unroll the loops in the control code regions."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index f4004a9a5..c083125bd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -15,11 +15,13 @@ iree_lit_test_suite( "assign_connection_types.mlir" "assign_logical_objectfifo_depth.mlir" "assign_npu_dma_bd_ids.mlir" + "assign_packet_ids.mlir" "bridge_to_air.mlir" "bufferize_to_allocation.mlir" "canonicalize_doubly_strided_op.mlir" "canonicalize_npu_dma_cpy_nd.mlir" "combine_strided_ops.mlir" + "connection_to_flow.mlir" "controlcode_loop_unrolling.mlir" "convert_core_forall_to_for.mlir" "create_aie_workgroup.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir index 80855c059..92b0f691b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir @@ -6,14 +6,14 @@ // CHECK: amdaie.workgroup // CHECK: %[[tile_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[tile_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0) -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0) +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0, port_type = DMA) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL_0]]}) -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1) -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1) +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1, port_type = DMA) +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1, port_type = DMA) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_3]]}, %{{.+}} 
{%[[CHANNEL_2]]}) -// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2) -// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2) +// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2, port_type = DMA) +// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2, port_type = DMA) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_4]]}) module { func.func @assign_channels(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir index fba568fd1..48a470fdd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_connection_types.mlir @@ -7,9 +7,9 @@ // CHECK: %[[OBJ0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // CHECK: %[[OBJ1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[OBJ2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG2]] -// CHECK: amdaie.connection(%[[OBJ1]], %[[OBJ0]], connection_type = Circuit) -// CHECK: amdaie.connection(%[[OBJ0]], %[[OBJ1]], connection_type = Circuit) -// CHECK: amdaie.connection(%[[OBJ2]], %[[OBJ1]], connection_type = Circuit) +// CHECK: amdaie.connection(%[[OBJ1]], %[[OBJ0]]) {connection_type = #amdaie} +// CHECK: amdaie.connection(%[[OBJ0]], %[[OBJ1]]) {connection_type = #amdaie} +// CHECK: amdaie.connection(%[[OBJ2]], %[[OBJ1]]) {connection_type = #amdaie} // PACKET-LABEL: @assign_connection_types // PACKET-SAME: %[[ARG0:.+]]: memref<8x16xi32>, %[[ARG1:.+]]: memref<1x1x8x16xi32, 1>, %[[ARG2:.+]]: memref<1x1x8x16xi32, 2> @@ -17,9 +17,9 @@ // PACKET: %[[OBJ0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // PACKET: %[[OBJ1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // PACKET: %[[OBJ2:.+]] = 
amdaie.logicalobjectfifo.from_memref %[[ARG2]] -// PACKET: amdaie.connection(%[[OBJ1]], %[[OBJ0]], connection_type = Packet) -// PACKET: amdaie.connection(%[[OBJ0]], %[[OBJ1]], connection_type = Packet) -// PACKET: amdaie.connection(%[[OBJ2]], %[[OBJ1]], connection_type = Packet) +// PACKET: amdaie.connection(%[[OBJ1]], %[[OBJ0]]) {connection_type = #amdaie} +// PACKET: amdaie.connection(%[[OBJ0]], %[[OBJ1]]) {connection_type = #amdaie} +// PACKET: amdaie.connection(%[[OBJ2]], %[[OBJ1]]) {connection_type = #amdaie} module { func.func @assign_connection_types(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir new file mode 100644 index 000000000..15e196d7c --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir @@ -0,0 +1,116 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-assign-packet-ids)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @assign_packet_ids(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @assign_packet_ids +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: 
amdaie.workgroup +// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 1, port_type = DMA) +// CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} +// CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 0 : ui8} +// CHECK: amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_3]]}) {is_packet_flow = true, packet_id = 1 : ui8} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_packet_ids(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %2 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = true} + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", 
"amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_packet_ids(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %5 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %6 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %7 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %8 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %9 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %10 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %11 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %12 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %13 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %14 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %15 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %16 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %17 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %18 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %19 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %20 = 
amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %21 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %22 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %23 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %24 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %25 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %26 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %27 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %28 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %29 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %30 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + %31 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + // expected-error @+1 {{ran out of packet IDs to assign}} + %32 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} + amdaie.controlcode { + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir new file mode 100644 index 000000000..74691c7ba --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir @@ -0,0 +1,44 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-connection-to-flow)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: @connection_to_flow +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, 
port_type = DMA) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) +// CHECK: %[[FLOW_0:.+]] = amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL]]}, flow = %[[FLOW_0]]) +// CHECK: %[[FLOW_1:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL]]}) {is_packet_flow = true} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_1]]) +// CHECK: %[[FLOW_2:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL_2]]}) {is_packet_flow = false} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_2]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_2]]) +module { + func.func @connection_to_flow(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_2} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.connection(%0 {%channel}, %1 {%channel_1}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.connection(%2 {%channel_2}, %1 {%channel_1}) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 49e52c6e1..c8341cade 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -202,11 +202,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %3 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 1024] [32, 32] [64, 1]) + %4 = amdaie.npu.circular_dma_cpy_nd %3([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -281,11 +282,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = 
amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { - %3 = amdaie.npu.circular_dma_cpy_nd %2([0, 0] [32, 32] [128, 1], [0, 1024] [32, 32] [64, 1]) + %4 = amdaie.npu.circular_dma_cpy_nd %3([0, 0] [32, 32] [128, 1], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -370,17 +372,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_1, 1) - %channel_3 = amdaie.channel(%tile_0_2, 1) - %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : 
(!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %5 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %6 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1]) - %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1]) + %8 = amdaie.npu.circular_dma_cpy_nd %3([] [] [], [0, 0] [32, 32] [64, 1]) + %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -521,17 +525,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0) - %channel_1 = amdaie.channel(%tile_0_2, 0) - %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : 
(!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> - %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> - %channel_2 = amdaie.channel(%tile_0_1, 1) - %channel_3 = amdaie.channel(%tile_0_2, 1) - %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> + %5 = amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) amdaie.controlcode { - %6 = 
amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1]) - %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1]) + %8 = amdaie.npu.circular_dma_cpy_nd %3([] [] [], [0, 0] [32, 32] [64, 1]) + %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -602,15 +608,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %channel_2 = amdaie.channel(%tile_0_1, 0) - %channel_3 = amdaie.channel(%tile_0_2, 0) - %4 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %4 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %5 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %6 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}, flow = %5) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %5 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [0, 1024] [32, 32] [64, 1]) - %6 = amdaie.npu.circular_dma_cpy_nd 
%4([] [] [], [0, 1024] [32, 32] [64, 1]) + %7 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [0, 1024] [32, 32] [64, 1]) + %8 = amdaie.npu.circular_dma_cpy_nd %6([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -640,15 +648,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_1 = amdaie.lock(%tile_0_1(1), 0) %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %4 = amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { - %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] []) - %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %5 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) + %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a source BD ID op to lower to the AIE dialect}} - %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo> + %7 = amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -679,13 +688,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} 
%lock_1 = amdaie.lock(%tile_0_1(1), 0) %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %4 = amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] []) - %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + %5 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) + %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -732,28 +742,30 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_1(1), 0) %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : 
(!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1) - %channel_3 = amdaie.channel(%tile_0_1, 1) - %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %9 = amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1]) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo> - %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12, MM2S) - %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] 
[1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13, MM2S) + %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) + %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) + %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14, MM2S) + %15 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15, MM2S) scf.forall (%arg0, %arg1) in (2, 1) { - %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14, S2MM) - %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15, S2MM) + %16 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%16, S2MM) + %17 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%17, S2MM) } amdaie.end } @@ -797,27 +809,29 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_1(1), 0) %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = 
amdaie.channel(%tile_0_0, 0) - %channel_1 = amdaie.channel(%tile_0_1, 0) - %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1) - %channel_3 = amdaie.channel(%tile_0_1, 1) - %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} + %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + %9 = amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1]) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo> - %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo> - %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : 
source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12, MM2S) - %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13, MM2S) - %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14, S2MM) - %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15, S2MM) + %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) + %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) + %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo> + %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14, MM2S) + %15 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15, MM2S) + %16 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%16, S2MM) + %17 = amdaie.npu.dma_cpy_nd %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%17, S2MM) amdaie.end } } @@ -1101,14 +1115,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_3}, {%lock}, {%lock_4}) : 
memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6, %buffer_9, %buffer_10}, {%lock_7, %lock_11}, {%lock_8, %lock_12}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile, 0) - %channel_13 = amdaie.channel(%tile_0, 0) - %4 = amdaie.connection(%2 {%channel_13}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %channel_14 = amdaie.channel(%tile_0, 1) - %channel_15 = amdaie.channel(%tile_1, 0) - %channel_16 = amdaie.channel(%tile_2, 0) - %5 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) - %6 = amdaie.core(%tile_1, in : [%5], out : []) { + %channel = amdaie.channel(%tile, 0, port_type = DMA) + %channel_13 = amdaie.channel(%tile_0, 0, port_type = DMA) + %4 = amdaie.flow({%channel} -> {%channel_13}) {is_packet_flow = false} + %5 = amdaie.connection(%2 {%channel_13}, %1 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %channel_14 = amdaie.channel(%tile_0, 1, port_type = DMA) + %channel_15 = amdaie.channel(%tile_1, 0, port_type = DMA) + %channel_16 = amdaie.channel(%tile_2, 0, port_type = DMA) + %6 = amdaie.flow({%channel_14} -> {%channel_15, %channel_16}) {is_packet_flow = false} + %7 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}, flow = %6) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %8 = amdaie.core(%tile_1, in : [%7], out : []) { amdaie.use_lock(%lock_8, AcquireGreaterOrEqual(1)) %reinterpret_cast = memref.reinterpret_cast %buffer_5 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>) @@ -1119,7 +1135,7 @@ module 
attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.use_lock(%lock_7, AcquireGreaterOrEqual(1)) amdaie.end } - %7 = amdaie.core(%tile_2, in : [%5], out : []) { + %9 = amdaie.core(%tile_2, in : [%7], out : []) { amdaie.use_lock(%lock_12, AcquireGreaterOrEqual(1)) %reinterpret_cast = memref.reinterpret_cast %buffer_9 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>) @@ -1131,11 +1147,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.end } amdaie.controlcode { - %8 = amdaie.npu.circular_dma_cpy_nd %4([0, 0] [64, 64] [32, 1], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [64, 64] [32, 1]) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%11, MM2S) + %10 = amdaie.npu.circular_dma_cpy_nd %5([0, 0] [64, 64] [32, 1], [] [] []) + %11 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [64, 64] [32, 1]) + %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %13 = amdaie.npu.dma_cpy_nd %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13, MM2S) amdaie.end } } diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index a53b0342a..2d23678d4 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -427,6 +427,10 @@ uint32_t AMDAIEDeviceModel::getColumnShift() const { uint32_t AMDAIEDeviceModel::getRowShift() const { return 
configPtr.RowShift; } +uint8_t AMDAIEDeviceModel::getPacketIdMaxIdx() const { + return deviceConfig.packetIdMaxIdx; +} + uint8_t AMDAIEDeviceModel::getStreamSwitchArbiterMax(uint8_t col, uint8_t row) const { assert(isCoreTile(col, row) || isMemTile(col, row) || isShimTile(col, row)); @@ -475,6 +479,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { switch (device) { case AMDAIEDevice::xcvc1902: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIE1_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE1_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE1_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIE1_SS_ARBITER_MAX; @@ -498,6 +503,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2302: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIEML_SS_ARBITER_MAX; @@ -520,6 +526,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2802: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIEML_SS_ARBITER_MAX; @@ -546,6 +553,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { case AMDAIEDevice::npu1_3col: case AMDAIEDevice::npu1_4col: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIE2IPU_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE2IPU_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE2IPU_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIE2IPU_SS_ARBITER_MAX; @@ -591,6 
+599,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::npu4: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.packetIdMaxIdx = XAIE_STRIXB0_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE_STRIXB0_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE_STRIXB0_SS_MSEL_MAX; deviceConfig.streamSwitchMemTileArbiterMax = XAIE_STRIXB0_SS_ARBITER_MAX; diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index 4a21df60a..de8c855b7 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -222,6 +222,7 @@ struct AMDAIEDeviceModel { /// aie-rt for whatever reason. Make sure the parameters can't be retrieved in /// another way before adding new fields to this struct. struct AMDAIEDeviceConfig { + uint8_t packetIdMaxIdx{0}; /// Currently, the max arbiter/msel is hidden inside aie-rt. 
uint8_t streamSwitchCoreArbiterMax{0}; uint8_t streamSwitchCoreMSelMax{0}; @@ -317,6 +318,8 @@ struct AMDAIEDeviceModel { uint32_t getColumnShift() const; uint32_t getRowShift() const; + uint8_t getPacketIdMaxIdx() const; + uint8_t getStreamSwitchArbiterMax(uint8_t col, uint8_t row) const; uint8_t getStreamSwitchMSelMax(uint8_t col, uint8_t row) const; diff --git a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c index 8e5185d7a..515491ef0 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c +++ b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c @@ -5,6 +5,7 @@ // SPDX-License-Identifier: # Apache-2.0 WITH LLVM-exception #include "xaiengine/xaiegbl_defs.h" +#include "xaiengine/xaiegbl.h" #undef s8 #undef u8 #undef u16 @@ -24,6 +25,7 @@ const uint64_t XAIE1_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIE1_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIE1_NUM_ROWS = XAIE_NUM_ROWS; const int XAIE1_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIE1_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIE1_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; const int XAIE1_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; const int XAIE1_TILE_DMA_NUM_CH = XAIE_TILE_DMA_NUM_CH; @@ -81,6 +83,7 @@ const uint64_t XAIEML_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIEML_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIEML_NUM_ROWS = XAIE_NUM_ROWS; const int XAIEML_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIEML_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIEML_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; const int XAIEML_MEM_TILE_NUM_LOCKS = XAIE_MEM_TILE_NUM_LOCKS; const int XAIEML_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; @@ -140,6 +143,7 @@ const uint64_t XAIE2IPU_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIE2IPU_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIE2IPU_NUM_ROWS = XAIE_NUM_ROWS; const int XAIE2IPU_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIE2IPU_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIE2IPU_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; 
const int XAIE2IPU_MEM_TILE_NUM_LOCKS = XAIE_MEM_TILE_NUM_LOCKS; const int XAIE2IPU_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; @@ -198,6 +202,7 @@ const uint64_t XAIE_STRIXB0_BASE_ADDR = XAIE_BASE_ADDR; const uint64_t XAIE_STRIXB0_NPI_BASEADDR = XAIE_NPI_BASEADDR; const int XAIE_STRIXB0_NUM_ROWS = XAIE_NUM_ROWS; const int XAIE_STRIXB0_NUM_COLS = XAIE_NUM_COLS; +const uint8_t XAIE_STRIXB0_PACKET_ID_MAX = XAIE_PACKET_ID_MAX; const int XAIE_STRIXB0_TILE_NUM_LOCKS = XAIE_TILE_NUM_LOCKS; const int XAIE_STRIXB0_MEM_TILE_NUM_LOCKS = XAIE_MEM_TILE_NUM_LOCKS; const int XAIE_STRIXB0_SHIM_NUM_LOCKS = XAIE_SHIM_NUM_LOCKS; diff --git a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h index 5d9bf834d..d5db882f6 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h +++ b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.h @@ -13,6 +13,7 @@ extern uint8_t XAIE1_AIE_TILE_NUM_ROWS; extern uint8_t XAIE1_AIE_TILE_ROW_START; extern uint8_t XAIE1_COL_SHIFT; +extern uint8_t XAIE1_PACKET_ID_MAX; extern uint8_t XAIE1_MEM_TILE_DMA_NUM_CH; extern uint8_t XAIE1_MEM_TILE_NUM_LOCKS; extern uint8_t XAIE1_MEM_TILE_NUM_ROWS; @@ -45,6 +46,7 @@ extern uint8_t XAIEML_MEM_TILE_ROW_START; extern uint8_t XAIEML_NUM_COLS; extern uint8_t XAIEML_NUM_NOC_INTR_OFFSET; extern uint8_t XAIEML_NUM_ROWS; +extern uint8_t XAIEML_PACKET_ID_MAX; extern uint8_t XAIEML_ROW_SHIFT; extern uint8_t XAIEML_SHIM_DMA_NUM_CH; extern uint8_t XAIEML_SHIM_NUM_LOCKS; @@ -70,6 +72,7 @@ extern uint8_t XAIE2IPU_MEM_TILE_ROW_START; extern uint8_t XAIE2IPU_NUM_COLS; extern uint8_t XAIE2IPU_NUM_NOC_INTR_OFFSET; extern uint8_t XAIE2IPU_NUM_ROWS; +extern uint8_t XAIE2IPU_PACKET_ID_MAX; extern uint8_t XAIE2IPU_ROW_SHIFT; extern uint8_t XAIE2IPU_SHIM_DMA_NUM_CH; extern uint8_t XAIE2IPU_SHIM_NUM_LOCKS; @@ -100,6 +103,7 @@ extern uint8_t XAIE_STRIXB0_MEM_TILE_ROW_START; extern uint8_t XAIE_STRIXB0_NUM_COLS; extern uint8_t XAIE_STRIXB0_NUM_NOC_INTR_OFFSET; extern uint8_t 
XAIE_STRIXB0_NUM_ROWS; +extern uint8_t XAIE_STRIXB0_PACKET_ID_MAX; extern uint8_t XAIE_STRIXB0_ROW_SHIFT; extern uint8_t XAIE_STRIXB0_SHIM_DMA_NUM_CH; extern uint8_t XAIE_STRIXB0_SHIM_NUM_LOCKS;